[
  {
    "path": ".dev_scripts/check_installation.py",
    "content": "import numpy as np\nimport torch\n\nfrom mmcv.ops import box_iou_rotated\nfrom mmcv.utils import collect_env\n\n\ndef check_installation():\n    \"\"\"Check whether mmcv has been installed successfully.\"\"\"\n    np_boxes1 = np.asarray(\n        [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],\n         [7.0, 7.0, 8.0, 8.0, 0.4]],\n        dtype=np.float32)\n    np_boxes2 = np.asarray(\n        [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],\n         [5.0, 5.0, 6.0, 7.0, 0.4]],\n        dtype=np.float32)\n    boxes1 = torch.from_numpy(np_boxes1)\n    boxes2 = torch.from_numpy(np_boxes2)\n\n    # test mmcv with CPU ops\n    box_iou_rotated(boxes1, boxes2)\n    print('CPU ops were compiled successfully.')\n\n    # test mmcv with both CPU and CUDA ops\n    if torch.cuda.is_available():\n        boxes1 = boxes1.cuda()\n        boxes2 = boxes2.cuda()\n        box_iou_rotated(boxes1, boxes2)\n        print('CUDA ops were compiled successfully.')\n    else:\n        print('No CUDA runtime is found, skipping the checking of CUDA ops.')\n\n\nif __name__ == '__main__':\n    print('Start checking the installation of mmcv ...')\n    check_installation()\n    print('mmcv has been installed successfully.\\n')\n\n    env_info_dict = collect_env()\n    env_info = '\\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])\n    dash_line = '-' * 60 + '\\n'\n    print('Environment information:')\n    print(dash_line + env_info + '\\n' + dash_line)\n"
  },
  {
    "path": ".dockerignore",
    "content": ".git\n.gitignore\n*.egg-info\n.eggs/\n.mypy-cache\npip-wheel-metadata\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/1-bug-report.yml",
    "content": "name: \"🐞 Bug report\"\ndescription: \"Create a report to help us reproduce and fix the bug\"\nlabels: bug\ntitle: \"[Bug] \"\n\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        ## Note\n        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions)\n        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**\n\n  - type: checkboxes\n    attributes:\n      label: Prerequisite\n      description: Please check the following items before creating a new issue.\n      options:\n      - label: I have searched [Issues](https://github.com/open-mmlab/mmcv/issues) and [Discussions](https://github.com/open-mmlab/mmcv/discussions) but cannot get the expected help.\n        required: true\n      - label: The bug has not been fixed in the latest version(https://github.com/open-mmlab/mmcv).\n        required: true\n\n  - type: textarea\n    attributes:\n      label: Environment\n      description: |\n        Please run `python -c \"from mmcv.utils import collect_env; print(collect_env())\"` to collect necessary environment information and copy-paste it here.\n        You may add additional information that may be helpful for locating the problem, such as\n          - How you installed PyTorch \\[e.g., pip, conda, source\\]\n          - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Reproduces the problem - code sample\n      description: |\n        Please provide a code sample that reproduces the problem you ran into. 
It can be a Colab link or just a code snippet.\n      placeholder: |\n        ```python\n        # Sample code to reproduce the problem\n        ```\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Reproduces the problem - command or script\n      description: |\n        What command or script did you run?\n      placeholder: |\n        ```shell\n        The command or script you ran.\n        ```\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Reproduces the problem - error message\n      description: |\n        Please provide the error message or logs you got, with the full traceback.\n\n        Tip: You can attach images or log files by dragging them into the text area.\n      placeholder: |\n        ```\n        The error message or logs you got, with the full traceback.\n        ```\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Additional information\n      description: |\n        Tell us anything else you think we should know.\n\n        Tip: You can attach images or log files by dragging them into the text area.\n      placeholder: |\n        1. What's your expected result?\n        2. What dataset did you use?\n        3. What do you think might be the reason?\n\n  - type: markdown\n    attributes:\n      value: |\n        ## Acknowledgement\n        Thanks for taking the time to fill out this report.\n\n        If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**Here**](https://github.com/open-mmlab/mmcv/pulls)!\n        Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing.\n\n        Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/2-feature_request.yml",
    "content": "name: 🚀 Feature request\ndescription: Suggest an idea for this project\nlabels: [feature-request]\ntitle: \"[Feature] \"\n\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        ## Note\n        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions)\n\n        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**\n\n  - type: textarea\n    attributes:\n      label: What is the feature?\n      description: Tell us more about the feature and how this feature can help.\n      placeholder: |\n        E.g., It is inconvenient when \\[....\\].\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Any other context?\n      description: |\n        Have you considered any alternative solutions or features? If so, what are they? Also, feel free to add any other context or screenshots about the feature request here.\n\n  - type: markdown\n    attributes:\n      value: |\n        ## Acknowledgement\n        Thanks for taking the time to fill out this report.\n\n        We strongly appreciate you creating a new PR to implement it [**Here**](https://github.com/open-mmlab/mmcv/pulls)!\n        Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing.\n\n        Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/3-documentation.yml",
    "content": "name: 📚 Documentation\ndescription: Report an issue related to the documentation.\nlabels: \"docs\"\ntitle: \"[Docs] \"\n\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        ## Note\n        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions)\n        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**\n\n  - type: textarea\n    attributes:\n      label: 📚 The doc issue\n      description: >\n        A clear and concise description the issue.\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Suggest a potential alternative/fix\n      description: >\n        Tell us how we could improve the documentation in this regard.\n\n  - type: markdown\n    attributes:\n      value: |\n        ## Acknowledgement\n        Thanks for taking the time to fill out this report.\n\n        If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**here**](https://github.com/open-mmlab/mmcv/pulls)!\n        Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing.\n\n        Welcome to join our [**Community(TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: false\n\ncontact_links:\n  - name: 💬 Forum\n    url: https://github.com/open-mmlab/mmcv/discussions\n    about: Ask general usage questions and discuss with other mmcv community members\n  - name: MMCV Documentation\n    url: https://mmcv.readthedocs.io/en/latest/\n    about: Check if your question is answered in docs\n  - name: 🌐 Explore OpenMMLab\n    url: https://openmmlab.com/\n    about: Get know more about OpenMMLab\n"
  },
  {
    "path": ".github/pull_request_template.md",
    "content": "Thanks for your contribution and we appreciate it a lot. The following instructions would make your pull request more healthy and more easily get feedback. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers.\n\n## Motivation\n\nPlease describe the motivation of this PR and the goal you want to achieve through this PR.\n\n## Modification\n\nPlease briefly describe what modification is made in this PR.\n\n## BC-breaking (Optional)\n\nDoes the modification introduce changes that break the backward-compatibility of the downstream repositories?\nIf so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR.\n\n## Use cases (Optional)\n\nIf this PR introduces a new feature, it is better to list some use cases here, and update the documentation.\n\n## Checklist\n\n**Before PR**:\n\n- [ ] I have read and followed the workflow indicated in the [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) to create this PR.\n- [ ] Pre-commit or linting tools indicated in [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) are used to fix the potential lint issues.\n- [ ] Bug fixes are covered by unit tests, the case that causes the bug should be added in the unit tests.\n- [ ] New functionalities are covered by complete unit tests. If not, please add more unit test to ensure the correctness.\n- [ ] The documentation has been modified accordingly, including docstring or example tutorials.\n\n**After PR**:\n\n- [ ] If the modification has potential influence on downstream or other related projects, this PR should be tested with some of those projects, like MMDet or MMCls.\n- [ ] CLA has been signed and all committers have signed the CLA in this PR.\n"
  },
  {
    "path": ".github/workflows/build_macos_wheel.yml",
    "content": "name: build macos wheel\n\non: push\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  build_macos10_wheel:\n    runs-on: macos-latest\n    if: contains(github.event.head_commit.message, 'Bump version to')\n    strategy:\n      matrix:\n        torch: [1.8.0, 1.9.0, 1.10.0, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0]\n        python-version: [3.7, 3.8, 3.9, '3.10', '3.11']\n        include:\n          - torch: 1.8.0\n            torchvision: 0.9.0\n          - torch: 1.9.0\n            torchvision: 0.10.0\n          - torch: 1.10.0\n            torchvision: 0.11.0\n          - torch: 1.11.0\n            torchvision: 0.12.0\n          - torch: 1.12.0\n            torchvision: 0.13.0\n          - torch: 1.13.0\n            torchvision: 0.14.0\n          - torch: 2.0.0\n            torchvision: 0.15.1\n          - torch: 2.1.0\n            torchvision: 0.16.0\n        exclude:\n          - torch: 1.8.0\n            python-version: '3.10'\n          - torch: 1.9.0\n            python-version: '3.10'\n          - torch: 1.10.0\n            python-version: '3.10'\n          - torch: 1.8.0\n            python-version: '3.11'\n          - torch: 1.9.0\n            python-version: '3.11'\n          - torch: 1.10.0\n            python-version: '3.11'\n          - torch: 1.10.0\n            python-version: '3.11'\n          - torch: 1.11.0\n            python-version: '3.11'\n          - torch: 1.12.0\n            python-version: '3.11'\n          - torch: 1.13.0\n            python-version: '3.11'\n          - torch: 2.0.0\n            python-version: 3.7\n          - torch: 2.1.0\n            python-version: 3.7\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install psutil\n        run: pip install psutil\n      - name: Install PyTorch\n        run: pip 
install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} --no-cache-dir\n      - name: Build and install\n        run: |\n          pip install wheel\n          python setup.py bdist_wheel\n      - uses: actions/upload-artifact@v3\n        with:\n          name: ${{matrix.torch}}\n          path: dist/\n"
  },
  {
    "path": ".github/workflows/lint.yml",
    "content": "name: lint\n\non: [push, pull_request]\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  lint:\n    runs-on: ubuntu-22.04\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python 3.10.15\n        uses: actions/setup-python@v2\n        with:\n          python-version: '3.10.15'\n      - name: Install pre-commit hook\n        run: |\n          pip install pre-commit\n          pre-commit install\n      - name: Linting\n        run: pre-commit run --all-files\n      - name: Format c/cuda codes with clang-format\n        uses: DoozyX/clang-format-lint-action@v0.18\n        with:\n          source: mmcv/ops/csrc\n          extensions: h,c,cpp,hpp,cu,cuh\n          style: google\n"
  },
  {
    "path": ".github/workflows/merge_stage_test.yml",
    "content": "name: merge_stage_test\n\non:\n  push:\n    paths-ignore:\n      - \".github/**.md\"\n      - \"docker/**\"\n      - \"docs/**\"\n      - 'examples/**'\n      - '.dev_scripts/**'\n      - \"README.md\"\n      - \"README_zh-CN.md\"\n      - \"CONTRIBUTING.md\"\n      - \".pre-commit-config.yaml\"\n      - \".pre-commit-config-zh-cn.yaml\"\n    branches:\n      - main\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  build_without_torch:\n    runs-on: ubuntu-22.04\n    env:\n      MMCV_WITH_OPS: 0\n    strategy:\n      matrix:\n        python-version: [3.7]\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install system dependencies\n        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests/test_image tests/test_transforms tests/test_video tests/test_arraymisc.py tests/test_visualization.py tests/test_utils/test_env.py --ignore=tests/test_image/test_io.py\n  build_without_ops:\n    runs-on: ubuntu-22.04\n    env:\n      MMCV_WITH_OPS: 0\n    strategy:\n      matrix:\n        python-version: [3.7]\n        torch: [1.8.1, 1.9.1]\n        include:\n          - torch: 1.8.1\n            torchvision: 0.9.1\n          - torch: 1.9.1\n            torchvision: 0.10.1\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install system dependencies\n        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg\n      - name: Install PyTorch\n        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests --ignore=tests/test_ops\n  build_cpu_py:\n    runs-on: ubuntu-22.04\n    strategy:\n      matrix:\n        python-version: [3.8, 3.9, '3.10']\n        torch: [1.13.0]\n        include:\n          - torch: 1.13.0\n            torchvision: 0.14.0\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install system dependencies\n        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg\n      - name: Upgrade pip and wheel\n        run: pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests and generate coverage report\n        run: |\n          coverage run --branch --source mmcv -m pytest tests/\n          coverage xml\n          coverage report -m\n  build_cpu_pt:\n    runs-on: ubuntu-22.04\n    strategy:\n      matrix:\n        python-version: [3.7]\n        torch: [1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0]\n        include:\n          - torch: 1.8.1\n            torchvision: 0.9.1\n          - torch: 1.9.1\n            torchvision: 0.10.1\n          - torch: 1.10.1\n            torchvision: 0.11.2\n          - torch: 1.11.0\n            torchvision: 0.12.0\n          - torch: 1.12.0\n            torchvision: 0.13.0\n          - torch: 1.13.0\n            torchvision: 0.14.0\n          - torch: 2.0.0\n            torchvision: 0.15.1\n            python-version: 3.8\n          - torch: 2.1.0\n            torchvision: 0.16.0\n            python-version: 3.8\n        exclude:\n          - torch: 2.0.0\n            python-version: 3.7\n          - torch: 2.1.0\n            python-version: 3.7\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install system dependencies\n        run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg\n      - name: Upgrade pip and wheel\n        run: pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n     
 - name: Build MMCV from source\n        run: pip install -e . -v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests and generate coverage report\n        run: |\n          coverage run --branch --source mmcv -m pytest tests/\n          coverage xml\n          coverage report -m\n      # Only upload coverage report for python3.7 && pytorch1.8.1 cpu\n      - name: Upload coverage to Codecov\n        if: ${{matrix.torch == '1.8.1' && matrix.python-version == '3.8'}}\n        uses: codecov/codecov-action@v1.0.14\n        with:\n          file: ./coverage.xml\n          flags: unittests\n          env_vars: OS,PYTHON\n          name: codecov-umbrella\n          fail_ci_if_error: false\n  build_cu102:\n    runs-on: ubuntu-22.04\n    container:\n      image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel\n    env:\n      FORCE_CUDA: 1\n      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61\n    strategy:\n      matrix:\n        python-version: [3.7]\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: pip install pip wheel --upgrade\n      - name: Fetch GPG keys\n        run: |\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n      - name: Install system dependencies\n        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build 
MMCV from source\n        run: pip install -e . -v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests and generate coverage report\n        run: |\n          coverage run --branch --source mmcv -m pytest tests/\n          coverage xml\n          coverage report -m\n  build_cu111:\n    runs-on: ubuntu-22.04\n    container:\n      image: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel\n    env:\n      FORCE_CUDA: 1\n      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61\n    strategy:\n      matrix:\n        python-version: [3.7]\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: pip install pip wheel --upgrade\n      - name: Fetch GPG keys\n        run: |\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n      - name: Install system dependencies\n        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests and generate coverage report\n        run: |\n          coverage run --branch --source mmcv -m pytest tests/\n          coverage xml\n          coverage report -m\n  build_cu116:\n    runs-on: ubuntu-22.04\n    container:\n      image: pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel\n    env:\n      FORCE_CUDA: 1\n      MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61\n    strategy:\n      matrix:\n        python-version: [3.7]\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: pip install pip wheel --upgrade\n      - name: Fetch GPG keys\n        run: |\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n      - name: Install system dependencies\n        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests and generate coverage report\n        run: |\n          coverage run --branch --source mmcv -m pytest tests\n          coverage xml\n          coverage report -m\n  build_windows_without_ops:\n    runs-on: windows-2019\n    env:\n      MMCV_WITH_OPS: 0\n    strategy:\n      matrix:\n        python-version: [3.7]\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: python -m pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py\n  build_windows:\n    runs-on: windows-2019\n    strategy:\n      matrix:\n        torch: [1.8.1, 2.1.0]\n        include:\n          - torch: 1.8.1\n            torchvision: 0.9.1\n            python-version: 3.7\n          - torch: 2.1.0\n            torchvision: 0.16.0\n            python-version: 3.8\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: python -m pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests/ --ignore tests/test_image/test_io.py\n  build_macos:\n    runs-on: macos-latest\n    strategy:\n      matrix:\n        torch: [1.8.1, 2.1.0]\n        include:\n          - torch: 1.8.1\n            torchvision: 0.9.1\n            python-version: 3.7\n          - torch: 2.1.0\n            torchvision: 0.16.0\n            python-version: 3.8\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install system dependencies\n        run: brew install ffmpeg jpeg-turbo\n      - name: Upgrade pip and wheel\n        run: pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }}\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . -v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests/\n"
  },
  {
    "path": ".github/workflows/pr_stage_test.yml",
    "content": "name: pr_stage_test\n\non:\n  pull_request:\n    paths-ignore:\n      - \".github/**.md\"\n      - \"docker/**\"\n      - \"docs/**\"\n      - 'examples/**'\n      - '.dev_scripts/**'\n      - \"README.md\"\n      - \"README_zh-CN.md\"\n      - \"CONTRIBUTING.md\"\n      - \".pre-commit-config.yaml\"\n      - \".pre-commit-config-zh-cn.yaml\"\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  build_cu121:\n    runs-on: ubuntu-22.04\n    container:\n      image: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04\n    strategy:\n      matrix:\n        python-version: [\"3.10\"]\n        torch: [\"2.5.1\"]\n    steps:\n      - uses: actions/checkout@v3\n      - name: Install basic tools\n        run: |\n          apt-get update\n          apt-get install -y wget build-essential git software-properties-common\n\n      # 安装特定版本的 Python\n      - name: Install Python ${{ matrix.python-version }}\n        run: |\n          add-apt-repository ppa:deadsnakes/ppa -y\n          apt-get update\n          apt-get install -y python${{ matrix.python-version }} python${{ matrix.python-version }}-dev python${{ matrix.python-version }}-distutils\n          update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${{ matrix.python-version }} 1\n          update-alternatives --install /usr/bin/python python /usr/bin/python${{ matrix.python-version }} 1\n          wget https://bootstrap.pypa.io/get-pip.py\n          python get-pip.py\n\n      # 安装 PyTorch\n      - name: Install PyTorch\n        run: |\n          pip install torch==${{ matrix.torch }} torchvision --index-url https://download.pytorch.org/whl/cu121\n      - name: Fetch GPG keys\n        run: |\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub\n          apt-key adv --fetch-keys 
https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n      - name: Install system dependencies\n        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMEngine from source\n        run: pip install -e . -v\n      - name: Install unit tests dependencies\n        run: |\n          pip install -r requirements/test.txt\n        # Distributed related unit test may randomly error in PyTorch 1.13.0\n      - name: Run unittests and generate coverage report\n        run: |\n          coverage run --branch --source mmengine -m pytest tests/  --ignore tests/test_dist/\n          coverage xml\n          coverage report -m\n\n  build_cu118:\n    runs-on: ubuntu-22.04\n    container:\n      image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04\n    strategy:\n      matrix:\n        python-version: [\"3.10\"]\n        torch: [\"2.0.0\", \"2.1.0\",\"2.3.1\"]\n    steps:\n      - uses: actions/checkout@v3\n      - name: Install basic tools\n        run: |\n          apt-get update\n          apt-get install -y wget build-essential git software-properties-common\n\n      # 安装特定版本的 Python\n      - name: Install Python ${{ matrix.python-version }}\n        run: |\n          add-apt-repository ppa:deadsnakes/ppa -y\n          apt-get update\n          apt-get install -y python${{ matrix.python-version }} python${{ matrix.python-version }}-dev python${{ matrix.python-version }}-distutils\n          update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${{ matrix.python-version }} 1\n          update-alternatives --install /usr/bin/python python /usr/bin/python${{ matrix.python-version }} 1\n          wget https://bootstrap.pypa.io/get-pip.py\n          python 
get-pip.py\n\n      # 安装 PyTorch\n      - name: Install PyTorch\n        run: |\n          pip install torch==${{ matrix.torch }} torchvision --index-url https://download.pytorch.org/whl/cu118\n      - name: Fetch GPG keys\n        run: |\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub\n          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n      - name: Install system dependencies\n        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMEngine from source\n        run: pip install -e . -v\n      - name: Install unit tests dependencies\n        run: |\n          pip install -r requirements/test.txt\n          pip install numpy==1.24.3\n        # Distributed related unit test may randomly error in PyTorch 1.13.0\n      - name: Run unittests and generate coverage report\n        run: |\n          coverage run --branch --source mmengine -m pytest tests/  --ignore tests/test_dist/\n          coverage xml\n          coverage report -m\n\n  build_windows_without_ops:\n    runs-on: windows-2019\n    env:\n      MMCV_WITH_OPS: 0\n    strategy:\n      matrix:\n        python-version: [3.7]\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: python -m pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install 
MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . -v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py\n  build_windows:\n    runs-on: windows-2019\n    strategy:\n      matrix:\n        torch: [1.8.1, 2.1.0]\n        include:\n          - torch: 1.8.1\n            torchvision: 0.9.1\n            python-version: 3.7\n          - torch: 2.1.0\n            torchvision: 0.16.0\n            python-version: 3.8\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Upgrade pip and wheel\n        run: python -m pip install pip wheel --upgrade\n      - name: Install PyTorch\n        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html\n      - name: Install MMEngine from main branch\n        run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n      - name: Install ninja to speed the compilation\n        run: pip install ninja psutil\n      - name: Build MMCV from source\n        run: pip install -e . 
-v\n      - name: Install unit tests dependencies\n        run: pip install -r requirements/test.txt\n      - name: Run unit tests\n        run: pytest tests/ --ignore tests/test_image/test_io.py\n  # build_macos:\n  #   runs-on: macos-latest\n  #   strategy:\n  #     matrix:\n  #       torch: [1.8.1, 2.1.0]\n  #       include:\n  #         - torch: 1.8.1\n  #           torchvision: 0.9.1\n  #           python-version: 3.7.1\n  #         - torch: 2.1.0\n  #           torchvision: 0.16.0\n  #           python-version: 3.8.1\n  #   steps:\n  #     - uses: actions/checkout@v2\n  #     - name: Set up Python ${{ matrix.python-version }}\n  #       uses: actions/setup-python@v2\n  #       with:\n  #         python-version: ${{ matrix.python-version }}\n  #     - name: Install system dependencies\n  #       run: brew install ffmpeg jpeg-turbo\n  #     - name: Upgrade pip and wheel\n  #       run: pip install pip wheel --upgrade\n  #     - name: Install PyTorch\n  #       run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }}\n  #     - name: Install MMEngine from main branch\n  #       run: pip install git+https://github.com/open-mmlab/mmengine.git@main\n  #     - name: Install ninja to speed the compilation\n  #       run: pip install ninja psutil\n  #     - name: Build MMCV from source\n  #       run: pip install -e . -v\n  #     - name: Install unit tests dependencies\n  #       run: pip install -r requirements/test.txt\n  #     - name: Run unit tests\n  #       run: pytest tests/\n"
  },
  {
    "path": ".github/workflows/publish-to-pypi.yml",
    "content": "name: deploy\n\non: push\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  build-n-publish_without_ops:\n    runs-on: ubuntu-22.04\n    if: startsWith(github.event.ref, 'refs/tags')\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python 3.7\n        uses: actions/setup-python@v1\n        with:\n          python-version: 3.7\n      - name: Upgrade Setuptools\n        run: pip install setuptools wheel --upgrade\n      - name: Build MMCV\n        run: |\n          sed -i \"s/os.getenv('MMCV_WITH_OPS', '1')/os.getenv('MMCV_WITH_OPS', '0')/g\" setup.py\n          python setup.py sdist bdist_wheel\n      - name: Publish distribution to PyPI\n        run: |\n          pip install twine\n          twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}\n\n  build-n-publish_with_ops:\n    runs-on: ubuntu-22.04\n    if: startsWith(github.event.ref, 'refs/tags')\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python 3.7\n        uses: actions/setup-python@v1\n        with:\n          python-version: 3.7\n      - name: Upgrade Setuptools\n        run: pip install setuptools --upgrade\n      - name: Build MMCV with ops\n        run: python setup.py sdist\n      - name: Publish distribution to PyPI\n        run: |\n          pip install twine\n          twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# PyTorch checkpoint\n*.pth\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\nmlu-ops/\nmlu-ops.*\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/en/_build/\ndocs/en/api/generated/\ndocs/zh_cn/_build/\ndocs/zh_cn/api/generated/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\n# editors and IDEs\n.idea/\n.vscode/\n\n# custom\n.DS_Store\n\n# datasets and logs and checkpoints\ndata/\nwork_dir/\n\nsrc/\n"
  },
  {
    "path": ".pre-commit-config-zh-cn.yaml",
    "content": "exclude: ^tests/data/\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit\n    rev: v4.0.0\n    hooks:\n      - id: validate_manifest\n  - repo: https://github.com/PyCQA/flake8\n    rev: 7.1.1\n    hooks:\n      - id: flake8\n  - repo: https://gitee.com/openmmlab/mirrors-isort\n    rev: 5.11.5\n    hooks:\n      - id: isort\n  - repo: https://gitee.com/openmmlab/mirrors-yapf\n    rev: v0.32.0\n    hooks:\n      - id: yapf\n  - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks\n    rev: v5.0.0\n    hooks:\n      - id: trailing-whitespace\n      - id: check-yaml\n      - id: end-of-file-fixer\n      - id: requirements-txt-fixer\n      - id: double-quote-string-fixer\n      - id: check-merge-conflict\n      - id: fix-encoding-pragma\n        args: [\"--remove\"]\n      - id: mixed-line-ending\n        args: [\"--fix=lf\"]\n  - repo: https://gitee.com/openmmlab/mirrors-codespell\n    rev: v2.2.1\n    hooks:\n      - id: codespell\n  - repo: https://gitee.com/openmmlab/mirrors-mdformat\n    rev: 0.7.9\n    hooks:\n      - id: mdformat\n        args: [\"--number\"]\n        additional_dependencies:\n          - mdformat-openmmlab\n          - mdformat_frontmatter\n          - linkify-it-py\n  - repo: https://gitee.com/openmmlab/mirrors-docformatter\n    # TODO:https://github.com/PyCQA/docformatter/issues/289\n    rev: v1.3.1\n    hooks:\n      - id: docformatter\n        args: [\"--in-place\", \"--wrap-descriptions\", \"79\"]\n  - repo: https://github.com/asottile/pyupgrade\n    rev: v3.0.0\n    hooks:\n      - id: pyupgrade\n        args: [\"--py36-plus\"]\n  - repo: https://gitee.com/openmmlab/pre-commit-hooks\n    rev: v0.2.0  # Use the ref you want to point at\n    hooks:\n      - id: check-copyright\n        args: [\"mmcv\", \"tests\", \"--excludes\", \"mmcv/ops\"]\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: v1.2.0\n    hooks:\n      - id: mypy\n        exclude: |-\n          (?x)(\n              ^test\n           
   | ^docs\n          )\n        additional_dependencies: [\"types-setuptools\", \"types-requests\"]\n  # - repo: local\n  #   hooks:\n  #     - id: clang-format\n  #       name: clang-format\n  #       description: Format files with ClangFormat\n  #       entry: clang-format -style=google -i\n  #       language: system\n  #       files: \\.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "exclude: ^tests/data/\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit\n    rev: v4.0.0\n    hooks:\n      - id: validate_manifest\n  - repo: https://github.com/PyCQA/flake8\n    rev: 7.1.1\n    hooks:\n      - id: flake8\n  - repo: https://github.com/PyCQA/isort\n    rev: 5.11.5\n    hooks:\n      - id: isort\n  - repo: https://github.com/pre-commit/mirrors-yapf\n    rev: v0.32.0\n    hooks:\n      - id: yapf\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v5.0.0\n    hooks:\n      - id: trailing-whitespace\n      - id: check-yaml\n      - id: end-of-file-fixer\n      - id: requirements-txt-fixer\n      - id: double-quote-string-fixer\n      - id: check-merge-conflict\n      - id: fix-encoding-pragma\n        args: [\"--remove\"]\n      - id: mixed-line-ending\n        args: [\"--fix=lf\"]\n  - repo: https://github.com/codespell-project/codespell\n    rev: v2.2.1\n    hooks:\n      - id: codespell\n  - repo: https://github.com/executablebooks/mdformat\n    rev: 0.7.9\n    hooks:\n      - id: mdformat\n        args: [\"--number\"]\n        additional_dependencies:\n          - mdformat-openmmlab\n          - mdformat_frontmatter\n          - linkify-it-py\n  - repo: https://github.com/myint/docformatter\n    rev: 06907d0\n    hooks:\n      - id: docformatter\n        args: [\"--in-place\", \"--wrap-descriptions\", \"79\"]\n  - repo: https://github.com/asottile/pyupgrade\n    rev: v3.0.0\n    hooks:\n      - id: pyupgrade\n        args: [\"--py36-plus\"]\n  - repo: https://github.com/open-mmlab/pre-commit-hooks\n    rev: v0.2.0  # Use the ref you want to point at\n    hooks:\n      - id: check-copyright\n        args: [\"mmcv\", \"tests\", \"--excludes\", \"mmcv/ops\"]\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: v1.2.0\n    hooks:\n      - id: mypy\n        exclude: |-\n          (?x)(\n              ^test\n              | ^docs\n          )\n        additional_dependencies: [\"types-setuptools\", 
\"types-requests\"]\n  # - repo: local\n  #   hooks:\n  #     - id: clang-format\n  #       name: clang-format\n  #       description: Format files with ClangFormat\n  #       entry: clang-format -style=google -i\n  #       language: system\n  #       files: \\.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$\n"
  },
  {
    "path": ".readthedocs.yml",
    "content": "version: 2\n\nformats: all\n\nbuild:\n  os: ubuntu-22.04\n  tools:\n    python: \"3.7\"\n\npython:\n  install:\n    - requirements: requirements/runtime.txt\n    - requirements: requirements/docs.txt\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.0\nmessage: \"If you use this software, please cite it as below.\"\nauthors:\n  - name: \"MMCV Contributors\"\ntitle: \"OpenMMLab Computer Vision Foundation\"\ndate-released: 2018-08-22\nurl: \"https://github.com/open-mmlab/mmcv\"\nlicense: Apache-2.0\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "## Contributing to OpenMMLab\n\nWelcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to\n\n**Fix bug**\n\nYou can directly post a Pull Request to fix typo in code or documents\n\nThe steps to fix the bug of code implementation are as follows.\n\n1. If the modification involve significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose an proper solution.\n\n2. Posting a pull request after fixing the bug and adding corresponding unit test.\n\n**New Feature or Enhancement**\n\n1. If the modification involve significant changes, you should create an issue to discuss with our developers to propose an proper design.\n2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test.\n\n**Document**\n\nYou can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable.\n\n### Pull Request Workflow\n\nIf you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)\n\n#### 1. 
Fork and clone\n\nIf you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile.\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png\" width=\"1200\">\n\nThen, you can clone the repositories to local:\n\n```shell\ngit clone git@github.com:{username}/mmcv.git\n```\n\nAfter that, you should add the official repository as the upstream repository\n\n```bash\ngit remote add upstream git@github.com:open-mmlab/mmcv\n```\n\nCheck whether remote repository has been added successfully by `git remote -v`\n\n```bash\norigin\tgit@github.com:{username}/mmcv.git (fetch)\norigin\tgit@github.com:{username}/mmcv.git (push)\nupstream\tgit@github.com:open-mmlab/mmcv (fetch)\nupstream\tgit@github.com:open-mmlab/mmcv (push)\n```\n\n> Here's a brief introduction to origin and upstream. When we use \"git clone\", we create an \"origin\" remote by default, which points to the repository cloned from. As for \"upstream\", we add it ourselves to point to the target repository. Of course, if you don't like the name \"upstream\", you could name it as you wish. Usually, we'll push the code to \"origin\". If the pushed code conflicts with the latest code in official(\"upstream\"), we should pull the latest code from upstream to resolve the conflicts, and then push to \"origin\" again. The posted Pull Request will be updated automatically.\n\n#### 2. Configure pre-commit\n\nYou should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. 
**Note**: The following code should be executed under the MMCV directory.\n\n```shell\npip install -U pre-commit\npre-commit install\n```\n\nCheck that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`.\n\n```shell\npre-commit run --all-files\n```\n\n<img src=\"https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png\" width=\"1200\">\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png\" width=\"1200\">\n\nIf the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation.\n\nIf the code does not conform to the code style specification, pre-commit will raise a warning and  fixes some of the errors automatically.\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png\" width=\"1200\">\n\nIf we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option(**only for temporarily commit**).\n\n```shell\ngit commit -m \"xxx\" --no-verify\n```\n\n#### 3. Create a development branch\n\nAfter configuring the pre-commit, we should create a branch based on the master branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name`\n\n```shell\ngit checkout -b yhc/refactor_contributing_doc\n```\n\nIn subsequent development, if the master branch of the local repository is behind the master branch of \"upstream\", we need to pull the upstream for synchronization, and then execute the above command:\n\n```shell\ngit pull upstream master\n```\n\n#### 4. Commit the code and pass the unit test\n\n- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. 
If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html).\n\n- The committed code should pass through the unit test\n\n  ```shell\n  # Pass all unit tests\n  pytest tests\n\n  # Pass the unit test of runner\n  pytest tests/test_runner/test_runner.py\n  ```\n\n  If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test)\n\n- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering)\n\n#### 5. Push the code to remote\n\nWe could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option.\n\n```shell\ngit push -u origin {branch_name}\n```\n\nThis will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository.\n\n#### 6. Create a Pull Request\n\n(1) Create a pull request in GitHub's Pull request interface\n\n<img src=\"https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png\" width=\"1200\">\n\n(2) Modify the PR description according to the guidelines so that other developers can better understand your changes\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png\" width=\"1200\">\n\nFind more details about Pull Request description in [pull request guidelines](#pr-specs).\n\n**note**\n\n(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)\n\n(b) If it is your first contribution, please sign the CLA\n\n<img 
src=\"https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png\" width=\"1200\">\n\n(c) Check whether the Pull Request passes the CI\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png\" width=\"1200\">\n\nMMCV will run unit test for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code.\n\n(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP.\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png\" width=\"1200\">\n\n#### 7. Resolve conflicts\n\nIf your local branch conflicts with the latest master branch of \"upstream\", you'll need to resolve them. There are two ways to do this:\n\n```shell\ngit fetch --all --prune\ngit rebase upstream/master\n```\n\nor\n\n```shell\ngit fetch --all --prune\ngit merge upstream/master\n```\n\nIf you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. 
If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts.\n\n### Guidance\n\n#### Unit test\n\nIf you cannot run the unit test of some modules for lacking of some dependencies, such as [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) module, you can try to install the following dependencies:\n\n```shell\n# Linux\nsudo apt-get update -y\nsudo apt-get install -y libturbojpeg\nsudo apt-get install -y ffmpeg\n\n# Windows\nconda install ffmpeg\n```\n\nWe should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test:\n\n```shell\npython -m coverage run -m pytest /path/to/test_file\npython -m coverage html\n# check file in htmlcov/index.html\n```\n\n#### Document rendering\n\nIf the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results:\n\n```shell\npip install -r requirements/docs.txt\ncd docs/zh_cn/\n# or docs/en\nmake html\n# check file in ./docs/zh_cn/_build/html/index.html\n```\n\n### Code style\n\n#### Python\n\nWe adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.\n\nWe use the following tools for linting and formatting:\n\n- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.\n- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.\n- [yapf](https://github.com/google/yapf): A formatter for Python files.\n- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.\n- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.\n- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.\n\nStyle configurations of yapf 
and isort can be found in [setup.cfg](./setup.cfg).\n\nWe use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`,\nfixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit.\nThe config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml).\n\n#### C++ and CUDA\n\nWe follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).\n\n### PR Specs\n\n1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style\n\n2. One short-time branch should be matched with only one PR\n\n3. Accomplish a detailed change in one PR. Avoid large PR\n\n   - Bad: Support Faster R-CNN\n   - Acceptable: Add a box head to Faster R-CNN\n   - Good: Add a parameter to box head to support custom conv-layer number\n\n4. Provide clear and significant commit message\n\n5. Provide clear and meaningful PR description\n\n   - Task name should be clarified in title. The general format is: \\[Prefix\\] Short description of the PR (Suffix)\n   - Prefix: add new feature \\[Feature\\], fix bug \\[Fix\\], related to documents \\[Docs\\], in developing \\[WIP\\] (which will not be reviewed temporarily)\n   - Introduce main changes, results and influences on other modules in short description\n   - Associate related issues and pull requests with a milestone\n"
  },
  {
    "path": "CONTRIBUTING_zh-CN.md",
    "content": "## 贡献代码\n\n欢迎加入 MMCV 社区，我们致力于打造最前沿的计算机视觉基础库，我们欢迎任何类型的贡献，包括但不限于\n\n**修复错误**\n\n修复代码实现错误的步骤如下：\n\n1. 如果提交的代码改动较大，建议先提交 issue，并正确描述 issue 的现象、原因和复现方式，讨论后确认修复方案。\n2. 修复错误并补充相应的单元测试，提交拉取请求。\n\n**新增功能或组件**\n\n1. 如果新功能或模块涉及较大的代码改动，建议先提交 issue，确认功能的必要性。\n2. 实现新增功能并添单元测试，提交拉取请求。\n\n**文档补充**\n\n修复文档可以直接提交拉取请求\n\n添加文档或将文档翻译成其他语言步骤如下\n\n1. 提交 issue，确认添加文档的必要性。\n2. 添加文档，提交拉取请求。\n\n### 拉取请求工作流\n\n如果你对拉取请求不了解，没关系，接下来的内容将会从零开始，一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式，可以参考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)\n\n#### 1. 复刻仓库\n\n当你第一次提交拉取请求时，先复刻 OpenMMLab 原代码库，点击 GitHub 页面右上角的 **Fork** 按钮，复刻后的代码库将会出现在你的 GitHub 个人主页下。\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png\" width=\"1200\">\n\n将代码克隆到本地\n\n```shell\ngit clone git@github.com:{username}/mmcv.git\n```\n\n添加原代码库为上游代码库\n\n```bash\ngit remote add upstream git@github.com:open-mmlab/mmcv\n```\n\n检查 remote 是否添加成功，在终端输入 `git remote -v`\n\n```bash\norigin\tgit@github.com:{username}/mmcv.git (fetch)\norigin\tgit@github.com:{username}/mmcv.git (push)\nupstream\tgit@github.com:open-mmlab/mmcv (fetch)\nupstream\tgit@github.com:open-mmlab/mmcv (push)\n```\n\n> 这里对 origin 和 upstream 进行一个简单的介绍，当我们使用 git clone 来克隆代码时，会默认创建一个 origin 的 remote，它指向我们克隆的代码库地址，而 upstream 则是我们自己添加的，用来指向原始代码库地址。当然如果你不喜欢他叫 upstream，也可以自己修改，比如叫 open-mmlab。我们通常向 origin 提交代码（即 fork 下来的远程仓库），然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突，再从 upstream 拉取最新的代码，和本地分支解决冲突，再提交到 origin。\n\n#### 2. 
配置 pre-commit\n\n在本地开发环境中，我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格，以确保代码风格的统一。在提交代码，需要先安装 pre-commit（需要在 MMCV 目录下执行）:\n\n```shell\npip install -U pre-commit\npre-commit install\n```\n\n检查 pre-commit 是否配置成功，并安装 `.pre-commit-config.yaml` 中的钩子：\n\n```shell\npre-commit run --all-files\n```\n\n<img src=\"https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png\" width=\"1200\">\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png\" width=\"1200\">\n\n> 如果你是中国用户，由于网络原因，可能会出现安装失败的情况，这时可以使用国内源\n\n> pre-commit install -c .pre-commit-config-zh-cn.yaml\n\n> pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml\n\n如果安装过程被中断，可以重复执行 `pre-commit run ...` 继续安装。\n\n如果提交的代码不符合代码风格规范，pre-commit 会发出警告，并自动修复部分错误。\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png\" width=\"1200\">\n\n如果我们想临时绕开 pre-commit 的检查提交一次代码，可以在 `git commit` 时加上 `--no-verify`（需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查）。\n\n```shell\ngit commit -m \"xxx\" --no-verify\n```\n\n#### 3. 创建开发分支\n\n安装完 pre-commit 之后，我们需要基于 master 创建开发分支，建议的分支命名规则为 `username/pr_name`。\n\n```shell\ngit checkout -b yhc/refactor_contributing_doc\n```\n\n在后续的开发中，如果本地仓库的 master 分支落后于 upstream 的 master 分支，我们需要先拉取 upstream 的代码进行同步，再执行上面的命令\n\n```shell\ngit pull upstream master\n```\n\n#### 4. 提交代码并在本地通过单元测试\n\n- MMCV 引入了 mypy 来做静态类型检查，以增加代码的鲁棒性。因此我们在提交代码时，需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。\n\n- 提交的代码同样需要通过单元测试\n\n  ```shell\n  # 通过全量单元测试\n  pytest tests\n\n  # 我们需要保证提交的代码能够通过修改模块的单元测试，以 runner 为例\n  pytest tests/test_runner/test_runner.py\n  ```\n\n  如果你由于缺少依赖无法运行修改模块的单元测试，可以参考[指引-单元测试](#单元测试)\n\n- 如果修改/添加了文档，参考[指引](#文档渲染)确认文档渲染正常。\n\n#### 5. 
推送代码到远程\n\n代码通过单元测试和 pre-commit 检查后，将代码推送到远程仓库，如果是第一次推送，可以在 `git push` 后加上 `-u` 参数以关联远程分支\n\n```shell\ngit push -u origin {branch_name}\n```\n\n这样下次就可以直接使用 `git push` 命令推送代码了，而无需指定分支和远程仓库。\n\n#### 6. 提交拉取请求（PR）\n\n(1) 在 GitHub 的 Pull request 界面创建拉取请求\n<img src=\"https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png\" width=\"1200\">\n\n(2) 根据指引修改 PR 描述，以便于其他开发者更好地理解你的修改\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png\" width=\"1200\">\n\n描述规范详见[拉取请求规范](#拉取请求规范)\n\n&#160;\n\n**注意事项**\n\n(a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响，并关联相关 Issue（具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)）\n\n(b) 如果是第一次为 OpenMMLab 做贡献，需要签署 CLA\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png\" width=\"1200\">\n\n(c) 检查提交的 PR 是否通过 CI（集成测试）\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png\" width=\"1200\">\n\nMMCV 会在不同的平台（Linux、Window、Mac），基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试，以保证代码的正确性，如果有任何一个没有通过，我们可点击上图中的 `Details` 来查看具体的测试信息，以便于我们修改代码。\n\n(3) 如果 PR 通过了 CI，那么就可以等待其他开发者的 review，并根据 reviewer 的意见，修改代码，并重复 [4](#4-提交代码并本地通过单元测试)-[5](#5-推送代码到远程) 步骤，直到 reviewer 同意合入 PR。\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png\" width=\"1200\">\n\n所有 reviewer 同意合入 PR 后，我们会尽快将 PR 合并到主分支。\n\n#### 7. 
解决冲突\n\n随着时间的推移，我们的代码库会不断更新，这时候，如果你的 PR 与主分支存在冲突，你需要解决冲突，解决冲突的方式有两种：\n\n```shell\ngit fetch --all --prune\ngit rebase upstream/master\n```\n\n或者\n\n```shell\ngit fetch --all --prune\ngit merge upstream/master\n```\n\n如果你非常善于处理冲突，那么可以使用 rebase 的方式来解决冲突，因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用，那么可以使用 `merge` 的方式来解决冲突。\n\n### 指引\n\n#### 单元测试\n\n如果你无法正常执行部分模块的单元测试，例如 [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) 模块，可能是你的当前环境没有安装以下依赖\n\n```shell\n# Linux\nsudo apt-get update -y\nsudo apt-get install -y libturbojpeg\nsudo apt-get install -y ffmpeg\n\n# Windows\nconda install ffmpeg\n```\n\n在提交修复代码错误或新增特性的拉取请求时，我们应该尽可能的让单元测试覆盖所有提交的代码，计算单元测试覆盖率的方法如下\n\n```shell\npython -m coverage run -m pytest /path/to/test_file\npython -m coverage html\n# check file in htmlcov/index.html\n```\n\n#### 文档渲染\n\n在提交修复代码错误或新增特性的拉取请求时，可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。\n本地生成渲染后的文档的方法如下\n\n```shell\npip install -r requirements/docs.txt\ncd docs/zh_cn/\n# or docs/en\nmake html\n# check file in ./docs/zh_cn/_build/html/index.html\n```\n\n### 代码风格\n\n#### Python\n\n[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范，我们使用以下工具检查和格式化代码\n\n- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具，是多个检查工具的封装\n- [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具\n- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具\n- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误\n- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具\n- [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具\n\nyapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到\n\n通过配置 [pre-commit hook](https://pre-commit.com/) ，我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`，\n修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`，调整 `requirments.txt` 的包顺序。\npre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 
找到。\n\npre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。\n\n更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。\n\n#### C++ and CUDA\n\nC++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)\n\n### 拉取请求规范\n\n1. 使用 [pre-commit hook](https://pre-commit.com)，尽量减少代码风格相关问题\n\n2. 一个`拉取请求`对应一个短期分支\n\n3. 粒度要细，一个`拉取请求`只做一件事情，避免超大的`拉取请求`\n\n   - Bad：实现 Faster R-CNN\n   - Acceptable：给 Faster R-CNN 添加一个 box head\n   - Good：给 box head 增加一个参数来支持自定义的 conv 层数\n\n4. 每次 Commit 时需要提供清晰且有意义的 commit 信息\n\n5. 提供清晰且有意义的`拉取请求`描述\n\n   - 标题写明白任务名称，一般格式:\\[Prefix\\] Short description of the pull request (Suffix)\n   - prefix: 新增功能 \\[Feature\\], 修 bug \\[Fix\\], 文档相关 \\[Docs\\], 开发中 \\[WIP\\] (暂时不会被review)\n   - 描述里介绍`拉取请求`的主要修改内容，结果，以及对其他部分的影响, 参考`拉取请求`模板\n   - 关联相关的`议题` (issue) 和其他`拉取请求`\n\n6. 如果引入了其他三方库，或借鉴了三方库的代码，请确认他们的许可证和 mmcv 兼容，并在借鉴的代码上补充 `This code is inspired from http://`\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) OpenMMLab. All rights reserved\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean 
any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2018-2020 Open-MMLab. All rights reserved.\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "LICENSES.md",
    "content": "# Licenses for special operations\n\nIn this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters.\n\n|    Operation     |                                                                             Files                                                                              |    License     |\n| :--------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: |\n|    upfirdn2d     |          [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu)          | NVIDIA License |\n| fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License |\n|     bias_act     |             [mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu)             | NVIDIA License |\n|  filtered_lrelu  |            [mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu)            | NVIDIA License |\n|  conv2d_gradfix  |                              [mmcv/ops/conv2d_gradfix.py](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/conv2d_gradfix.py)                              | NVIDIA License |\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include requirements/runtime.txt\ninclude mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp\ninclude mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp\ninclude mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp\ninclude mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm\nrecursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n  <img src=\"https://raw.githubusercontent.com/open-mmlab/mmcv/main/docs/en/mmcv-logo.png\" width=\"300\"/>\n  <div>&nbsp;</div>\n  <div align=\"center\">\n    <b><font size=\"5\">OpenMMLab website</font></b>\n    <sup>\n      <a href=\"https://openmmlab.com\">\n        <i><font size=\"4\">HOT</font></i>\n      </a>\n    </sup>\n    &nbsp;&nbsp;&nbsp;&nbsp;\n    <b><font size=\"5\">OpenMMLab platform</font></b>\n    <sup>\n      <a href=\"https://platform.openmmlab.com\">\n        <i><font size=\"4\">TRY IT OUT</font></i>\n      </a>\n    </sup>\n  </div>\n  <div>&nbsp;</div>\n\n[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/en/latest/get_started/installation.html)\n[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)\n[![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/)\n[![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads)\n[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)\n[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)\n[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)\n[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)\n\n[📘Documentation](https://mmcv.readthedocs.io/en/latest/) |\n[🛠️Installation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) |\n[🤔Reporting Issues](https://github.com/open-mmlab/mmcv/issues/new/choose)\n\n</div>\n\n<div align=\"center\">\n\nEnglish | [简体中文](README_zh-CN.md)\n\n</div>\n\n## Highlights\n\nThe OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) 
at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process.\n\nMMCV v2.0.0 official version was released on April 6, 2023. In version 2.x, it removed components related to the training process and added a data transformation module. Also, starting from 2.x, it renamed the package names **mmcv** to **mmcv-lite** and **mmcv-full** to **mmcv**. For details, see [Compatibility Documentation](docs/en/compatibility.md).\n\nMMCV will maintain both [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (corresponding to the original [master](https://github.com/open-mmlab/mmcv/tree/master) branch) and **2.x** (corresponding to the **main** branch, now the default branch) versions simultaneously. For details, see [Branch Maintenance Plan](README.md#branch-maintenance-plan).\n\n## Introduction\n\nMMCV is a foundational library for computer vision research and it provides the following functionalities:\n\n- [Image/Video processing](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_process.html)\n- [Image and annotation visualization](https://mmcv.readthedocs.io/en/latest/understand_mmcv/visualization.html)\n- [Image transformation](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_transform.html)\n- [Various CNN architectures](https://mmcv.readthedocs.io/en/latest/understand_mmcv/cnn.html)\n- [High-quality implementation of common CPU and CUDA ops](https://mmcv.readthedocs.io/en/latest/understand_mmcv/ops.html)\n\nIt supports the following systems:\n\n- Linux\n- Windows\n- macOS\n\nSee the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage.\n\nNote: MMCV requires Python 3.7+.\n\n## Installation\n\nThere are two versions of MMCV:\n\n- **mmcv**: comprehensive, with full features and various CUDA ops out of 
the box. It takes longer to build.\n- **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\\<1.0.0. It is useful when you do not need those CUDA ops.\n\n**Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`.\n\n### Install mmcv\n\nBefore installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). For apple silicon users, please use PyTorch 1.13+.\n\nThe command to install mmcv:\n\n```bash\npip install -U openmim\nmim install mmcv\n```\n\nIf you need to specify the version of mmcv, you can use the following command:\n\n```bash\nmim install mmcv==2.0.0\n```\n\nIf you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html).\n\n<details>\n<summary>Installation log using pre-built packages</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv<br />\n<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>\n\n</details>\n\n<details>\n<summary>Installation log using source packages</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv==2.0.0<br />\n<b>Downloading mmcv-2.0.0.tar.gz</b>\n\n</details>\n\nFor more installation methods, please refer to the [Installation documentation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html).\n\n### Install 
mmcv-lite\n\nIf you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).\n\n```bash\npip install -U openmim\nmim install mmcv-lite\n```\n\n## FAQ\n\nIf you face some installation issues, CUDA related issues or RuntimeErrors,\nyou may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html).\n\nIf you face installation problems or runtime issues, you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html) to see if there is a solution. If the problem is still not solved, feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).\n\n## Citation\n\nIf you find this project useful in your research, please consider citing:\n\n```latex\n@misc{mmcv,\n    title={{MMCV: OpenMMLab} Computer Vision Foundation},\n    author={MMCV Contributors},\n    howpublished = {\\url{https://github.com/open-mmlab/mmcv}},\n    year={2018}\n}\n```\n\n## Contributing\n\nWe appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline.\n\n## License\n\nMMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) and check it carefully if you are using our code for commercial purposes.\n\n## Branch Maintenance Plan\n\nMMCV currently has four branches, namely main, 1.x, master, and 2.x, where 2.x is an alias for the main branch, and master is an alias for the 1.x branch. The 2.x and master branches will be deleted in the future. 
MMCV's branches go through the following three stages:\n\n| Phase                | Time                  | Branch                                                                                                                              | description                                                                                                                                            |\n| -------------------- | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |\n| RC Period            | 2022.9.1 - 2023.4.5   | Release candidate code (2.x version) will be released on 2.x branch. Default master branch is still 1.x version                     | Master and 2.x branches iterate normally                                                                                                               |\n| Compatibility Period | 2023.4.6 - 2023.12.31 | **The 2.x branch has been renamed to the main branch and set as the default branch**, and 1.x branch will correspond to 1.x version | We still maintain the old version 1.x, respond to user needs, but try not to introduce changes that break compatibility; main branch iterates normally |\n| Maintenance Period   | From 2024/1/1         | Default main branch corresponds to 2.x version and 1.x branch is 1.x version                                                        | 1.x branch is in maintenance phase, no more new feature support; main branch is iterating normally                                                     |\n\n## Projects in OpenMMLab\n\n- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.\n- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.\n- 
[MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.\n- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.\n- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.\n- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.\n- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.\n- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.\n- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.\n- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.\n- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.\n- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.\n- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.\n- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.\n- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.\n- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.\n- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.\n- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.\n- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.\n- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.\n- 
[MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.\n"
  },
  {
    "path": "README_zh-CN.md",
    "content": "<div align=\"center\">\n  <img src=\"https://raw.githubusercontent.com/open-mmlab/mmcv/main/docs/en/mmcv-logo.png\" width=\"300\"/>\n  <div>&nbsp;</div>\n  <div align=\"center\">\n    <b><font size=\"5\">OpenMMLab 官网</font></b>\n    <sup>\n      <a href=\"https://openmmlab.com\">\n        <i><font size=\"4\">HOT</font></i>\n      </a>\n    </sup>\n    &nbsp;&nbsp;&nbsp;&nbsp;\n    <b><font size=\"5\">OpenMMLab 开放平台</font></b>\n    <sup>\n      <a href=\"https://platform.openmmlab.com\">\n        <i><font size=\"4\">TRY IT OUT</font></i>\n      </a>\n    </sup>\n  </div>\n  <div>&nbsp;</div>\n\n[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html)\n[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)\n[![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/)\n[![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads)\n[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)\n[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)\n[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)\n[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)\n\n[📘使用文档](https://mmcv.readthedocs.io/zh_CN/latest/) |\n[🛠️安装教程](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html) |\n[🤔报告问题](https://github.com/open-mmlab/mmcv/issues/new/choose)\n\n</div>\n\n<div align=\"center\">\n\n[English](README.md) | 简体中文\n\n</div>\n\n<div align=\"center\">\n  <a href=\"https://openmmlab.medium.com/\" style=\"text-decoration:none;\">\n    <img 
src=\"https://user-images.githubusercontent.com/25839884/219255827-67c1a27f-f8c5-46a9-811d-5e57448c61d1.png\" width=\"3%\" alt=\"\" /></a>\n  <img src=\"https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png\" width=\"3%\" alt=\"\" />\n  <a href=\"https://discord.gg/raweFPmdzG\" style=\"text-decoration:none;\">\n    <img src=\"https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png\" width=\"3%\" alt=\"\" /></a>\n  <img src=\"https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png\" width=\"3%\" alt=\"\" />\n  <a href=\"https://twitter.com/OpenMMLab\" style=\"text-decoration:none;\">\n    <img src=\"https://user-images.githubusercontent.com/25839884/218346637-d30c8a0f-3eba-4699-8131-512fb06d46db.png\" width=\"3%\" alt=\"\" /></a>\n  <img src=\"https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png\" width=\"3%\" alt=\"\" />\n  <a href=\"https://www.youtube.com/openmmlab\" style=\"text-decoration:none;\">\n    <img src=\"https://user-images.githubusercontent.com/25839884/218346691-ceb2116a-465a-40af-8424-9f30d2348ca9.png\" width=\"3%\" alt=\"\" /></a>\n  <img src=\"https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png\" width=\"3%\" alt=\"\" />\n  <a href=\"https://space.bilibili.com/1293512903\" style=\"text-decoration:none;\">\n    <img src=\"https://user-images.githubusercontent.com/25839884/219026751-d7d14cce-a7c9-4e82-9942-8375fca65b99.png\" width=\"3%\" alt=\"\" /></a>\n  <img src=\"https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png\" width=\"3%\" alt=\"\" />\n  <a href=\"https://www.zhihu.com/people/openmmlab\" style=\"text-decoration:none;\">\n    <img src=\"https://user-images.githubusercontent.com/25839884/219026120-ba71e48b-6e94-4bd4-b4e9-b7d175b5e362.png\" width=\"3%\" 
alt=\"\" /></a>\n</div>\n\n## Highlights\n\nOpenMMLab 团队于 2022 年 9 月 1 日在世界人工智能大会发布了新一代训练引擎 [MMEngine](https://github.com/open-mmlab/mmengine)，它是一个用于训练深度学习模型的基础库。相比于 MMCV，它提供了更高级且通用的训练器、接口更加统一的开放架构以及可定制化程度更高的训练流程。\n\nMMCV v2.0.0 正式版本于 2023 年 4 月 6 日发布。在 2.x 版本中，它删除了和训练流程相关的组件，并新增了数据变换模块。另外，从 2.x 版本开始，重命名包名 **mmcv** 为 **mmcv-lite** 以及 **mmcv-full** 为 **mmcv**。详情见[兼容性文档](docs/zh_cn/compatibility.md)。\n\nMMCV 会同时维护 [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (对应原 [master](https://github.com/open-mmlab/mmcv/tree/master) 分支) 和 **2.x**（对应 **main** 分支，现为默认分支）版本，详情见[分支维护计划](README_zh-CN.md#分支维护计划)。\n\n## 简介\n\nMMCV 是一个面向计算机视觉的基础库，它提供了以下功能：\n\n- [图像和视频处理](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_process.html)\n- [图像和标注结果可视化](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/visualization.html)\n- [图像变换](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_transform.html)\n- [多种 CNN 网络结构](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/cnn.html)\n- [高质量实现的常见 CUDA 算子](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/ops.html)\n\nMMCV 支持多种平台，包括：\n\n- Linux\n- Windows\n- macOS\n\n如想了解更多特性和使用，请参考[文档](http://mmcv.readthedocs.io/zh_CN/latest)。\n\n提示: MMCV 需要 Python 3.7 以上版本。\n\n## 安装\n\nMMCV 有两个版本：\n\n- **mmcv**: 完整版，包含所有的特性以及丰富的开箱即用的 CUDA 算子。注意完整版本可能需要更长时间来编译。\n- **mmcv-lite**: 精简版，不包含 CUDA 算子但包含其余所有特性和功能，类似 MMCV 1.0 之前的版本。如果你不需要使用 CUDA 算子的话，精简版可以作为一个考虑选项。\n\n**注意**: 请不要在同一个环境中安装两个版本，否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前，需要先卸载另一个。`如果 CUDA 可用，强烈推荐安装 mmcv`。\n\n### 安装 mmcv\n\n在安装 mmcv 之前，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。如果你使用的是搭载 apple silicon 的 mac 设备，请安装 PyTorch 1.13+ 的版本。\n\n安装 mmcv 的命令如下：\n\n```bash\npip install -U openmim\nmim install mmcv\n```\n\n如果需要指定 mmcv 的版本，可以使用以下命令\n\n```bash\nmim install mmcv==2.0.0\n```\n\n如果发现上述的安装命令没有使用预编译包（以 `.whl` 结尾）而是使用源码包（以 `.tar.gz` 结尾）安装，则有可能是我们没有提供和当前环境的 PyTorch 版本、CUDA 版本相匹配的 mmcv 预编译包，此时，你可以[源码安装 
mmcv](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。\n\n<details>\n<summary>使用预编译包的安装日志</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv<br />\n<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>\n\n</details>\n\n<details>\n<summary>使用源码包的安装日志</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv==2.0.0<br />\n<b>Downloading mmcv-2.0.0.tar.gz</b>\n\n</details>\n\n更多安装方式请参考[安装文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html)。\n\n### 安装 mmcv-lite\n\n如果你需要使用和 PyTorch 相关的模块，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。\n\n```bash\npip install -U openmim\nmim install mmcv-lite\n```\n\n## FAQ\n\n如果你遇到了安装问题或者运行时问题，请查看[问题解决页面](https://mmcv.readthedocs.io/zh_CN/latest/faq.html)是否已有解决方案。如果问题仍然没有解决，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。\n\n## 贡献指南\n\n我们感谢所有的贡献者为改进和提升 MMCV 所作出的努力。请参考[贡献指南](CONTRIBUTING.md)来了解参与项目贡献的相关指引。\n\n## 许可证\n\n`MMCV` 目前以 Apache 2.0 的许可证发布，但是其中有一部分功能并不是使用的 Apache2.0 许可证，我们在 [许可证](LICENSES.md) 中详细地列出了这些功能以及他们对应的许可证，如果您正在从事盈利性活动，请谨慎参考此文档。\n\n## 分支维护计划\n\nMMCV 目前有四个分支，分别是 main、1.x、master 和 2.x，其中 2.x 为 main 分支的别名，master 为 1.x 分支的别名，2.x 和 master 这两个分支在将来会被删除。MMCV 的分支经历以下三个阶段：\n\n| 阶段   | 时间                  | 分支                                                                  | 说明                                                                                                   |\n| ------ | --------------------- | --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |\n| 公测期 | 2022.9.1 - 2023.4.5   | 公测版代码发布在 2.x 分支；默认主分支 master 仍对应 1.x 版本          | master 和 2.x 分支正常进行迭代                                                     
                    |\n| 兼容期 | 2023.4.6 - 2023.12.31 | **2.x 分支重命名为 main 分支并设置为默认分支**；1.x 分支对应 1.x 版本 | 保持对旧版本 1.x 的维护和开发，响应用户需求，但尽量不引进破坏旧版本兼容性的改动；main 分支正常进行迭代 |\n| 维护期 | 2024.1.1 - 待定       | 默认主分支 main 为 2.x 版本；1.x 分支对应 1.x 版本                    | 1.x 分支进入维护阶段，不再进行新功能支持；main 分支正常进行迭代                                        |\n\n## OpenMMLab 的其他项目\n\n- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库\n- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库\n- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口\n- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱\n- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱\n- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台\n- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准\n- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准\n- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱\n- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱\n- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱\n- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准\n- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准\n- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准\n- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准\n- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱\n- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台\n- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准\n- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱\n- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱\n- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 
模型部署框架\n\n## 欢迎加入 OpenMMLab 社区\n\n扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab)，加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=K0QI8ByU)，或添加微信小助手“OpenMMLabwx”加入官方交流微信群。\n\n<div align=\"center\">\n<img src=\"https://user-images.githubusercontent.com/25839884/205870927-39f4946d-8751-4219-a4c0-740117558fd7.jpg\" height=\"400\" />  <img src=\"https://user-images.githubusercontent.com/25839884/203904835-62392033-02d4-4c73-a68c-c9e4c1e2b07f.jpg\" height=\"400\" /> <img src=\"https://user-images.githubusercontent.com/25839884/205872898-e2e6009d-c6bb-4d27-8d07-117e697a3da8.jpg\" height=\"400\" />\n</div>\n\n我们会在 OpenMMLab 社区为大家\n\n- 📢 分享 AI 框架的前沿核心技术\n- 💻 解读 PyTorch 常用模块源码\n- 📰 发布 OpenMMLab 的相关新闻\n- 🚀 介绍 OpenMMLab 开发的前沿算法\n- 🏃 获取更高效的问题答疑和意见反馈\n- 🔥 提供与各行各业开发者充分交流的平台\n\n干货满满 📘，等你来撩 💗，OpenMMLab 社区期待您的加入 👬\n"
  },
  {
    "path": "TERMINOLOGY.md",
    "content": "# English-Chinese terminology comparison (英汉术语对照)\n\nThis document is used as a reference for English-Chinese terminology translation.\n\n该文档用作中英文翻译对照参考。\n\n|      English      |     中文     |\n| :---------------: | :----------: |\n|    annotation     |     标注     |\n|     backbone      |   主干网络   |\n|     benchmark     |   基准测试   |\n|    checkpoint     | 模型权重文件 |\n|    classifier     |    分类器    |\n|     cls_head      |    分类头    |\n|      decoder      |    解码器    |\n|     detector      |    检测器    |\n|      encoder      |    编码器    |\n|     finetune      |     微调     |\n|   ground truth    |   真实标签   |\n|       hook        |     钩子     |\n|     localizer     |    定位器    |\n|       neck        |   模型颈部   |\n|     pipeline      |    流水线    |\n|    recognizer     |    识别器    |\n|     register      |    注册器    |\n|     schedule      |     调整     |\n|     scheduler     |    调度器    |\n|     segmentor     |    分割器    |\n|      tensor       |     张量     |\n| training schedule |   训练策略   |\n"
  },
  {
    "path": "docker/README.md",
    "content": "# Docker images\n\nThere are two `Dockerfile` files to build docker images, one to build an image with the mmcv pre-built package and the other with the mmcv development environment.\n\n```text\n.\n|-- README.md\n|-- dev  # build with mmcv development environment\n|   `-- Dockerfile\n`-- release  # build with mmcv pre-built package\n    `-- Dockerfile\n```\n\n## Build docker images\n\n### Build with mmcv pre-built package\n\nBuild with local repository\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git && cd mmcv\ndocker build -t mmcv -f docker/release/Dockerfile .\n```\n\nOr build with remote repository\n\n```bash\ndocker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release\n```\n\nThe [Dockerfile](release/Dockerfile) installs the latest released version of mmcv by default, but you can specify mmcv versions to install expected versions.\n\n```bash\ndocker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 .\n```\n\nIf you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images.\n\nAn example to build an image with PyTorch 1.9.0 and CUDA 11.1.\n\n```bash\ndocker build -t mmcv -f docker/release/Dockerfile \\\n    --build-arg PYTORCH=1.9.0 \\\n    --build-arg CUDA=11.1 \\\n    --build-arg CUDNN=8 \\\n    --build-arg MMCV=2.0.0rc1 .\n```\n\nMore available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).\n\n### Build with mmcv development environment\n\nIf you want to build a docker image with the mmcv development environment, you can use the following command\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git && cd mmcv\ndocker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 .\n```\n\nNote that `CUDA_ARCH` is the compute capability of your GPU and you can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute).\n\nThe building process may 
take 10 minutes or more.\n\n## Run images\n\n```bash\ndocker run --gpus all --shm-size=8g -it mmcv\n```\n\nSee [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usages.\n"
  },
  {
    "path": "docker/dev/Dockerfile",
    "content": "ARG PYTORCH=\"1.8.1\"\nARG CUDA=\"10.2\"\nARG CUDNN=\"7\"\n\nFROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel\n\n# To fix GPG key error when running apt-get update\nRUN rm /etc/apt/sources.list.d/cuda.list \\\n    && rm /etc/apt/sources.list.d/nvidia-ml.list \\\n    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \\\n    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n\n# Install git and system dependencies for opencv-python\nRUN apt-get update && apt-get install -y git \\\n    && apt-get update && apt-get install -y libgl1 libglib2.0-0\n\n# Install system dependencies for unit tests\nRUN apt-get install -y ffmpeg libturbojpeg \\\n    && apt-get clean \\\n    && rm -rf /var/lib/apt/lists/*\n\n# build mmcv from source with develop mode\nARG HTTPS_PROXY=\"\"\nENV https_proxy=${HTTPS_PROXY}\nENV FORCE_CUDA=\"1\"\nARG CUDA_ARCH=\"\"\nENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH}\nRUN git clone https://github.com/open-mmlab/mmcv.git /mmcv\nWORKDIR /mmcv\nRUN git checkout 2.x && git rev-parse --short HEAD\nRUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install\n"
  },
  {
    "path": "docker/release/Dockerfile",
    "content": "ARG PYTORCH=\"1.8.1\"\nARG CUDA=\"10.2\"\nARG CUDNN=\"7\"\n\nFROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel\n\n# To fix GPG key error when running apt-get update\nRUN rm /etc/apt/sources.list.d/cuda.list \\\n    && rm /etc/apt/sources.list.d/nvidia-ml.list \\\n    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \\\n    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub\n\n# Install system dependencies for opencv-python\nRUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \\\n    && apt-get clean \\\n    && rm -rf /var/lib/apt/lists/*\n\n# Install mmcv\nARG MMCV=\"\"\nRUN if [ \"${MMCV}\" = \"\" ]; then pip install -U openmim && mim install 'mmcv>=2.0.0rc1'; else pip install -U openmim && mim install mmcv==${MMCV}; fi\n\n# Verify the installation\nRUN python -c 'import mmcv;print(mmcv.__version__)'\n"
  },
  {
    "path": "docs/en/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nSOURCEDIR     = .\nBUILDDIR      = _build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/en/_static/css/readthedocs.css",
    "content": ".header-logo {\n    background-image: url(\"../image/mmcv-logo.png\");\n    background-size: 85px 40px;\n    height: 40px;\n    width: 85px;\n}\n\ntable.colwidths-auto td {\n    width: 50%\n}\n"
  },
  {
    "path": "docs/en/_static/version.json",
    "content": "{\n    \"Linux\": [\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.4.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.4.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                
\"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.5\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": 
\"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.0\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n       
     ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n      
  {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"9.2\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"9.2\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"9.2\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.4.x\",\n            \"mmcv\": [\n          
      \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                
\"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        }\n    ],\n    \"Windows\": [\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": 
\"12.1\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                
\"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.5\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": 
\"11.1\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\"\n            ]\n        },\n        
{\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n         
   ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                
\"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        }\n    ],\n    \"macOS\": [\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"mps\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        
},\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        }\n    ]\n}\n"
  },
  {
    "path": "docs/en/_templates/classtemplate.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n.. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n    :members:\n\n\n..\n  autogenerated from source/_templates/classtemplate.rst\n  note it does not have :inherited-members:\n"
  },
  {
    "path": "docs/en/api/arraymisc.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.arraymisc\n===================================\n\n.. contents:: mmcv.arraymisc\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.arraymisc\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   quantize\n   dequantize\n"
  },
  {
    "path": "docs/en/api/cnn.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.cnn\n===================================\n\n.. contents:: mmcv.cnn\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.cnn\n\nModule\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   ContextBlock\n   Conv2d\n   Conv3d\n   ConvAWS2d\n   ConvModule\n   ConvTranspose2d\n   ConvTranspose3d\n   ConvWS2d\n   DepthwiseSeparableConvModule\n   GeneralizedAttention\n   HSigmoid\n   HSwish\n   LayerScale\n   Linear\n   MaxPool2d\n   MaxPool3d\n   NonLocal1d\n   NonLocal2d\n   NonLocal3d\n   Scale\n   Swish\n   Conv2dRFSearchOp\n\nBuild Function\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   build_activation_layer\n   build_conv_layer\n   build_norm_layer\n   build_padding_layer\n   build_plugin_layer\n   build_upsample_layer\n\nMiscellaneous\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   fuse_conv_bn\n   conv_ws_2d\n   is_norm\n   make_res_layer\n   make_vgg_layer\n   get_model_complexity_info\n"
  },
  {
    "path": "docs/en/api/image.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.image\n===================================\n\n.. contents:: mmcv.image\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.image\n\nIO\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   imfrombytes\n   imread\n   imwrite\n   use_backend\n\nColor Space\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   bgr2gray\n   bgr2hls\n   bgr2hsv\n   bgr2rgb\n   bgr2ycbcr\n   gray2bgr\n   gray2rgb\n   hls2bgr\n   hsv2bgr\n   imconvert\n   rgb2bgr\n   rgb2gray\n   rgb2ycbcr\n   ycbcr2bgr\n   ycbcr2rgb\n\nGeometric\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   cutout\n   imcrop\n   imflip\n   impad\n   impad_to_multiple\n   imrescale\n   imresize\n   imresize_like\n   imresize_to_multiple\n   imrotate\n   imshear\n   imtranslate\n   rescale_size\n\nPhotometric\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   adjust_brightness\n   adjust_color\n   adjust_contrast\n   adjust_hue\n   adjust_lighting\n   adjust_sharpness\n   auto_contrast\n   clahe\n   imdenormalize\n   imequalize\n   iminvert\n   imnormalize\n   lut_transform\n   posterize\n   solarize\n\nMiscellaneous\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   tensor2imgs\n"
  },
  {
    "path": "docs/en/api/ops.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.ops\n===================================\n\n.. contents:: mmcv.ops\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.ops\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   BorderAlign\n   CARAFE\n   CARAFENaive\n   CARAFEPack\n   Conv2d\n   ConvTranspose2d\n   CornerPool\n   Correlation\n   CrissCrossAttention\n   DeformConv2d\n   DeformConv2dPack\n   DeformRoIPool\n   DeformRoIPoolPack\n   DynamicScatter\n   FusedBiasLeakyReLU\n   GroupAll\n   Linear\n   MaskedConv2d\n   MaxPool2d\n   ModulatedDeformConv2d\n   ModulatedDeformConv2dPack\n   ModulatedDeformRoIPoolPack\n   MultiScaleDeformableAttention\n   PSAMask\n   PointsSampler\n   PrRoIPool\n   QueryAndGroup\n   RiRoIAlignRotated\n   RoIAlign\n   RoIAlignRotated\n   RoIAwarePool3d\n   RoIPointPool3d\n   RoIPool\n   SAConv2d\n   SigmoidFocalLoss\n   SimpleRoIAlign\n   SoftmaxFocalLoss\n   SparseConv2d\n   SparseConv3d\n   SparseConvTensor\n   SparseConvTranspose2d\n   SparseConvTranspose3d\n   SparseInverseConv2d\n   SparseInverseConv3d\n   SparseMaxPool2d\n   SparseMaxPool3d\n   SparseModule\n   SparseSequential\n   SubMConv2d\n   SubMConv3d\n   SyncBatchNorm\n   TINShift\n   Voxelization\n\n.. 
autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   active_rotated_filter\n   assign_score_withk\n   ball_query\n   batched_nms\n   bbox_overlaps\n   border_align\n   box_iou_rotated\n   boxes_iou3d\n   boxes_iou_bev\n   boxes_overlap_bev\n   carafe\n   carafe_naive\n   chamfer_distance\n   contour_expand\n   convex_giou\n   convex_iou\n   deform_conv2d\n   deform_roi_pool\n   diff_iou_rotated_2d\n   diff_iou_rotated_3d\n   dynamic_scatter\n   furthest_point_sample\n   furthest_point_sample_with_dist\n   fused_bias_leakyrelu\n   gather_points\n   grouping_operation\n   knn\n   masked_conv2d\n   min_area_polygons\n   modulated_deform_conv2d\n   nms\n   nms3d\n   nms3d_normal\n   nms_bev\n   nms_match\n   nms_normal_bev\n   nms_rotated\n   pixel_group\n   point_sample\n   points_in_boxes_all\n   points_in_boxes_cpu\n   points_in_boxes_part\n   points_in_polygons\n   prroi_pool\n   rel_roi_point_to_rel_img_point\n   riroi_align_rotated\n   roi_align\n   roi_align_rotated\n   roi_pool\n   rotated_feature_align\n   scatter_nd\n   sigmoid_focal_loss\n   soft_nms\n   softmax_focal_loss\n   three_interpolate\n   three_nn\n   tin_shift\n   upfirdn2d\n   voxelization\n"
  },
  {
    "path": "docs/en/api/transforms.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.transforms\n===================================\n\n.. currentmodule:: mmcv.transforms\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   BaseTransform\n   TestTimeAug\n\nLoading\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   LoadAnnotations\n   LoadImageFromFile\n\nProcessing\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   CenterCrop\n   MultiScaleFlipAug\n   Normalize\n   Pad\n   RandomChoiceResize\n   RandomFlip\n   RandomGrayscale\n   RandomResize\n   Resize\n   ToTensor\n   ImageToTensor\n\nWrapper\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   Compose\n   KeyMapper\n   RandomApply\n   RandomChoice\n   TransformBroadcaster\n"
  },
  {
    "path": "docs/en/api/utils.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.utils\n===================================\n\n.. contents:: mmcv.utils\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.utils\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   IS_CUDA_AVAILABLE\n   IS_MLU_AVAILABLE\n   IS_MPS_AVAILABLE\n   collect_env\n   jit\n   skip_no_elena\n"
  },
  {
    "path": "docs/en/api/video.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.video\n===================================\n\n.. contents:: mmcv.video\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.video\n\nIO\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   VideoReader\n   Cache\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   frames2video\n\nOptical Flow\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   dequantize_flow\n   flow_from_bytes\n   flow_warp\n   flowread\n   flowwrite\n   quantize_flow\n   sparse_flow_from_bytes\n\nVideo Processing\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   concat_video\n   convert_video\n   cut_video\n   resize_video\n"
  },
  {
    "path": "docs/en/api/visualization.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.visualization\n===================================\n\n.. contents:: mmcv.visualization\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.visualization\n\nColor\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   Color\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   color_val\n\nImage\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   imshow\n   imshow_bboxes\n   imshow_det_bboxes\n\nOptical Flow\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   flow2rgb\n   flowshow\n   make_color_wheel\n"
  },
  {
    "path": "docs/en/community/contributing.md",
    "content": "## Contributing to OpenMMLab\n\nWelcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to\n\n**Fix bug**\n\nYou can directly post a Pull Request to fix typo in code or documents\n\nThe steps to fix the bug of code implementation are as follows.\n\n1. If the modification involve significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose an proper solution.\n\n2. Posting a pull request after fixing the bug and adding corresponding unit test.\n\n**New Feature or Enhancement**\n\n1. If the modification involve significant changes, you should create an issue to discuss with our developers to propose an proper design.\n2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test.\n\n**Document**\n\nYou can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable.\n\n### Pull Request Workflow\n\nIf you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)\n\n#### 1. 
Fork and clone\n\nIf you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile.\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png\" width=\"1200\">\n\nThen, you can clone the repositories to local:\n\n```shell\ngit clone git@github.com:{username}/mmcv.git\n```\n\nAfter that, you should add the official repository as the upstream repository\n\n```bash\ngit remote add upstream git@github.com:open-mmlab/mmcv\n```\n\nCheck whether remote repository has been added successfully by `git remote -v`\n\n```bash\norigin\tgit@github.com:{username}/mmcv.git (fetch)\norigin\tgit@github.com:{username}/mmcv.git (push)\nupstream\tgit@github.com:open-mmlab/mmcv (fetch)\nupstream\tgit@github.com:open-mmlab/mmcv (push)\n```\n\n```{note}\nHere's a brief introduction to origin and upstream. When we use \"git clone\", we create an \"origin\" remote by default, which points to the repository cloned from. As for \"upstream\", we add it ourselves to point to the target repository. Of course, if you don't like the name \"upstream\", you could name it as you wish. Usually, we'll push the code to \"origin\". If the pushed code conflicts with the latest code in official(\"upstream\"), we should pull the latest code from upstream to resolve the conflicts, and then push to \"origin\" again. The posted Pull Request will be updated automatically.\n```\n\n#### 2. Configure pre-commit\n\nYou should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. 
**Note**: The following code should be executed under the MMCV directory.\n\n```shell\npip install -U pre-commit\npre-commit install\n```\n\nCheck that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`.\n\n```shell\npre-commit run --all-files\n```\n\n<img src=\"https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png\" width=\"1200\">\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png\" width=\"1200\">\n\n```{note}\nChinese users may fail to download the pre-commit hooks due to the network issue. In this case, you could download these hooks from gitee by setting the .pre-commit-config-zh-cn.yaml\n\npre-commit install -c .pre-commit-config-zh-cn.yaml\npre-commit run --all-files -c .pre-commit-config-zh-cn.yaml\n```\n\nIf the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation.\n\nIf the code does not conform to the code style specification, pre-commit will raise a warning and fix some of the errors automatically.\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png\" width=\"1200\">\n\nIf we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option (**only for temporary commits**).\n\n```shell\ngit commit -m \"xxx\" --no-verify\n```\n\n#### 3. Create a development branch\n\nAfter configuring the pre-commit, we should create a branch based on the main branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name`\n\n```shell\ngit checkout -b yhc/refactor_contributing_doc\n```\n\nIn subsequent development, if the main branch of the local repository is behind the main branch of \"upstream\", we need to pull the upstream for synchronization, and then execute the above command:\n\n```shell\ngit pull upstream main\n```\n\n#### 4. 
Commit the code and pass the unit test\n\n- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html).\n\n- The committed code should pass through the unit test\n\n  ```shell\n  # Pass all unit tests\n  pytest tests\n\n  # Pass the unit test of runner\n  pytest tests/test_runner/test_runner.py\n  ```\n\n  If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test)\n\n- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering)\n\n#### 5. Push the code to remote\n\nWe could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option.\n\n```shell\ngit push -u origin {branch_name}\n```\n\nThis will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository.\n\n#### 6. 
Create a Pull Request\n\n(1) Create a pull request in GitHub's Pull request interface\n\n<img src=\"https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png\" width=\"1200\">\n\n(2) Modify the PR description according to the guidelines so that other developers can better understand your changes\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png\" width=\"1200\">\n\nFind more details about Pull Request description in [pull request guidelines](#pr-specs).\n\n**note**\n\n(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))\n\n(b) If it is your first contribution, please sign the CLA\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png\" width=\"1200\">\n\n(c) Check whether the Pull Request passes the CI\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png\" width=\"1200\">\n\nMMCV will run unit test for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code.\n\n(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. 
Then, we will merge it ASAP.\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png\" width=\"1200\">\n\n#### 7. Resolve conflicts\n\nIf your local branch conflicts with the latest main branch of \"upstream\", you'll need to resolve them. There are two ways to do this:\n\n```shell\ngit fetch --all --prune\ngit rebase upstream/main\n```\n\nor\n\n```shell\ngit fetch --all --prune\ngit merge upstream/main\n```\n\nIf you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts.\n\n### Guidance\n\n#### Unit test\n\nIf you cannot run the unit test of some modules for lack of some dependencies, such as [video](https://github.com/open-mmlab/mmcv/tree/main/mmcv/video) module, you can try to install the following dependencies:\n\n```shell\n# Linux\nsudo apt-get update -y\nsudo apt-get install -y libturbojpeg\nsudo apt-get install -y ffmpeg\n\n# Windows\nconda install ffmpeg\n```\n\nWe should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test:\n\n```shell\npython -m coverage run -m pytest /path/to/test_file\npython -m coverage html\n# check file in htmlcov/index.html\n```\n\n#### Document rendering\n\nIf the documents are modified/added, we should check the rendering result. 
We could install the dependencies and run the following command to render the documents and check the results:\n\n```shell\npip install -r requirements/docs.txt\ncd docs/zh_cn/\n# or docs/en\nmake html\n# check file in ./docs/zh_cn/_build/html/index.html\n```\n\n### Code style\n\n#### Python\n\nWe adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.\n\nWe use the following tools for linting and formatting:\n\n- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.\n- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.\n- [yapf](https://github.com/google/yapf): A formatter for Python files.\n- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.\n- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.\n- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.\n\nStyle configurations of yapf and isort can be found in [setup.cfg](./setup.cfg).\n\nWe use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`,\nfixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit.\nThe config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml).\n\n#### C++ and CUDA\n\nWe follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).\n\n### PR Specs\n\n1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style\n\n2. One short-time branch should be matched with only one PR\n\n3. Accomplish a detailed change in one PR. 
Avoid large PR\n\n   - Bad: Support Faster R-CNN\n   - Acceptable: Add a box head to Faster R-CNN\n   - Good: Add a parameter to box head to support custom conv-layer number\n\n4. Provide clear and significant commit message\n\n5. Provide clear and meaningful PR description\n\n   - Task name should be clarified in title. The general format is: \\[Prefix\\] Short description of the PR (Suffix)\n   - Prefix: add new feature \\[Feature\\], fix bug \\[Fix\\], related to documents \\[Docs\\], in developing \\[WIP\\] (which will not be reviewed temporarily)\n   - Introduce main changes, results and influences on other modules in short description\n   - Associate related issues and pull requests with a milestone\n"
  },
  {
    "path": "docs/en/community/pr.md",
    "content": "## Pull Request (PR)\n\nContent has been migrated to [contributing guidance](contributing.md).\n"
  },
  {
    "path": "docs/en/compatibility.md",
    "content": "### v2.0.0\n\nThe OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process.\n\nThe OpenMMLab team released MMCV v2.0.0 on April 6, 2023. In the 2.x version, it has the following significant changes:\n\n(1) It removed the following components:\n\n- `mmcv.fileio` module, removed in PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179). FileIO module from mmengine will be used wherever required.\n- `mmcv.runner`, `mmcv.parallel`, `mmcv. engine` and `mmcv.device`, removed in PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216).\n- All classes in `mmcv.utils` (eg `Config` and `Registry`) and many functions, removed in PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217). Only a few functions related to mmcv are reserved.\n- `mmcv.onnx`, `mmcv.tensorrt` modules and related functions, removed in PR [#2225](https://github.com/open-mmlab/mmcv/pull/2225).\n- Removed all root registrars in MMCV and registered classes or functions to the [root registrar](https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py) in MMEngine.\n\n(2) It added the [`mmcv.transforms`](https://github.com/open-mmlab/mmcv/tree/main/mmcv/transforms) data transformation module.\n\n(3) It renamed the package name **mmcv** to **mmcv-lite** and **mmcv-full** to **mmcv** in PR [#2235](https://github.com/open-mmlab/mmcv/pull/2235). 
Also, change the default value of the environment variable `MMCV_WITH_OPS` from 0 to 1.\n\n<table class=\"docutils\">\n<thead>\n  <tr>\n    <th align=\"center\">MMCV < 2.0</th>\n    <th align=\"center\">MMCV >= 2.0 </th>\n  </tr>\n</thead>\n<tbody>\n  <tr>\n  <td valign=\"top\">\n\n```bash\n# Contains ops, because the highest version of mmcv-full is less than 2.0.0, so there is no need to add version restrictions\npip install openmim\nmim install mmcv-full\n\n# do not contain ops\npip install openmim\nmim install \"mmcv < 2.0.0\"\n```\n\n</td>\n  <td valign=\"top\">\n\n```bash\n# Contains ops\npip install openmim\nmim install mmcv\n\n# Ops are not included, because the starting version of mmcv-lite is 2.0.0rc1, so there is no need to add version restrictions\npip install openmim\nmim install mmcv-lite\n```\n\n</td>\n</tr>\n</tbody>\n</table>\n\n### v1.3.18\n\nSome ops have different implementations on different devices. Lots of macros and type checks are scattered in several files, which makes the code hard to maintain. 
For example:\n\n```c++\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(rois);\n    CHECK_CUDA_INPUT(output);\n    CHECK_CUDA_INPUT(argmax_y);\n    CHECK_CUDA_INPUT(argmax_x);\n\n    roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,\n                           aligned_height, aligned_width, spatial_scale,\n                           sampling_ratio, pool_mode, aligned);\n#else\n    AT_ERROR(\"RoIAlign is not compiled with GPU support\");\n#endif\n  } else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(rois);\n    CHECK_CPU_INPUT(output);\n    CHECK_CPU_INPUT(argmax_y);\n    CHECK_CPU_INPUT(argmax_x);\n    roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,\n                          aligned_height, aligned_width, spatial_scale,\n                          sampling_ratio, pool_mode, aligned);\n  }\n```\n\nRegistry and dispatcher are added to manage these implementations.\n\n```c++\n\nvoid ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned);\n\nvoid roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignForwardCUDAKernelLauncher(\n      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,\n      spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\n// register cuda implementation\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n          
                  Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);\n\n// roi_align.cpp\n// use the dispatcher to invoke different implementation depending on device type of input tensors.\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,\n                       argmax_x, aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, pool_mode, aligned);\n}\n\n```\n\n### v1.3.11\n\nIn order to flexibly support more backends and hardwares like `NVIDIA GPUs` and `AMD GPUs`, the directory of `mmcv/ops/csrc` is refactored. Note that this refactoring will not affect the usage in API. 
For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206).\n\nThe original directory was organized as follows.\n\n```\n.\n├── common_cuda_helper.hpp\n├── ops_cuda_kernel.cuh\n├── pytorch_cpp_helper.hpp\n├── pytorch_cuda_helper.hpp\n├── parrots_cpp_helper.hpp\n├── parrots_cuda_helper.hpp\n├── parrots_cudawarpfunction.cuh\n├── onnxruntime\n│   ├── onnxruntime_register.h\n│   ├── onnxruntime_session_options_config_keys.h\n│   ├── ort_mmcv_utils.h\n│   ├── ...\n│   ├── onnx_ops.h\n│   └── cpu\n│       ├── onnxruntime_register.cpp\n│       ├── ...\n│       └── onnx_ops_impl.cpp\n├── parrots\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_cuda.cu\n│   ├── ops_parrots.cpp\n│   └── ops_pytorch.h\n├── pytorch\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_cuda.cu\n│   ├── pybind.cpp\n└── tensorrt\n    ├── trt_cuda_helper.cuh\n    ├── trt_plugin_helper.hpp\n    ├── trt_plugin.hpp\n    ├── trt_serialize.hpp\n    ├── ...\n    ├── trt_ops.hpp\n    └── plugins\n        ├── trt_cuda_helper.cu\n        ├── trt_plugin.cpp\n        ├── ...\n        ├── trt_ops.cpp\n        └── trt_ops_kernel.cu\n```\n\nAfter refactored, it is organized as follows.\n\n```\n.\n├── common\n│   ├── box_iou_rotated_utils.hpp\n│   ├── parrots_cpp_helper.hpp\n│   ├── parrots_cuda_helper.hpp\n│   ├── pytorch_cpp_helper.hpp\n│   ├── pytorch_cuda_helper.hpp\n│   └── cuda\n│       ├── common_cuda_helper.hpp\n│       ├── parrots_cudawarpfunction.cuh\n│       ├── ...\n│       └── ops_cuda_kernel.cuh\n├── onnxruntime\n│   ├── onnxruntime_register.h\n│   ├── onnxruntime_session_options_config_keys.h\n│   ├── ort_mmcv_utils.h\n│   ├── ...\n│   ├── onnx_ops.h\n│   └── cpu\n│       ├── onnxruntime_register.cpp\n│       ├── ...\n│       └── onnx_ops_impl.cpp\n├── parrots\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_parrots.cpp\n│   └── ops_pytorch.h\n├── pytorch\n│   ├── info.cpp\n│   ├── pybind.cpp\n│   ├── ...\n│   ├── ops.cpp\n│   └── cuda\n│       ├── ...\n│       └── ops_cuda.cu\n└── 
tensorrt\n    ├── trt_cuda_helper.cuh\n    ├── trt_plugin_helper.hpp\n    ├── trt_plugin.hpp\n    ├── trt_serialize.hpp\n    ├── ...\n    ├── trt_ops.hpp\n    └── plugins\n        ├── trt_cuda_helper.cu\n        ├── trt_plugin.cpp\n        ├── ...\n        ├── trt_ops.cpp\n        └── trt_ops_kernel.cu\n```\n"
  },
  {
    "path": "docs/en/conf.py",
    "content": "#\n# Configuration file for the Sphinx documentation builder.\n#\n# This file does only contain a selection of the most common options. For a\n# full list see the documentation:\n# http://www.sphinx-doc.org/en/master/config\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport sys\n\nimport pytorch_sphinx_theme\nfrom sphinx.builders.html import StandaloneHTMLBuilder\n\nsys.path.insert(0, os.path.abspath('../..'))\n\nversion_file = '../../mmcv/version.py'\nwith open(version_file) as f:\n    exec(compile(f.read(), version_file, 'exec'))\n__version__ = locals()['__version__']\n\n# -- Project information -----------------------------------------------------\n\nproject = 'mmcv'\ncopyright = '2018-2022, OpenMMLab'\nauthor = 'MMCV Authors'\n\n# The short X.Y version\nversion = __version__\n# The full version, including alpha/beta/rc tags\nrelease = __version__\n\n# -- General configuration ---------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\n#\n# needs_sphinx = '1.0'\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\n\nextensions = [\n    'sphinx.ext.autodoc',\n    'sphinx.ext.autosummary',\n    'sphinx.ext.intersphinx',\n    'sphinx.ext.napoleon',\n    'sphinx.ext.viewcode',\n    'sphinx_markdown_tables',\n    'myst_parser',\n    'sphinx_copybutton',\n]  # yapf: disable\n\nmyst_heading_anchors = 4\n\nmyst_enable_extensions = ['colon_fence']\n\n# Configuration for intersphinx\nintersphinx_mapping = {\n    'python': ('https://docs.python.org/3', None),\n    'numpy': ('https://numpy.org/doc/stable', None),\n    'torch': ('https://pytorch.org/docs/stable/', None),\n    'mmengine': ('https://mmengine.readthedocs.io/en/latest', None),\n}\n\nautodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision']\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix(es) of source filenames.\n# You can specify multiple suffix as a list of string:\n#\nsource_suffix = {\n    '.rst': 'restructuredtext',\n    '.md': 'markdown',\n}\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = None\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  
See the documentation for\n# a list of builtin themes.\n#\n# html_theme = 'sphinx_rtd_theme'\nhtml_theme = 'pytorch_sphinx_theme'\nhtml_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  For a list of options available for each theme, see the\n# documentation.\n#\nhtml_theme_options = {\n    'menu': [\n        {\n            'name': 'GitHub',\n            'url': 'https://github.com/open-mmlab/mmcv'\n        },\n    ],\n    # Specify the language of shared menu\n    'menu_lang': 'en',\n}\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = ['_static']\nhtml_css_files = ['css/readthedocs.css']\n\n# Custom sidebar templates, must be a dictionary that maps document names\n# to template names.\n#\n# The default sidebars (for documents that don't match any pattern) are\n# defined by theme itself.  Builtin themes are using these templates by\n# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',\n# 'searchbox.html']``.\n#\n# html_sidebars = {}\n\n# -- Options for HTMLHelp output ---------------------------------------------\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = 'mmcvdoc'\n\n# -- Options for LaTeX output ------------------------------------------------\n\nlatex_elements = {\n    # The paper size ('letterpaper' or 'a4paper').\n    #\n    # 'papersize': 'letterpaper',\n\n    # The font size ('10pt', '11pt' or '12pt').\n    #\n    # 'pointsize': '10pt',\n\n    # Additional stuff for the LaTeX preamble.\n    #\n    # 'preamble': '',\n\n    # Latex figure (float) alignment\n    #\n    # 'figure_align': 'htbp',\n}\n\n# Grouping the document tree into LaTeX files. 
List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n    (master_doc, 'mmcv.tex', 'mmcv Documentation', 'MMCV Contributors',\n     'manual'),\n]\n\n# -- Options for manual page output ------------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [(master_doc, 'mmcv', 'mmcv Documentation', [author], 1)]\n\n# -- Options for Texinfo output ----------------------------------------------\n\n# Grouping the document tree into Texinfo files. List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    (master_doc, 'mmcv', 'mmcv Documentation', author, 'mmcv',\n     'One line description of project.', 'Miscellaneous'),\n]\n\n# -- Options for Epub output -------------------------------------------------\n\n# Bibliographic Dublin Core info.\nepub_title = project\n\n# The unique identifier of the text. This can be a ISBN number\n# or the project homepage.\n#\n# epub_identifier = ''\n\n# A unique identification for the text.\n#\n# epub_uid = ''\n\n# A list of files that should not be packed into the epub file.\nepub_exclude_files = ['search.html']\n\n# set priority when building html\nStandaloneHTMLBuilder.supported_image_types = [\n    'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'\n]\n# -- Extension configuration -------------------------------------------------\n# Ignore >>> when copying code\ncopybutton_prompt_text = r'>>> |\\.\\.\\. '\ncopybutton_prompt_is_regexp = True\n"
  },
  {
    "path": "docs/en/deployment/mmcv_ops_definition.md",
    "content": "# MMCV Operators\n\nTo make custom operators in MMCV more standard, precise definitions of each operator are listed in this document.\n\n<!-- TOC -->\n\n- [MMCV Operators](#mmcv-operators)\n  - [MMCVBorderAlign](#mmcvborderalign)\n    - [Description](#description)\n    - [Parameters](#parameters)\n    - [Inputs](#inputs)\n    - [Outputs](#outputs)\n    - [Type Constraints](#type-constraints)\n  - [MMCVCARAFE](#mmcvcarafe)\n    - [Description](#description-1)\n    - [Parameters](#parameters-1)\n    - [Inputs](#inputs-1)\n    - [Outputs](#outputs-1)\n    - [Type Constraints](#type-constraints-1)\n  - [MMCVCAWeight](#mmcvcaweight)\n    - [Description](#description-2)\n    - [Parameters](#parameters-2)\n    - [Inputs](#inputs-2)\n    - [Outputs](#outputs-2)\n    - [Type Constraints](#type-constraints-2)\n  - [MMCVCAMap](#mmcvcamap)\n    - [Description](#description-3)\n    - [Parameters](#parameters-3)\n    - [Inputs](#inputs-3)\n    - [Outputs](#outputs-3)\n    - [Type Constraints](#type-constraints-3)\n  - [MMCVCornerPool](#mmcvcornerpool)\n    - [Description](#description-4)\n    - [Parameters](#parameters-4)\n    - [Inputs](#inputs-4)\n    - [Outputs](#outputs-4)\n    - [Type Constraints](#type-constraints-4)\n  - [MMCVDeformConv2d](#mmcvdeformconv2d)\n    - [Description](#description-5)\n    - [Parameters](#parameters-5)\n    - [Inputs](#inputs-5)\n    - [Outputs](#outputs-5)\n    - [Type Constraints](#type-constraints-5)\n  - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d)\n    - [Description](#description-6)\n    - [Parameters](#parameters-6)\n    - [Inputs](#inputs-6)\n    - [Outputs](#outputs-6)\n    - [Type Constraints](#type-constraints-6)\n  - [MMCVDeformRoIPool](#mmcvdeformroipool)\n    - [Description](#description-7)\n    - [Parameters](#parameters-7)\n    - [Inputs](#inputs-7)\n    - [Outputs](#outputs-7)\n    - [Type Constraints](#type-constraints-7)\n  - [MMCVMaskedConv2d](#mmcvmaskedconv2d)\n    - 
[Description](#description-8)\n    - [Parameters](#parameters-8)\n    - [Inputs](#inputs-8)\n    - [Outputs](#outputs-8)\n    - [Type Constraints](#type-constraints-8)\n  - [MMCVPSAMask](#mmcvpsamask)\n    - [Description](#description-9)\n    - [Parameters](#parameters-9)\n    - [Inputs](#inputs-9)\n    - [Outputs](#outputs-9)\n    - [Type Constraints](#type-constraints-9)\n  - [NonMaxSuppression](#nonmaxsuppression)\n    - [Description](#description-10)\n    - [Parameters](#parameters-10)\n    - [Inputs](#inputs-10)\n    - [Outputs](#outputs-10)\n    - [Type Constraints](#type-constraints-10)\n  - [MMCVRoIAlign](#mmcvroialign)\n    - [Description](#description-11)\n    - [Parameters](#parameters-11)\n    - [Inputs](#inputs-11)\n    - [Outputs](#outputs-11)\n    - [Type Constraints](#type-constraints-11)\n  - [MMCVRoIAlignRotated](#mmcvroialignrotated)\n    - [Description](#description-12)\n    - [Parameters](#parameters-12)\n    - [Inputs](#inputs-12)\n    - [Outputs](#outputs-12)\n    - [Type Constraints](#type-constraints-12)\n  - [grid_sampler\\*](#grid_sampler)\n    - [Description](#description-13)\n    - [Parameters](#parameters-13)\n    - [Inputs](#inputs-13)\n    - [Outputs](#outputs-13)\n    - [Type Constraints](#type-constraints-13)\n  - [cummax\\*](#cummax)\n    - [Description](#description-14)\n    - [Parameters](#parameters-14)\n    - [Inputs](#inputs-14)\n    - [Outputs](#outputs-14)\n    - [Type Constraints](#type-constraints-14)\n  - [cummin\\*](#cummin)\n    - [Description](#description-15)\n    - [Parameters](#parameters-15)\n    - [Inputs](#inputs-15)\n    - [Outputs](#outputs-15)\n    - [Type Constraints](#type-constraints-15)\n  - [Reminders](#reminders)\n\n<!-- TOC -->\n\n## MMCVBorderAlign\n\n### Description\n\nApplies `border_align` over the input feature based on predicted bboxes.\n\nFor each border line (e.g. 
top, left, bottom or right) of each box,\nborder_align does the following:\n\n- uniformly samples `pool_size`+1 positions on this line, involving the start and end points.\n- the corresponding features on these points are computed by bilinear interpolation.\n- max pooling over all the `pool_size`+1 positions is used for computing the pooled feature.\n\nRead [BorderDet: Border Feature for Dense Object Detection](https://arxiv.org/abs/2007.11056) for more detailed information.\n\n### Parameters\n\n| Type  | Parameter   | Description                                                                         |\n| ----- | ----------- | ----------------------------------------------------------------------------------- |\n| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Features with shape [N,4C,H,W]. Channels ranged in [0,C), [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, right features respectively</dd>\n<dt><tt>boxes</tt>: T</dt>\n<dd>Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Pooled features with shape [N,C,H*W,4]. 
The order is(top,left,bottom,right) for the last dimension.</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVCARAFE\n\n### Description\n\nCARAFE operator performs feature upsampling.\n\nRead [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188) for more detailed information.\n\n### Parameters\n\n| Type    | Parameter      | Description                                   |\n| ------- | -------------- | --------------------------------------------- |\n| `int`   | `kernel_size`  | reassemble kernel size, should be odd integer |\n| `int`   | `group_size`   | reassemble group size                         |\n| `float` | `scale_factor` | upsample ratio(>=1)                           |\n\n### Inputs\n\n<dl>\n<dt><tt>features</tt>: T</dt>\n<dd>Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.</dd>\n<dt><tt>masks</tt>: T</dt>\n<dd>The input mask</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>The upsampled features. 4-D tensor of shape (N, C, H * scale_factor, W * scale_factor). 
N is the batch size.</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVCAWeight\n\n### Description\n\nOperator for Criss-Cross Attention.\n\nRead [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/pdf/1811.11721.pdf) for more detailed information.\n\n### Parameters\n\nNone\n\n### Inputs\n\n<dl>\n<dt><tt>t</tt>: T</dt>\n<dd>The query matrix of shape (N, C', H, W).</dd>\n<dt><tt>f</tt>: T</dt>\n<dd>The key matrix of shape (N, C', H, W).</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>weight</tt>: T</dt>\n<dd>The attention map of shape (N, H+W-1, H, W).</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVCAMap\n\n### Description\n\nOperator for Criss-Cross Attention.\n\nRead [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/pdf/1811.11721.pdf) for more detailed information.\n\n### Parameters\n\nNone\n\n### Inputs\n\n<dl>\n<dt><tt>weight</tt>: T</dt>\n<dd>Output from the operator MMCVCAWeight.</dd>\n<dt><tt>value</tt>: T</dt>\n<dd>The value matrix of shape (N, C, H, W).</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output tensor of aggregated contextual information</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVCornerPool\n\n### Description\n\nPerform CornerPool on `input` features. Read [CornerNet -- Detecting Objects as Paired Keypoints](https://arxiv.org/abs/1808.01244) for more details.\n\n### Parameters\n\n| Type  | Parameter | Description                                                      |\n| ----- | --------- | ---------------------------------------------------------------- |\n| `int` | `mode`    | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>The pooled features. 
4-D tensor of shape (N, C, H, W).</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVDeformConv2d\n\n### Description\n\nApplies a deformable 2D convolution over an input signal composed of several input planes.\n\nRead [Deformable Convolutional Networks](https://arxiv.org/pdf/1703.06211.pdf) for detail.\n\n### Parameters\n\n| Type           | Parameter           | Description                                                                                                       |\n| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- |\n| `list of ints` | `stride`            | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`.                                              |\n| `list of ints` | `padding`           | Paddings on both sides of the input, (padH, padW).  Defaults to `(0, 0)`.                                         |\n| `list of ints` | `dilation`          | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`.                                               |\n| `int`          | `groups`            | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`.            |\n| `int`          | `deformable_groups` | Groups of deformable offset. Defaults to `1`.                                                                     |\n| `int`          | `bias`              | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. |\n| `int`          | `im2col_step`       | Groups of deformable offset. Defaults to `32`.                                                                    
|\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.</dd>\n<dt><tt>offset</tt>: T</dt>\n<dd>Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW is the height and width of offset and output.</dd>\n<dt><tt>weight</tt>: T</dt>\n<dd>Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output feature; 4-D tensor of shape (N, output_channel, outH, outW).</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32, Linear)\n\n## MMCVModulatedDeformConv2d\n\n### Description\n\nPerform Modulated Deformable Convolution on input feature, read [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline) for detail.\n\n### Parameters\n\n| Type           | Parameter           | Description                                                                           |\n| -------------- | ------------------- | ------------------------------------------------------------------------------------- |\n| `list of ints` | `stride`            | The stride of the convolving kernel. (sH, sW)                                         |\n| `list of ints` | `padding`           | Paddings on both sides of the input. (padH, padW)                                     |\n| `list of ints` | `dilation`          | The spacing between kernel elements. (dH, dW)                                         |\n| `int`          | `deformable_groups` | Groups of deformable offset.                                                          |\n| `int`          | `groups`            | Split input into groups. `input_channel` should be divisible by the number of groups. 
|\n\n### Inputs\n\n<dl>\n<dt><tt>feature</tt>: T</dt>\n<dd>Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.</dd>\n<dt><tt>offset</tt>: T</dt>\n<dd>Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.</dd>\n<dt><tt>mask</tt>: T</dt>\n<dd>Input mask; 4-D tensor of shape (N, deformable_group* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.</dd>\n<dt><tt>weight</tt>: T</dt>\n<dd>Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).</dd>\n<dt><tt>bias</tt>: T, optional</dt>\n<dd>Input bias; 1-D tensor of shape (output_channel).</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output feature; 4-D tensor of shape (N, output_channel, outH, outW).</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32, Linear)\n\n## MMCVDeformRoIPool\n\n### Description\n\nDeformable roi pooling layer\n\n### Parameters\n\n| Type    | Parameter        | Description                                                                                                   |\n| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |\n| `int`   | `output_height`  | height of output roi                                                                                          |\n| `int`   | `output_width`   | width of output roi                                                                                           |\n| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |\n| `int`   | `sampling_ratio` | number of input samples to take for each output sample. 
`0` means to take samples densely for current models. |\n| `float` | `gamma`          | gamma                                                                                                         |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>\n<dt><tt>rois</tt>: T</dt>\n<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of the input.</dd>\n<dt><tt>offset</tt>: T</dt>\n<dd>offset of height and width. Defaults to a tensor of zero</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>feat</tt>: T</dt>\n<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVMaskedConv2d\n\n### Description\n\nPerforms a masked 2D convolution from PixelRNN.\n\nRead [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759) for more detailed information.\n\n### Parameters\n\n| Type           | Parameter | Description                                                                      |\n| -------------- | --------- | -------------------------------------------------------------------------------- |\n| `list of ints` | `stride`  | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** |\n| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`.         
|\n\n### Inputs\n\n<dl>\n<dt><tt>features</tt>: T</dt>\n<dd>Input features; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>\n<dt><tt>mask</tt>: T</dt>\n<dd>Input mask; 3D tensor of shape (N, H, W)</dd>\n<dt><tt>weight</tt>: T</dt>\n<dd>The learnable weights of the module</dd>\n<dt><tt>bias</tt>: T</dt>\n<dd>The learnable bias of the module</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>The output convolved feature</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVPSAMask\n\n### Description\n\nAn operator from PSANet.\n\nRead [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://hszhao.github.io/papers/eccv18_psanet.pdf) for more detailed information.\n\n### Parameters\n\n| Type           | Parameter   | Description                                  |\n| -------------- | ----------- | -------------------------------------------- |\n| `int`          | `psa_type`  | `0` means collect and `1` means `distribute` |\n| `list of ints` | `mask_size` | The size of mask                             |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Input feature; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output tensor of shape (N, H * W, H, W)</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## NonMaxSuppression\n\n### Description\n\nFilter out boxes that have high IoU overlap with previously selected boxes or that have a low score. 
Output the indices of valid boxes.\n\nNote this definition is slightly different with [onnx: NonMaxSuppression](https://github.com/onnx/onnx/blob/main/docs/Operators.md#nonmaxsuppression)\n\n### Parameters\n\n| Type    | Parameter                    | Description                                                                                                                          |\n| ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |\n| `int`   | `center_point_box`           | 0 - the box data is supplied as \\[y1, x1, y2, x2\\], 1-the box data is supplied as \\[x_center, y_center, width, height\\].             |\n| `int`   | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. |\n| `float` | `iou_threshold`              | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \\[0, 1\\]. Default to 0.                   |\n| `float` | `score_threshold`            | The threshold for deciding when to remove boxes based on score.                                                                      |\n| `int`   | `offset`                     | 0 or 1, boxes' width or height is (x2 - x1 + offset).                                                                                |\n\n### Inputs\n\n<dl>\n<dt><tt>boxes</tt>: T</dt>\n<dd>Input boxes. 3-D tensor of shape (num_batches, spatial_dimension, 4).</dd>\n<dt><tt>scores</tt>: T</dt>\n<dd>Input scores. 3-D tensor of shape (num_batches, num_classes, spatial_dimension).</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>indices</tt>: tensor(int32, Linear)</dt>\n<dd>Selected indices. 
2-D tensor of shape (num_selected_indices, 3) as [[batch_index, class_index, box_index], ...].</dd>\n<dd>num_selected_indices=num_batches* num_classes* min(max_output_boxes_per_class, spatial_dimension).</dd>\n<dd>All invalid indices will be filled with -1.</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32, Linear)\n\n## MMCVRoIAlign\n\n### Description\n\nPerform RoIAlign on output feature, used in bbox_head of most two-stage detectors.\n\n### Parameters\n\n| Type    | Parameter        | Description                                                                                                   |\n| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |\n| `int`   | `output_height`  | height of output roi                                                                                          |\n| `int`   | `output_width`   | width of output roi                                                                                           |\n| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |\n| `int`   | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. |\n| `str`   | `mode`           | pooling mode in each bin. `avg` or `max`                                                                      |\n| `int`   | `aligned`        | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly.         |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the numbers of channels, H and W are the height and width of the data.</dd>\n<dt><tt>rois</tt>: T</dt>\n<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. 
The RoIs' coordinates are in the coordinate system of the input.</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>feat</tt>: T</dt>\n<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## MMCVRoIAlignRotated\n\n### Description\n\nPerform RoI align pooling for rotated proposals\n\n### Parameters\n\n| Type    | Parameter        | Description                                                                                                   |\n| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- |\n| `int`   | `output_height`  | height of output roi                                                                                          |\n| `int`   | `output_width`   | width of output roi                                                                                           |\n| `float` | `spatial_scale`  | used to scale the input boxes                                                                                 |\n| `int`   | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. |\n| `str`   | `mode`           | pooling mode in each bin. `avg` or `max`                                                                      |\n| `int`   | `aligned`        | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly.         |\n| `int`   | `clockwise`      | If `clockwise=1`, the angle of each RoI is clockwise in image space. Otherwise, it is counterclockwise.       
|\n\n### Inputs\n\n<dl>\n<dt><tt>features</tt>: T</dt>\n<dd>Input feature map; 4D tensor of shape (N, C, H, W)</dd>\n<dt><tt>rois</tt>: T</dt>\n<dd>RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are the coordinate system of input.</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dd>RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].<dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## grid_sampler\\*\n\n### Description\n\nPerform sample from `input` with pixel locations from `grid`.\n\nCheck [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html?highlight=grid_sample#torch.nn.functional.grid_sample) for more information.\n\n### Parameters\n\n| Type  | Parameter            | Description                                                                                                                                                                                                                                                                                     |\n| ----- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| `int` | `interpolation_mode` | Interpolation mode to calculate output values. (0: `bilinear` , 1: `nearest`)                                                                                                                                                                                                                   |\n| `int` | `padding_mode`       | Padding mode for outside grid values. 
(0: `zeros`, 1: `border`, 2: `reflection`)                                                                                                                                                                                                                |\n| `int` | `align_corners`      | If `align_corners=1`, the extrema (`-1` and `1`) are considered as referring to the center points of the input's corner pixels. If `align_corners=0`, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic. |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the numbers of channels, inH and inW are the height and width of the data.</dd>\n<dt><tt>grid</tt>: T</dt>\n<dd>Input offset; 4-D tensor of shape (N, outH, outW, 2), where outH and outW are the height and width of offset and output. </dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output feature; 4-D tensor of shape (N, C, outH, outW).</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32, Linear)\n\n## cummax\\*\n\n### Description\n\nReturns a tuple (`values`, `indices`) where `values` is the cumulative maximum elements of `input` in the dimension `dim`. And `indices` is the index location of each maximum value found in the dimension `dim`. Read [torch.cummax](https://pytorch.org/docs/stable/generated/torch.cummax.html) for more details.\n\n### Parameters\n\n| Type  | Parameter | Description                            |\n| ----- | --------- | -------------------------------------- |\n| `int` | `dim`     | the dimension to do the operation over |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>The input tensor with various shapes. 
Tensor with empty element is also supported.</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output the cumulative maximum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.</dd>\n<dt><tt>indices</tt>: tensor(int64)</dt>\n<dd>Output the index location of each cumulative maximum value found in the dimension `dim`, with the same shape as `input`.</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## cummin\\*\n\n### Description\n\nReturns a tuple (`values`, `indices`) where `values` is the cumulative minimum elements of `input` in the dimension `dim`. And `indices` is the index location of each minimum value found in the dimension `dim`. Read [torch.cummin](https://pytorch.org/docs/stable/generated/torch.cummin.html) for more details.\n\n### Parameters\n\n| Type  | Parameter | Description                            |\n| ----- | --------- | -------------------------------------- |\n| `int` | `dim`     | the dimension to do the operation over |\n\n### Inputs\n\n<dl>\n<dt><tt>input</tt>: T</dt>\n<dd>The input tensor with various shapes. Tensor with empty element is also supported.</dd>\n</dl>\n\n### Outputs\n\n<dl>\n<dt><tt>output</tt>: T</dt>\n<dd>Output the cumulative minimum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.</dd>\n<dt><tt>indices</tt>: tensor(int64)</dt>\n<dd>Output the index location of each cumulative minimum value found in the dimension `dim`, with the same shape as `input`.</dd>\n</dl>\n\n### Type Constraints\n\n- T:tensor(float32)\n\n## Reminders\n\n- Operators ending with `*` are defined in Torch and are included here for the conversion to ONNX.\n"
  },
  {
    "path": "docs/en/docutils.conf",
    "content": "[html writers]\ntable_style: colwidths-auto\n"
  },
  {
    "path": "docs/en/faq.md",
    "content": "## Frequently Asked Questions\n\nWe list some common troubles faced by many users and their corresponding solutions here.\nFeel free to enrich the list if you find any frequent issues and have ways to help others to solve them.\n\n### Installation\n\n- KeyError: \"xxx: 'yyy is not in the zzz registry'\"\n\n  The registry mechanism will be triggered only when the file of the module is imported.\n  So you need to import that file somewhere. More details can be found at [KeyError: \"MaskRCNN: 'RefineRoIHead is not in the models registry'\"](https://github.com/open-mmlab/mmdetection/issues/5974).\n\n- \"No module named 'mmcv.ops'\"; \"No module named 'mmcv.\\_ext'\"\n\n  1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`\n  2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) or [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html)\n\n- \"invalid device function\" or \"no kernel image is available for execution\"\n\n  1. Check the CUDA compute capability of your GPU\n  2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. The compatibility issue could happen when using old GPUs, e.g., Tesla K80 (3.7) on colab.\n  3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. For example, you may compile mmcv using CUDA 10.0 but run it on CUDA 9.0 environments\n\n- \"undefined symbol\" or \"cannot open xxx.so\"\n\n  1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check\n     whether the CUDA/GCC runtimes are the same as those used for compiling mmcv\n  2. If those symbols are PyTorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the PyTorch version is the same as that used for compiling mmcv\n  3. 
Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment\n\n- \"RuntimeError: CUDA error: invalid configuration argument\"\n\n  This error may be caused by the poor performance of the GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10)\n  and recompile mmcv.\n\n- \"RuntimeError: nms is not compiled with GPU support\"\n\n  This error is because your CUDA environment is not installed correctly.\n  You may try to re-install your CUDA environment and then delete the build/ folder before re-compiling mmcv.\n\n- \"Segmentation fault\"\n\n  1. Check your GCC version and use GCC >= 5.4. This is usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \\< 4.9 for PyTorch). We also recommend the users to avoid using GCC 5.5 because many users have reported that GCC 5.5 will cause \"segmentation fault\" and simply changing it to GCC 5.4 could solve the problem\n  2. Check whether PyTorch is correctly installed and could use CUDA op, e.g. type the following command in your terminal and see whether they could correctly output results\n     ```shell\n     python -c 'import torch; print(torch.cuda.is_available())'\n     ```\n  3. If PyTorch is correctly installed, check whether MMCV is correctly installed. If MMCV is correctly installed, then there will be no issue with the following command\n     ```shell\n     python -c 'import mmcv; import mmcv.ops'\n     ```\n  4. If MMCV and PyTorch are correctly installed, you can use `ipdb` to set breakpoints or directly add `print` to debug and see which part leads to the `segmentation fault`\n\n- \"libtorch_cuda_cu.so: cannot open shared object file\"\n\n  `mmcv-full` depends on the shared object but it cannot be found. 
We can check whether the object exists in `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` or try to re-install PyTorch.\n\n- \"fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version!\"\n\n  If you are building mmcv-full on Windows and the version of CUDA is 9.2, you will probably encounter the error `\"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2\\include\\crt/host_config.h(133): fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!\"`, in which case you can use a lower version of Microsoft Visual Studio like vs2017.\n\n- \"error: member \"torch::jit::detail::ModulePolicy::all_slots\" may not be initialized\"\n\n  If your version of PyTorch is 1.5.0 and you are building mmcv-full on Windows, you will probably encounter the error `- torch/csrc/jit/api/module.h(474): error: member \"torch::jit::detail::ModulePolicy::all_slots\" may not be initialized`. The way to solve the error is to replace all the `static constexpr bool all_slots = false;` with `static bool all_slots = false;` in the file `https://github.com/pytorch/pytorch/blob/v1.5.0/torch/csrc/jit/api/module.h`. More details can be found at [member \"torch::jit::detail::AttributePolicy::all_slots\" may not be initialized](https://github.com/pytorch/pytorch/issues/39394).\n\n- \"error: a member with an in-class initializer must be const\"\n\n  If your version of PyTorch is 1.6.0 and you are building mmcv-full on Windows, you will probably encounter the error `\"- torch/include\\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const\"`. The way to solve the error is to replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\\torch/csrc/jit/api/module.h`. 
More details can be found at [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575).\n\n- \"error: member \"torch::jit::ProfileOptionalOp::Kind\" may not be initialized\"\n\n  If your version of PyTorch is 1.7.0 and you are building mmcv-full on Windows, you will probably encounter the error `torch/include\\torch/csrc/jit/ir/ir.h(1347): error: member \"torch::jit::ProfileOptionalOp::Kind\" may not be initialized`. Solving the error requires modifying several local files of PyTorch:\n\n  - delete `static constexpr Symbol Kind = ::c10::prim::profile;` and `static constexpr Symbol Kind = ::c10::prim::profile_optional;` at `torch/include\\torch/csrc/jit/ir/ir.h`\n  - replace `explicit operator type&() { return *(this->value); }` with `explicit operator type&() { return *((type*)this->value); }` at `torch\\include\\pybind11\\cast.h`\n  - replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\\torch/csrc/jit/api/module.h`\n\n  More details can be found at [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956).\n\n- Compatibility issue between MMCV and MMDetection; \"ConvWS is already registered in conv layer\"\n\n  Please install the correct version of MMCV for the version of your MMDetection following the [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation).\n\n### Usage\n\n- \"RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one\"\n\n  1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. More details can be found at [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582).\n  2. 
You can set `find_unused_parameters = True` in the config to solve the above problems or find those unused parameters manually\n\n- \"RuntimeError: Trying to backward through the graph a second time\"\n\n  `GradientCumulativeOptimizerHook` and `OptimizerHook` are both set, which causes `loss.backward()` to be called twice, so a `RuntimeError` is raised. We can only use one of these. More details can be found at [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379).\n"
  },
  {
    "path": "docs/en/get_started/api_reference.md",
    "content": "# API reference table\n\nDue to the removal of the `mmcv.fileio`, `mmcv.runner`, `mmcv.parallel`, `mmcv.engine`, `mmcv.device` modules, and all classes and most of the functions in the `mmcv.utils` module during the upgrade from MMCV v1.x to MMCV v2.x, which were removed at PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179), PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216), PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217). Therefore, we provide the following API reference table to make it easier to quickly find the migrated interfaces.\n\n## Related issues, PRs and discussions\n\n- [Remove runner, parallel, engine and device](https://github.com/open-mmlab/mmcv/pull/2216)\n- [ImportError: cannot import name 'is_list_of' from 'mmcv.utils'](https://github.com/open-mmlab/mmcv/issues/2282)\n- [Could not find the files in MMengine which are removed in MMCV_v2x parallel. example, for DataContainer](https://github.com/open-mmlab/mmcv/issues/2934)\n- [mmcv.cnn.bricks.registry](https://github.com/open-mmlab/mmengine/discussions/1356)\n- [Replace mmcv's function and modules imported with mmengine's](https://github.com/open-mmlab/mmdetection/pull/8594)\n\n## `mmcv.fileio`\n\n| MMCV                                              | MMCV URL                                                                              | MMEngine                                                    | MMEngine URL                                                                                   |\n| ------------------------------------------------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |\n| mmcv.fileio.file_client.BaseStorageBackend        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | 
mmengine.fileio.backends.base.BaseStorageBackend            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/base.py              |\n| mmcv.fileio.file_client.CephBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             |                                                             |                                                                                                |\n| mmcv.fileio.file_client.PetrelBackend             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.petrel_backend.PetrelBackend       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/petrel_backend.py    |\n| mmcv.fileio.file_client.MemcachedBackend          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.memcached_backend.MemcachedBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/memcached_backend.py |\n| mmcv.fileio.file_client.LmdbBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.lmdb_backend.LmdbBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/lmdb_backend.py      |\n| mmcv.fileio.file_client.HardDiskBackend           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.HardDiskBackend                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |\n| mmcv.fileio.file_client.HTTPBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.http_backend.HTTPBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/http_backend.py      |\n| 
mmcv.fileio.file_client.FileClient                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.FileClient                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |\n| mmcv.fileio.io.load                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.load                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |\n| mmcv.fileio.io.dump                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.dump                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |\n| mmcv.fileio.io.\\_register_handler                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.\\_register_handler                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |\n| mmcv.fileio.io.register_handler                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.register_handler                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |\n| mmcv.fileio.parse.list_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.list_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |\n| mmcv.fileio.parse.dict_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.dict_from_file          
              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |\n| mmcv.fileio.handlers.base.BaseFileHandler         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/base.py           | mmengine.fileio.handlers.base.BaseFileHandler               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/base.py              |\n| mmcv.fileio.handlers.json_handler.set_default     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.set_default           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |\n| mmcv.fileio.handlers.json_handler.JsonHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.JsonHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |\n| mmcv.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/pickle_handler.py | mmengine.fileio.handlers.pickle_handler.PickleHandler       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/pickle_handler.py    |\n| mmcv.fileio.handlers.yaml_handler.YamlHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/yaml_handler.py   | mmengine.fileio.handlers.yaml_handler.YamlHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/yaml_handler.py      |\n\n## `mmcv.runner`\n\n| MMCV                                                                  | MMCV URL                                                                                    | MMEngine                                                                                                                                       | MMEngine URL                             
                                                                                                                                                              |\n| --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |\n| mmcv.runner.hooks.logger.base.LoggerHook                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/base.py             | mmengine.hooks.logger_hook.LoggerHook                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py                                                                                                                         |\n| mmcv.runner.hooks.logger.clearml.ClearMLLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/clearml.py          | Similar: mmengine.visualization.vis_backend.ClearMLVisBackend                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.dvclive.DvcliveLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/dvclive.py          | Similar: mmengine.visualization.vis_backend.DVCLiveVisBackend                                                                 
                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.mlflow.MlflowLoggerHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/mlflow.py           | Similar: mmengine.visualization.vis_backend.MLflowVisBackend                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.neptune.NeptuneLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/neptune.py          | Similar: mmengine.visualization.vis_backend.NeptuneVisBackend                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.pavi.PaviLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/pavi.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.logger.segmind.SegmindLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/segmind.py          |                                                                   
                                                                             |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.logger.tensorboard.TensorboardLoggerHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/tensorboard.py      | Similar: mmengine.visualization.vis_backend.TensorboardVisBackend                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.text.TextLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/text.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.logger.wandb.WandbLoggerHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/wandb.py            | Similar: mmengine.visualization.vis_backend.WandbVisBackend                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.checkpoint.CheckpointHook                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/checkpoint.py              | 
mmengine.hooks.checkpoint_hook.CheckpointHook                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |\n| mmcv.runner.hooks.closure.ClosureHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/closure.py                 |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.ema.EMAHook                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/ema.py                     | mmengine.hooks.ema_hook.EMAHook                                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py                                                                                                                            |\n| mmcv.runner.hooks.evaluation.EvalHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | Some features have been moved to: mmengine.hooks.checkpoint_hook.CheckpointHook                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |\n| mmcv.runner.hooks.evaluation.DistEvalHook                             | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | Some features have been moved to: mmengine.hooks.checkpoint_hook.CheckpointHook                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |\n| mmcv.runner.hooks.hook.HOOKS                                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.registry.root.HOOKS                                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.hooks.hook.Hook                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.hooks.hook.Hook                                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py                                                                                                                                |\n| mmcv.runner.hooks.iter_timer.IterTimerHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/iter_timer.py              | mmengine.hooks.iter_timer_hook.IterTimerHook                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py                                                                                                                     |\n| 
mmcv.runner.hooks.lr_updater.LrUpdaterHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.LRSchedulerMixin                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.FixedLrUpdaterHook                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ConstantLR                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.StepLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.StepLR                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.ExpLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ExponentialLR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                       
                                                       |\n| mmcv.runner.hooks.lr_updater.PolyLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.PolyLR                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.InvLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.CosineAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineAnnealingLR                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.FlatCosineAnnealingLrUpdaterHook         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                     
                                                                                                                   |\n| mmcv.runner.hooks.lr_updater.CosineRestartLrUpdaterHook               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.get_position_from_periods                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR.get_position_from_periods                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.CyclicLrUpdaterHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.OneCycleLrUpdaterHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR                                                                                               | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.LinearAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.annealing_cos                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\\_annealing_cos                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.annealing_linear                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\\_annealing_linear                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.format_param                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\\_format_param                   
                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.memory.EmptyCacheHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/memory.py                  | mmengine.hooks.empty_cache_hook.EmptyCacheHook                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py                                                                                                                    |\n| mmcv.runner.hooks.momentum_updater.MomentumUpdaterHook                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.MomentumSchedulerMixin                                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.StepMomentumUpdaterHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.StepMomentum                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.CosineAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | 
mmengine.optim.scheduler.momentum_scheduler.CosineAnnealingMomentum                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.LinearAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.CyclicMomentumUpdaterHook          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.OneCycleMomentumUpdaterHook        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.optimizer.OptimizerHook                             | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | mmengine.optim.optimizer.optimizer_wrapper.OptimWrapper                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py                                                                                                         |\n| mmcv.runner.hooks.optimizer.GradientCumulativeOptimizerHook           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | Moved to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                                |                                                                                                                                                                                                        |\n| 
mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | Moved to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.profiler.ProfilerHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/profiler.py                | mmengine.hooks.profiler_hook.ProfilerHook                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/profiler_hook.py                                                                                                                       |\n| mmcv.runner.hooks.sampler_seed.DistSamplerSeedHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sampler_seed.py            | mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py                                                            
                                                       |\n| mmcv.runner.hooks.sync_buffer.SyncBuffersHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sync_buffer.py             | mmengine.hooks.sync_buffer_hook.SyncBufferHook                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py                                                                                                                    |\n| mmcv.runner.optimizer.builder.OPTIMIZERS                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.registry.root.OPTIMIZERS                                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.optimizer.builder.OPTIMIZER_BUILDERS                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.builder.register_torch_optimizers               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.optim.optimizer.builder.register_torch_optimizers                                                                                     | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/builder.py                                                                                                                   |\n| mmcv.runner.optimizer.builder.TORCH_OPTIMIZERS                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.builder.build_optimizer_constructor             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.builder.build_optimizer                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.default_constructor.DefaultOptimizerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/default_constructor.py |                                                                                      
                                                          |                                                                                                                                                                                                        |\n| mmcv.runner.base_module.BaseModule                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.BaseModule                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_module.Sequential                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.Sequential                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_module.ModuleList                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleList                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_module.ModuleDict                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | 
mmengine.model.base_module.ModuleDict                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_runner.BaseRunner                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_runner.py                   | mmengine.runner.runner.Runner                                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py                                                                                                                             |\n| mmcv.runner.builder.RUNNERS                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNERS                                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.builder.RUNNER_BUILDERS                                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNER_CONSTRUCTORS                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.builder.build_runner_constructor                          | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.builder.build_runner                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.checkpoint.ENV_MMCV_HOME                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_MMENGINE_HOME                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.ENV_XDG_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_XDG_CACHE_HOME                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| 
mmcv.runner.checkpoint.DEFAULT_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.DEFAULT_CACHE_DIR                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_get_mmcv_home                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_get_mmengine_home                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_state_dict                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_state_dict                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_torchvision_models                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_torchvision_models                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                
                                                         |\n| mmcv.runner.checkpoint.get_external_models                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_external_models                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_mmcls_models                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_mmcls_models                                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_deprecated_model_names                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_deprecated_model_names                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_process_mmcls_checkpoint                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_process_mmcls_checkpoint                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py  
                                                                                                                       |\n| mmcv.runner.checkpoint.CheckpointLoader                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.CheckpointLoader                                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_local                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_local                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_http                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_http                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_pavi                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_pavi                                                                                                      | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_ceph                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_ceph                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_torchvision                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_torchvision                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_openmmlab                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_openmmlab                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_mmcls                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_mmcls                                           
                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_load_checkpoint                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_load_checkpoint                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_load_checkpoint_with_prefix                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_load_checkpoint_with_prefix                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_checkpoint                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.weights_to_cpu                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | 
mmengine.runner.checkpoint.weights_to_cpu                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_save_to_state_dict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_save_to_state_dict                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_state_dict                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_state_dict                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.save_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.save_checkpoint                                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.default_constructor.DefaultRunnerConstructor              | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/default_constructor.py           |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.\\_find_free_port                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.\\_is_free_port                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.init_dist                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.init_dist                                                                                                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| 
mmcv.runner.dist_utils.\\_init_dist_pytorch                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\\_init_dist_pytorch                                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.\\_init_dist_mpi                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\\_init_dist_mpi                                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.\\_init_dist_slurm                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\\_init_dist_slurm                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.get_dist_info                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.get_dist_info                                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                   
                                                             |\n| mmcv.runner.dist_utils.master_only                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.master_only                                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.allreduce_params                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.\\_allreduce_coalesced                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                                |                                                                              
                                                                                                                          |\n| mmcv.runner.epoch_based_runner.EpochBasedRunner                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            | mmengine.runner.loops.EpochBasedTrainLoop                                                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |\n| mmcv.runner.epoch_based_runner.Runner                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.fp16_utils.cast_tensor_type                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.auto_fp16                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.force_fp32                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.wrap_fp16_model                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.patch_norm_fp32                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and 
mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.patch_forward_method                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.LossScaler                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | Moved fp16-related to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.iter_based_runner.IterLoader                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.iter_based_runner.IterBasedRunner                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             | 
mmengine.runner.loops.IterBasedTrainLoop                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |\n| mmcv.runner.log_buffer.LogBuffer                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/log_buffer.py                    |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.priority.Priority                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.Priority                                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |\n| mmcv.runner.priority.get_priority                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.get_priority                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |\n| mmcv.runner.utils.get_host_info                                       | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.utils.get_time_str                                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.utils.obj_from_dict                                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                                |                                                                                                                                                                                                        |\n| mmcv.runner.utils.set_random_seed                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         | mmengine.runner.utils.set_random_seed                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/utils.py                                                                                                                              |\n\n## 
`mmcv.parallel`\n\n| MMCV                                                           | MMCV URL                                                                       | MMEngine                                                       | MMEngine URL                                                                              |\n| -------------------------------------------------------------- | ------------------------------------------------------------------------------ | -------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |\n| mmcv.parallel.\\_functions.scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |\n| mmcv.parallel.\\_functions.synchronize_stream                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |\n| mmcv.parallel.\\_functions.get_input_device                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |\n| mmcv.parallel.\\_functions.Scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                |                                                                                           |\n| mmcv.parallel.collate.collate                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/collate.py   
     |                                                                |                                                                                           |\n| mmcv.parallel.data_container.assert_tensor_type                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py |                                                                |                                                                                           |\n| mmcv.parallel.data_container.DataContainer                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | Similar: mmengine.structures.base_data_element.BaseDataElement | https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/base_data_element.py |\n| mmcv.parallel.data_parallel.MMDataParallel                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_parallel.py  |                                                                |                                                                                           |\n| mmcv.parallel.distributed.MMDistributedDataParallel            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py    | mmengine.model.wrappers.distributed.MMDistributedDataParallel  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |\n| mmcv.parallel.distributed_deprecated.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py    | mmengine.model.wrappers.distributed.MMDistributedDataParallel  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |\n| mmcv.parallel.registry.MODULE_WRAPPERS                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/registry.py       | mmengine.registry.root.MODEL_WRAPPERS                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                
|\n| mmcv.parallel.scatter_gather.scatter                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                |                                                                                           |\n| mmcv.parallel.scatter_gather.scatter_kwargs                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                |                                                                                           |\n| mmcv.parallel.utils.is_module_wrapper                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/utils.py          | mmengine.model.wrappers.utils.is_model_wrapper                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/utils.py         |\n\n## `mmcv.engine`\n\n| MMCV                                 | MMCV URL                                                           | MMEngine | MMEngine URL |\n| ------------------------------------ | ------------------------------------------------------------------ | -------- | ------------ |\n| mmcv.engine.test.single_gpu_test     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n| mmcv.engine.test.multi_gpu_test      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n| mmcv.engine.test.collect_results_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n| mmcv.engine.test.collect_results_gpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n\n## `mmcv.device`\n\n| MMCV                                      | MMCV URL                                                                     | MMEngine                         | MMEngine URL                                         
                     |\n| ----------------------------------------- | ---------------------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------- |\n| mmcv.device.ipu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/ipu               |                                  |                                                                           |\n| mmcv.device.mlu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mlu               |                                  |                                                                           |\n| mmcv.device.mps                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mps               |                                  |                                                                           |\n| mmcv.device.npu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/npu               |                                  |                                                                           |\n| mmcv.device.\\_functions.scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |\n| mmcv.device.\\_functions.Scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |\n| mmcv.device.scatter_gather.scatter        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |\n| mmcv.device.scatter_gather.scatter_kwargs | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |\n| mmcv.device.utils.get_device              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/utils.py          | mmengine.device.utils.get_device | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py |\n\n## `mmcv.utils`\n\n| MMCV                                                   | MMCV URL                                                                     | MMEngine                                                            | MMEngine URL                                                                                |\n| ------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |\n| mmcv.utils.config.BASE_KEY                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.BASE_KEY                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.DELETE_KEY                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DELETE_KEY                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.DEPRECATION_KEY                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DEPRECATION_KEY                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.ConfigDict      
                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.ConfigDict                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.add_args                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.add_args                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.Config                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.Config                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.DictAction                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DictAction                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.device_type.is_ipu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |\n| mmcv.utils.device_type.IS_IPU_AVAILABLE                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |\n| mmcv.utils.device_type.is_mlu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mlu_available                              | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.device_type.is_mps_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mps_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.device_type.is_npu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_npu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.hub.\\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |\n| mmcv.utils.hub.\\_legacy_zip_load                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\\_legacy_zip_load                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |\n| mmcv.utils.hub.load_url                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.load_url                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |\n| mmcv.utils.logging.logger_initialized                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |\n| mmcv.utils.logging.get_logger                          | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |\n| mmcv.utils.logging.print_log                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |\n| mmcv.utils.misc.\\_ntuple                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\\_ntuple                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_1tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_1tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_2tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_2tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_3tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_3tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_4tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_4tuple                                       | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_ntuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_ntuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_str                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_str                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.import_modules_from_strings            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.import_modules_from_strings                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.iter_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.iter_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.list_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.list_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.tuple_cast                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.tuple_cast                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_seq_of                              | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_seq_of                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_list_of                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_list_of                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_tuple_of                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_tuple_of                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.slice_list                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.slice_list                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.concat_list                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.concat_list                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.check_prerequisites                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.check_prerequisites                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.\\_check_py_package                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\\_check_py_package                              | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.\\_check_executable                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\\_check_executable                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.requires_package                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_package                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.requires_executable                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_executable                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.deprecated_api_warning                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.deprecated_api_warning                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_method_overridden                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_method_overridden                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.has_method                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.has_method                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.parrots_wrapper.TORCH_VERSION               | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.is_cuda_available           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.device.utils.is_cuda_available                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.parrots_wrapper.IS_CUDA_AVAILABLE           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py |                                                                     |                                                                                             |\n| mmcv.utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_cuda_home             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_cuda_home             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_conv                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_conv                  | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_dataloader            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_dataloader            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_extension             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_extension             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_pool                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_pool                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_norm                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_norm                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.path.is_filepath                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.is_filepath                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.fopen                                  | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.fopen                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.check_file_exist                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.check_file_exist                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.mkdir_or_exist                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.mkdir_or_exist                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.symlink                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.symlink                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.scandir                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.scandir                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.find_vcs_root                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.find_vcs_root                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.progressbar.ProgressBar                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.ProgressBar                              | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.track_progress                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_progress                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.init_pool                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.init_pool                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.track_parallel_progress         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_parallel_progress                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.track_iter_progress             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_iter_progress                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.registry.build_from_cfg                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.build_functions.build_from_cfg                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/build_functions.py       |\n| mmcv.utils.registry.Registry                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.registry.Registry                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/registry.py              |\n| mmcv.utils.seed.worker_init_fn                         | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/seed.py            | mmengine.dataset.utils.worker_init_fn                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/utils.py                  |\n| mmcv.utils.testing.check_python_script                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.check_python_script                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.\\_any                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.\\_any                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_dict_contains_subset         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_contains_subset                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_attrs_equal                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_attrs_equal                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_dict_has_keys                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_has_keys                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_keys_equal                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_keys_equal                          | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_is_norm_layer                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_is_norm_layer                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_params_all_zeros             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_params_all_zeros                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.timer.TimerError                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.TimerError                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.timer.Timer                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.Timer                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.timer.\\_g_timers                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.\\_g_timers                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.timer.check_time                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.check_time                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.torch_ops.\\_torch_version_meshgrid_indexing | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.\\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |\n| mmcv.utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |\n| mmcv.utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/trace.py           | mmengine.utils.dl_utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/trace.py           |\n| mmcv.utils.version_utils.digit_version                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.digit_version                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |\n| mmcv.utils.version_utils.\\_minimal_ext_cmd             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.\\_minimal_ext_cmd                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |\n| mmcv.utils.version_utils.get_git_hash                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.get_git_hash                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |\n\n## `mmcv.cnn`\n\n| MMCV                                         | MMCV URL                                                                 | MMEngine                                   | MMEngine URL                                      
                       |\n| -------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------------------ |\n| mmcv.cnn.utils.sync_bn.\\_BatchNormXd         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.\\_BatchNormXd         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |\n| mmcv.cnn.utils.sync_bn.revert_sync_batchnorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.revert_sync_batchnorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |\n\n## `mmcv.model_zoo`\n\n| MMCV                                 | MMCV URL                                                                            | MMEngine                           | MMEngine URL                                                                        |\n| ------------------------------------ | ----------------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------- |\n| mmcv.model_zoo.deprecated.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/deprecated.json       | mmengine.hub.deprecated.json       | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/deprecated.json       |\n| mmcv.model_zoo.mmcls.json            | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/mmcls.json            | mmengine.hub.mmcls.json            | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/mmcls.json            |\n| mmcv.model_zoo.open_mmlab.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/open_mmlab.json       | mmengine.hub.openmmlab.json        | 
https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/openmmlab.json        |\n| mmcv.model_zoo.torchvision_0.12.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/torchvision_0.12.json | mmengine.hub.torchvision_0.12.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/torchvision_0.12.json |\n"
  },
  {
    "path": "docs/en/get_started/build.md",
    "content": "## Build MMCV from source\n\n### Build mmcv\n\nBefore installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command\n\n```bash\npython -c 'import torch;print(torch.__version__)'\n```\n\nIf version information is output, then PyTorch is installed.\n\n```{note}\nIf you would like to use `opencv-python-headless` instead of `opencv-python`,\ne.g., in a minimum container environment or servers without GUI,\nyou can first install it before installing MMCV to skip the installation of `opencv-python`.\n```\n\n#### Build on Linux\n\n1. Clone the repo\n\n   ```bash\n   git clone https://github.com/open-mmlab/mmcv.git\n   cd mmcv\n   ```\n\n2. Install `ninja` and `psutil` to speed up the compilation\n\n   ```bash\n   pip install -r requirements/optional.txt\n   ```\n\n3. Check the nvcc version (requires 9.2+. Skip if no GPU available.)\n\n   ```bash\n   nvcc --version\n   ```\n\n   If the above command outputs the following message, it means that the nvcc setting is OK, otherwise you need to set CUDA_HOME.\n\n   ```\n   nvcc: NVIDIA (R) Cuda compiler driver\n   Copyright (c) 2005-2020 NVIDIA Corporation\n   Built on Mon_Nov_30_19:08:53_PST_2020\n   Cuda compilation tools, release 11.2, V11.2.67\n   Build cuda_11.2.r11.2/compiler.29373293_0\n   ```\n\n   :::{note}\n   If you want to support ROCm, you can refer to [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) to install ROCm.\n   :::\n\n4. Check the gcc version (requires 5.4+)\n\n   ```bash\n   gcc --version\n   ```\n\n5. Start building (takes 10+ min)\n\n   ```bash\n   pip install -e . -v\n   ```\n\n6. Validate the installation\n\n   ```bash\n   python .dev_scripts/check_installation.py\n   ```\n\n   If no error is reported by the above command, the installation is successful. 
If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution.\n\n   If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).\n\n#### Build on macOS\n\n```{note}\nIf you are using a Mac with an Apple silicon chip, install PyTorch 1.13+, otherwise you will encounter the problem in [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218).\n```\n\n1. Clone the repo\n\n   ```bash\n   git clone https://github.com/open-mmlab/mmcv.git\n   cd mmcv\n   ```\n\n2. Install `ninja` and `psutil` to speed up the compilation\n\n   ```bash\n   pip install -r requirements/optional.txt\n   ```\n\n3. Start building\n\n   ```bash\n   MMCV_WITH_OPS=1 pip install -e .\n   ```\n\n4. Validate the installation\n\n   ```bash\n   python .dev_scripts/check_installation.py\n   ```\n\n   If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution.\n\n   If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).\n\n#### Build on Windows\n\nBuilding MMCV on Windows is a bit more complicated than that on Linux.\nThe following instructions show how to get this accomplished.\n\n##### Prerequisite\n\nThe following software is required for building MMCV on Windows.\nInstall them first.\n\n- [Git](https://git-scm.com/download/win)\n  - During installation, tick **add git to Path**.\n- [Visual Studio Community 2019](https://visualstudio.microsoft.com)\n  - A compiler for C++ and CUDA codes.\n- [Miniconda](https://docs.conda.io/en/latest/miniconda.html)\n  - Official distributions of Python should work too.\n- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive)\n  - Not required for building CPU version.\n  - Customize the installation if necessary. 
As a recommendation, skip the driver installation if a newer version is already installed.\n\n```{note}\nYou should know how to set up environment variables, especially `Path`, on Windows. The following instruction relies heavily on this skill.\n```\n\n##### Common steps\n\n1. Launch Anaconda prompt from Windows Start menu\n\n   Do not use raw `cmd.exe`, as this instruction is based on PowerShell syntax.\n\n2. Create a new conda environment\n\n   ```powershell\n   (base) PS C:\\Users\\xxx> conda create --name mmcv python=3.7\n   (base) PS C:\\Users\\xxx> conda activate mmcv  # make sure to activate environment before any operation\n   ```\n\n3. Install PyTorch. Choose a version based on your need.\n\n   ```powershell\n   # CUDA version\n   (mmcv) PS C:\\Users\\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch\n   # CPU version\n   (mmcv) PS C:\\Users\\xxx> conda install pytorch torchvision cpuonly -c pytorch\n   ```\n\n4. Clone the repo\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx> git clone https://github.com/open-mmlab/mmcv.git\n   (mmcv) PS C:\\Users\\xxx> cd mmcv\n   ```\n\n5. Install `ninja` and `psutil` to speed up the compilation\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> pip install -r requirements/optional.txt\n   ```\n\n6. Set up MSVC compiler\n\n   Set Environment variable, add `C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\MSVC\\14.27.29110\\bin\\Hostx86\\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below.\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> cl\n   Microsoft (R) C/C++ Optimizing  Compiler Version 19.27.29111 for x64\n   Copyright (C) Microsoft Corporation.   All rights reserved.\n\n   usage: cl [ option... ] filename... [ / link linkoption... ]\n   ```\n\n   For compatibility, we use the x86-hosted and x64-targeted compiler. 
Note `Hostx86\\x64` in the path.\n\n   You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However, only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English.\n\n##### Build and install MMCV\n\nmmcv can be built in two ways:\n\n1. Full version (CPU ops)\n\n   Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only.\n\n2. Full version (CUDA ops)\n\n   Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented).\n\n###### CPU version\n\nBuild and install\n\n```powershell\n(mmcv) PS C:\\Users\\xxx\\mmcv> python setup.py build_ext\n(mmcv) PS C:\\Users\\xxx\\mmcv> python setup.py develop\n```\n\n###### GPU version\n\n1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, the desired output is shown below:\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> ls env:\n\n   Name                           Value\n   ----                           -----\n   CUDA_PATH                      C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\n   CUDA_PATH_V10_1                C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.1\n   CUDA_PATH_V10_2                C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\n   ```\n\n   This should already be done by the CUDA installer. If not, or if you have multiple versions of the CUDA toolkit installed, set it with\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> $env:CUDA_HOME = \"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\"\n   # OR\n   (mmcv) PS C:\\Users\\xxx\\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs:\n   ```\n\n2. 
Set CUDA target arch\n\n   ```shell\n   # Here you need to change to the target architecture corresponding to your GPU\n   (mmcv) PS C:\\Users\\xxx\\mmcv> $env:TORCH_CUDA_ARCH_LIST=\"7.5\"\n   ```\n\n   :::{note}\n   Check the compute capability of your GPU [here](https://developer.nvidia.com/cuda-gpus).\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> &\"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\extras\\demo_suite\\deviceQuery.exe\"\n   Device 0: \"NVIDIA GeForce GTX 1660 SUPER\"\n   CUDA Driver Version / Runtime Version          11.7 / 11.1\n   CUDA Capability Major/Minor version number:    7.5\n   ```\n\n   The 7.5 above indicates the target architecture. Note: You need to replace v10.2 with your CUDA version in the above command.\n   :::\n\n3. Build and install\n\n   ```powershell\n   # build\n   python setup.py build_ext # if success, cl will be launched to compile ops\n   # install\n   python setup.py develop\n   ```\n\n   ```{note}\n   If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTorch as described in [this issue](https://github.com/pytorch/pytorch/issues/42467). Follow [this pull request](https://github.com/pytorch/pytorch/pull/43380/files) to modify the source code in your local PyTorch installation.\n   ```\n\n##### Validate installation\n\n```powershell\n(mmcv) PS C:\\Users\\xxx\\mmcv> python .dev_scripts/check_installation.py\n```\n\nIf no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution.\nIf no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).\n\n### Build mmcv-lite\n\nIf you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).\n\n1. 
Clone the repo\n\n   ```bash\n   git clone https://github.com/open-mmlab/mmcv.git\n   cd mmcv\n   ```\n\n2. Start building\n\n   ```bash\n   MMCV_WITH_OPS=0 pip install -e . -v\n   ```\n\n3. Validate installation\n\n   ```bash\n   python -c 'import mmcv;print(mmcv.__version__)'\n   ```\n\n### Build mmcv-full on Cambricon MLU Devices\n\n#### Install torch_mlu\n\n##### Option1: Install mmcv-full based on Cambricon docker image\n\nFirstly, install and pull Cambricon docker image (please email service@cambricon.com for the latest release docker):\n\n```bash\ndocker pull ${docker image}\n```\n\nRun and attach to the docker, [Install mmcv-full on MLU device](#install-mmcv\\-full-on-cambricon-mlu-device) and [make sure you've installed mmcv-full on MLU device successfully](#test-code)\n\n##### Option2: Install mmcv-full from compiling Cambricon PyTorch source code\n\nPlease email service@cambricon.com or contact with Cambricon engineers for a suitable version of CATCH package. After you get the suitable version of CATCH package, please follow the steps in ${CATCH-path}/CONTRIBUTING.md to install Cambricon PyTorch.\n\n#### Install mmcv-full on Cambricon MLU device\n\nClone the repo\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git\n```\n\nThe mlu-ops library will be downloaded to the default directory (mmcv/mlu-ops) while building MMCV. 
You can also set `MMCV_MLU_OPS_PATH` to an existing mlu-ops library before building as follows:\n\n```bash\nexport MMCV_MLU_OPS_PATH=/xxx/xxx/mlu-ops\n```\n\nInstall mmcv-full\n\n```bash\ncd mmcv\nexport MMCV_WITH_OPS=1\nexport FORCE_MLU=1\npython setup.py install\n```\n\n#### Test Code\n\nAfter finishing previous steps, you can run the following python code to make sure that you've installed mmcv-full on MLU device successfully\n\n```python\nimport torch\nimport torch_mlu\nfrom mmcv.ops import sigmoid_focal_loss\nx = torch.randn(3, 10).mlu()\nx.requires_grad = True\ny = torch.tensor([1, 5, 3]).mlu()\nw = torch.ones(10).float().mlu()\noutput = sigmoid_focal_loss(x, y, 2.0, 0.25, w, 'none')\nprint(output)\n```\n"
  },
  {
    "path": "docs/en/get_started/installation.md",
    "content": "## Installation\n\nThere are two versions of MMCV:\n\n- **mmcv**: comprehensive, with full features and various CUDA ops out of the box. It takes longer to build.\n- **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\\<1.0.0. It is useful when you do not need those CUDA ops.\n\n```{warning}\nDo not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`.\n```\n\n### Install mmcv\n\nBefore installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command\n\n```bash\npython -c 'import torch;print(torch.__version__)'\n```\n\nIf version information is output, then PyTorch is installed.\n\n#### Install with mim (recommended)\n\n[mim](https://github.com/open-mmlab/mim) is the package management tool for the OpenMMLab projects, which makes it easy to install mmcv\n\n```bash\npip install -U openmim\nmim install mmcv\n```\n\nIf you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](build.md).\n\n<details>\n<summary>Installation log using pre-built packages</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv<br />\n<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>\n\n</details>\n\n<details>\n<summary>Installation log using source packages</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br 
/>\nCollecting mmcv==2.0.0<br />\n<b>Downloading mmcv-2.0.0.tar.gz</b>\n\n</details>\n\nTo install a specific version of mmcv, for example, mmcv version 2.0.0, you can use the following command\n\n```bash\nmim install mmcv==2.0.0\n```\n\n:::{note}\nIf you would like to use `opencv-python-headless` instead of `opencv-python`,\ne.g., in a minimum container environment or servers without GUI,\nyou can first install it before installing MMCV to skip the installation of `opencv-python`.\n\nAlternatively, if it takes too long to install a dependency library, you can specify the pypi source\n\n```bash\nmim install mmcv -i https://pypi.tuna.tsinghua.edu.cn/simple\n```\n\n:::\n\nYou can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands.\n\n#### Install with pip\n\nUse the following command to check the version of CUDA and PyTorch\n\n```bash\npython -c 'import torch;print(torch.__version__);print(torch.version.cuda)'\n```\n\nSelect the appropriate installation command depending on the type of system, CUDA version, PyTorch version, and MMCV version\n\n<html>\n<body>\n<style>\n    select {\n        /*z-index: 1000;*/\n        position: absolute;\n        top: 10px;\n        width: 6.7rem;\n    }\n    #select-container {\n        position: relative;\n        height: 30px;\n    }\n    #select-cmd {\n        background-color: #f5f6f7;\n        font-size: 14px;\n        margin-top: 20px;\n    }\n    /* 让每一个都间隔1.3rem */\n    #select-os {\n        /* left: 1.375rem; */\n        left: 0;\n    }\n    #select-cuda {\n        /* left: 9.375rem;    9.375 = 1.375 + 6.7 + 1.3 */\n        left: 8rem;\n    }\n    #select-torch {\n        /* left: 17.375rem;    17.375 = 9.375 + 6.7 + 1.3 */\n        left: 16rem;\n    }\n    #select-mmcv {\n        /* left: 25.375rem;    25.375 = 17.375 + 6.7 + 1.3 */\n        left: 24rem;\n    }\n</style>\n<div 
id=\"select-container\">\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeOS(this.value)\"\n            id=\"select-os\">\n    </select>\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeCUDA(this.value)\"\n            id=\"select-cuda\">\n    </select>\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeTorch(this.value)\"\n            id=\"select-torch\">\n    </select>\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeMMCV(this.value)\"\n            id=\"select-mmcv\">\n    </select>\n</div>\n<pre id=\"select-cmd\"></pre>\n</body>\n<script>\n    // 各个select当前的值\n    let osVal, cudaVal, torchVal, mmcvVal;\n    function clickOutside(targetDom, handler) {\n        const clickHandler = (e) => {\n            if (!targetDom || targetDom.contains(e.target)) return;\n            handler?.();\n            document.removeEventListener('click', clickHandler, false);\n        };\n        document.addEventListener('click', clickHandler, false);\n    }\n    function changeMMCV(val) {\n        mmcvVal = val;\n        change(\"select-mmcv\");\n    }\n    function changeTorch(val) {\n        torchVal = val;\n        change(\"select-torch\");\n    }\n    function changeCUDA(val) {\n        cudaVal = val;\n        change(\"select-cuda\");\n    }\n    function changeOS(val) {\n        osVal = val;\n        change(\"select-os\");\n    }\n    // 控制size大小相关的几个方法\n    function 
handleSelectMouseDown(id) {\n        const dom = document.getElementById(id);\n        if (!dom) return;\n        const len = dom?.options?.length;\n        if (len >= 10) {\n            dom.size = 10;\n            dom.style.zIndex = 100;\n        }\n    }\n    function handleSelectClick() {\n        const selects = Array.from(document.getElementsByTagName(\"select\"));\n        selects.forEach(select => {\n            select.size = 1;\n        });\n    }\n    function handleSelectBlur(id) {\n        const dom = document.getElementById(id);\n        if (!dom) {\n            // 如果没有指定特定的id，那就直接把所有的select都设置成size = 1\n            handleSelectClick();\n            return;\n        }\n        dom.size = 1;\n        dom.style.zIndex = 1;\n    }\n    function changeCmd() {\n        const cmd = document.getElementById(\"select-cmd\");\n        let cmdString = \"pip install mmcv=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html\";\n        // e.g: pip install mmcv==2.0.0rc1 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9/index.html\n        let cudaVersion;\n        if (cudaVal === \"cpu\" || cudaVal === \"mps\") {\n            cudaVersion = \"cpu\";\n        } else {\n            cudaVersion = `cu${cudaVal.split(\".\").join(\"\")}`;\n        }\n        const torchVersion = `torch${torchVal.substring(0, torchVal.length - 2)}`;\n        cmdString = cmdString.replace(\"{cu_version}\", cudaVersion).replace(\"{mmcv_version}\", mmcvVal).replace(\"{torch_version}\", torchVersion);\n        cmd.textContent = cmdString;\n    }\n    // string数组去重\n    function unique(arr) {\n        if (!arr || !Array.isArray(arr)) return [];\n        return [...new Set(arr)];\n    }\n    // 根据string数组生成option的DocumentFragment\n    function genOptionFragment(data, id) {\n        const name = id.includes(\"-\")? 
id.split(\"-\")[1] : id;\n        const fragment = new DocumentFragment();\n        data.forEach(option => {\n            const ele = document.createElement(\"option\");\n            let text = `${name} ${option}`;\n            if (name === \"os\" || option.toUpperCase() === \"CPU\" || option.toUpperCase() === \"MPS\") {\n                text = `${option}`;\n            }\n            ele.textContent = text;\n            // 添加value属性，方便下拉框选择时直接读到数据\n            ele.value = option;\n            // 添加点击事件监听\n            ele.addEventListener('click', handleSelectClick);\n            fragment.appendChild(ele);\n        });\n        return fragment;\n    }\n    // 在dom树中找到id对应的dom（select元素），并将生成的options添加到元素内\n    function findAndAppend(data, id) {\n        const fragment = genOptionFragment(data, id);\n        const dom = document.getElementById(id);\n        if (dom) dom.replaceChildren(fragment);\n    }\n    /**\n     * change方法的重点在于\n     * 1. 各个下拉框数据的联动\n     *      OS ==> cuda ==> torch ==> mmcv\n     * 2. 
命令行的修改\n     */\n    function change(id) {\n        const order = [\"select-mmcv\", \"select-torch\", \"select-cuda\", \"select-os\"];\n        const idx = order.indexOf(id);\n        if (idx === -1) return;\n        const versionDetail = version[osVal];\n        if (idx >= 3) {\n            // 根据os修改cuda\n            let cuda = [];\n            versionDetail.forEach(v => {\n                cuda.push(v.cuda);\n            });\n            cuda = unique(cuda);\n            cudaVal = cuda[0];\n            findAndAppend(cuda, \"select-cuda\");\n        }\n        if (idx >= 2) {\n            // 根据cuda修改torch\n            const torch = [];\n            versionDetail.forEach(v => {\n                if (v.cuda === cudaVal) torch.push(v.torch);\n            });\n            torchVal = torch[0];\n            findAndAppend(torch, \"select-torch\");\n        }\n        if (idx >= 1) {\n            // 根据torch修改mmcv\n            let mmcv = [];\n            versionDetail.forEach(v => {\n                if (v.cuda === cudaVal && v.torch === torchVal) mmcv = v.mmcv;\n            });\n            mmcvVal = mmcv[0];\n            findAndAppend(mmcv, \"select-mmcv\");\n        }\n        changeCmd();\n    }\n    // 初始化，处理version数据，并调用findAndAppend\n    function init() {\n        // 增加一个全局的click事件监听，作为select onBlur事件失效的兜底\n        // document.addEventListener(\"click\", handleSelectBlur);\n        const version = window.version;\n        // OS\n        const os = Object.keys(version);\n        osVal = os[0];\n        findAndAppend(os, \"select-os\");\n        change(\"select-os\");\n        changeCmd();\n    }\n    // 利用xhr获取本地version数据，如果作为html直接浏览的话需要使用本地服务器打开，否则会有跨域问题\n    window.onload = function () {\n        const url = \"../_static/version.json\"\n        // 申明一个XMLHttpRequest\n        const request = new XMLHttpRequest();\n        // 设置请求方法与路径\n        request.open(\"get\", url);\n        // 不发送数据到服务器\n        request.send(null);\n        //XHR对象获取到返回信息后执行\n        
request.onload = function () {\n            // 返回状态为200，即为数据获取成功\n            if (request.status !== 200) return;\n            const data = JSON.parse(request.responseText);\n            window.version = data;\n            init();\n        }\n    }\n</script>\n</html>\n\nIf you do not find a corresponding version in the dropdown box above, you probably do not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, at which point you can [build mmcv from source](build.md).\n\n:::{note}\nmmcv is only compiled on PyTorch 1.x.0 because the compatibility\nusually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you\ncan install mmcv compiled with PyTorch 1.x.0 and it usually works well.\nFor example, if your PyTorch version is 1.8.1, you can feel free to choose 1.8.x.\n:::\n\n:::{note}\nIf you would like to use `opencv-python-headless` instead of `opencv-python`,\ne.g., in a minimum container environment or servers without GUI,\nyou can first install it before installing MMCV to skip the installation of `opencv-python`.\n\nAlternatively, if it takes too long to install a dependency library, you can specify the pypi source\n\n```bash\nmim install mmcv -i https://pypi.tuna.tsinghua.edu.cn/simple\n```\n\n:::\n\nYou can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) to check the installation of mmcv after running the installation commands.\n\n#### Using mmcv with Docker\n\nBuild with local repository\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git && cd mmcv\ndocker build -t mmcv -f docker/release/Dockerfile .\n```\n\nOr build with remote repository\n\n```bash\ndocker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release\n```\n\nThe [Dockerfile](release/Dockerfile) installs latest released version of mmcv-full by default, but you can specify mmcv versions to install expected versions.\n\n```bash\ndocker image build -t mmcv -f 
docker/release/Dockerfile --build-arg MMCV=2.0.0 .\n```\n\nIf you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images.\n\nAn example to build an image with PyTorch 1.11 and CUDA 11.3.\n\n```bash\ndocker build -t mmcv -f docker/release/Dockerfile \\\n    --build-arg PYTORCH=1.11.0 \\\n    --build-arg CUDA=11.3 \\\n    --build-arg CUDNN=8 \\\n    --build-arg MMCV=2.0.0 .\n```\n\nMore available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).\n\n### Install mmcv-lite\n\nIf you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).\n\n```bash\npip install mmcv-lite\n```\n"
  },
  {
    "path": "docs/en/get_started/introduction.md",
    "content": "## Introduction\n\nMMCV is a foundational library for computer vision research and provides the following functionalities.\n\n- [Image/Video processing](../understand_mmcv/data_process.md)\n- [Image and annotation visualization](../understand_mmcv/visualization.md)\n- [Image transformation](../understand_mmcv/data_transform.md)\n- [Various CNN architectures](../understand_mmcv/cnn.md)\n- [High-quality implementation of common CUDA ops](../understand_mmcv/ops.md)\n\nIt supports the following systems:\n\n- Linux\n- Windows\n- macOS\n\nIt supports many research projects as below:\n\n- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.\n- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.\n- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.\n- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.\n- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.\n- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.\n- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.\n- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.\n- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.\n- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.\n- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.\n- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.\n- 
[MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.\n- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.\n- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.\n- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.\n- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.\n- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.\n"
  },
  {
    "path": "docs/en/get_started/previous_versions.md",
    "content": "## OTHER VERSIONS OF PYTORCH BUILT FOR MMCV-FULL\n\nWe no longer provide `mmcv-full` packages compiled under lower versions of `PyTorch`, but for your convenience, you can find them below.\n\n### PyTorch 1.4\n\n| 1.0.0 \\<= mmcv_version \\<= 1.2.1\n\n#### CUDA 10.1\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.4.0/index.html\n```\n\n#### CUDA 9.2\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.4.0/index.html\n```\n\n#### CPU\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.4.0/index.html\n```\n\n### PyTorch v1.3\n\n| 1.0.0 \\<= mmcv_version \\<= 1.3.16\n\n#### CUDA 10.1\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.3.0/index.html\n```\n\n#### CUDA 9.2\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.3.0/index.html\n```\n\n#### CPU\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.3.0/index.html\n```\n"
  },
  {
    "path": "docs/en/index.rst",
    "content": "Welcome to MMCV's documentation!\n================================\n\nYou can switch between Chinese and English documents in the lower-left corner of the layout.\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Get Started\n\n   get_started/introduction.md\n   get_started/installation.md\n   get_started/build.md\n   get_started/api_reference.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Understand MMCV\n\n   understand_mmcv/data_process.md\n   understand_mmcv/data_transform.md\n   understand_mmcv/visualization.md\n   understand_mmcv/cnn.md\n   understand_mmcv/ops.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Deployment\n\n   deployment/mmcv_ops_definition.md\n\n.. toctree::\n   :caption: Switch Language\n\n   switch_language.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Compatibility\n\n   compatibility.md\n\n.. toctree::\n\n   faq.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Community\n\n   community/contributing.md\n   community/pr.md\n\n.. toctree::\n   :maxdepth: 1\n   :caption: API Reference\n\n   mmcv.image <api/image>\n   mmcv.video <api/video>\n   mmcv.visualization <api/visualization>\n   mmcv.cnn <api/cnn>\n   mmcv.ops <api/ops>\n   mmcv.transforms <api/transforms>\n   mmcv.arraymisc <api/arraymisc>\n   mmcv.utils <api/utils>\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`search`\n"
  },
  {
    "path": "docs/en/make.bat",
    "content": "@ECHO OFF\n\npushd %~dp0\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-build\n)\nset SOURCEDIR=.\nset BUILDDIR=_build\n\nif \"%1\" == \"\" goto help\n\n%SPHINXBUILD% >NUL 2>NUL\nif errorlevel 9009 (\n\techo.\n\techo.The 'sphinx-build' command was not found. Make sure you have Sphinx\n\techo.installed, then set the SPHINXBUILD environment variable to point\n\techo.to the full path of the 'sphinx-build' executable. Alternatively you\n\techo.may add the Sphinx directory to PATH.\n\techo.\n\techo.If you don't have Sphinx installed, grab it from\n\techo.http://sphinx-doc.org/\n\texit /b 1\n)\n\n%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%\ngoto end\n\n:help\n%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%\n\n:end\npopd\n"
  },
  {
    "path": "docs/en/switch_language.md",
    "content": "## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>\n\n## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>\n"
  },
  {
    "path": "docs/en/understand_mmcv/cnn.md",
    "content": "## CNN\n\nWe provide some building bricks for CNNs, including layer building, module bundles and weight initialization.\n\n### Layer building\n\nWe may need to try different layers of the same type when running experiments,\nbut do not want to modify the code from time to time.\nHere we provide some layer building methods to construct layers from a dict,\nwhich can be written in configs or specified via command line arguments.\n\n#### Usage\n\nA simplest example is\n\n```python\nfrom mmcv.cnn import build_conv_layer\n\ncfg = dict(type='Conv3d')\nlayer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)\n```\n\n- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d).\n- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d).\n- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU.\n- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle.\n- `build_padding_layer`: Supported types are zero, reflect, replicate.\n\n#### Extension\n\nWe also allow extending the building methods with custom layers and operators.\n\n1. Write and register your own module.\n\n   ```python\n   from mmengine.registry import MODELS\n\n   @MODELS.register_module()\n   class MyUpsample:\n\n       def __init__(self, scale_factor):\n           pass\n\n       def forward(self, x):\n           pass\n   ```\n\n2. 
Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it.\n\n   ```python\n   from mmcv.cnn import build_upsample_layer\n\n   cfg = dict(type='MyUpsample', scale_factor=2)\n   layer = build_upsample_layer(cfg)\n   ```\n\n### Module bundles\n\nWe also provide common module bundles to facilitate the network construction.\n`ConvModule` is a bundle of convolution, normalization and activation layers,\nplease refer to the [api](api.html#mmcv.cnn.ConvModule) for details.\n\n```python\nfrom mmcv.cnn import ConvModule\n\n# conv + bn + relu\nconv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n# conv + gn + relu\nconv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))\n# conv + relu\nconv = ConvModule(3, 8, 2)\n# conv\nconv = ConvModule(3, 8, 2, act_cfg=None)\n# conv + leaky relu\nconv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))\n# bn + conv + relu\nconv = ConvModule(\n    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))\n```\n"
  },
  {
    "path": "docs/en/understand_mmcv/data_process.md",
    "content": "## Data Process\n\n### Image\n\nThis module provides some image processing methods, which require `opencv` to be installed first.\n\n#### Read/Write/Show\n\nTo read or write image files, use `imread` or `imwrite`.\n\n```python\nimport mmcv\n\nimg = mmcv.imread('test.jpg')\nimg = mmcv.imread('test.jpg', flag='grayscale')\nimg_ = mmcv.imread(img)  # nothing will happen, img_ = img\nmmcv.imwrite(img, 'out.jpg')\n```\n\nTo read images from bytes\n\n```python\nwith open('test.jpg', 'rb') as f:\n    data = f.read()\nimg = mmcv.imfrombytes(data)\n```\n\nTo show an image file or a loaded image\n\n```python\nmmcv.imshow('tests/data/color.jpg')\n# an image that is already loaded (a numpy array) can be shown as well\n\nfor i in range(10):\n    img = np.random.randint(256, size=(100, 100, 3), dtype=np.uint8)\n    mmcv.imshow(img, win_name='test image', wait_time=200)\n```\n\n#### Color space conversion\n\nSupported conversion methods:\n\n- bgr2gray\n- gray2bgr\n- bgr2rgb\n- rgb2bgr\n- bgr2hsv\n- hsv2bgr\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\nimg1 = mmcv.bgr2rgb(img)\nimg2 = mmcv.rgb2gray(img1)\nimg3 = mmcv.bgr2hsv(img)\n```\n\n#### Resize\n\nThere are three resize methods. All `imresize_*` methods have an argument `return_scale`,\nif this argument is `False`, then the return value is merely the resized image, otherwise\nis a tuple `(resized_img, scale)`.\n\n```python\n# resize to a given size\nmmcv.imresize(img, (1000, 600), return_scale=True)\n\n# resize to the same size of another image\nmmcv.imresize_like(img, dst_img, return_scale=False)\n\n# resize by a ratio\nmmcv.imrescale(img, 0.5)\n\n# resize so that the max edge no longer than 1000, short edge no longer than 800\n# without changing the aspect ratio\nmmcv.imrescale(img, (1000, 800))\n```\n\n#### Rotate\n\nTo rotate an image by some angle, use `imrotate`. The center can be specified,\nwhich is the center of original image by default. 
There are two modes of rotating,\none is to keep the image size unchanged so that some parts of the image will be\ncropped after rotating, the other is to extend the image size to fit the rotated\nimage.\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\n\n# rotate the image clockwise by 30 degrees.\nimg_ = mmcv.imrotate(img, 30)\n\n# rotate the image counterclockwise by 90 degrees.\nimg_ = mmcv.imrotate(img, -90)\n\n# rotate the image clockwise by 30 degrees, and rescale it by 1.5x at the same time.\nimg_ = mmcv.imrotate(img, 30, scale=1.5)\n\n# rotate the image clockwise by 30 degrees, with (100, 100) as the center.\nimg_ = mmcv.imrotate(img, 30, center=(100, 100))\n\n# rotate the image clockwise by 30 degrees, and extend the image size.\nimg_ = mmcv.imrotate(img, 30, auto_bound=True)\n```\n\n#### Flip\n\nTo flip an image, use `imflip`.\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\n\n# flip the image horizontally\nmmcv.imflip(img)\n\n# flip the image vertically\nmmcv.imflip(img, direction='vertical')\n```\n\n#### Crop\n\n`imcrop` can crop the image with one or more regions. 
Each region is represented by the upper left and lower right coordinates as (x1, y1, x2, y2).\n\n```python\nimport mmcv\nimport numpy as np\n\nimg = mmcv.imread('tests/data/color.jpg')\n\n# crop the region (10, 10, 100, 120)\nbboxes = np.array([10, 10, 100, 120])\npatch = mmcv.imcrop(img, bboxes)\n\n# crop two regions (10, 10, 100, 120) and (0, 0, 50, 50)\nbboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]])\npatches = mmcv.imcrop(img, bboxes)\n\n# crop two regions, and rescale the patches by 1.2x\npatches = mmcv.imcrop(img, bboxes, scale=1.2)\n```\n\n#### Padding\n\nThere are two methods, `impad` and `impad_to_multiple`, to pad an image to the\nspecific size with given values.\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\n\n# pad the image to (1000, 1200) with all zeros\nimg_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0)\n\n# pad the image to (1000, 1200) with different values for three channels.\nimg_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200))\n\n# pad the image on left, right, top, bottom borders with all zeros\nimg_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0)\n\n# pad the image on left, right, top, bottom borders with different values\n# for three channels.\nimg_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200))\n\n# pad an image so that each edge is a multiple of some value.\nimg_ = mmcv.impad_to_multiple(img, 32)\n```\n\n### Video\n\nThis module provides the following functionalities:\n\n- A `VideoReader` class with friendly apis to read and convert videos.\n- Some methods for editing (cut, concat, resize) videos.\n- Optical flow read/write/warp.\n\n#### VideoReader\n\nThe `VideoReader` class provides sequence like apis to access video frames.\nIt will internally cache the frames which have been visited.\n\n```python\nvideo = mmcv.VideoReader('test.mp4')\n\n# obtain basic information\nprint(len(video))\nprint(video.width, video.height, video.resolution, video.fps)\n\n# iterate over all 
frames\nfor frame in video:\n    print(frame.shape)\n\n# read the next frame\nimg = video.read()\n\n# read a frame by index\nimg = video[100]\n\n# read some frames\nimg = video[5:10]\n```\n\nTo convert a video to images or generate a video from an image directory.\n\n```python\n# split a video into frames and save to a folder\nvideo = mmcv.VideoReader('test.mp4')\nvideo.cvt2frames('out_dir')\n\n# generate video from frames\nmmcv.frames2video('out_dir', 'test.avi')\n```\n\n#### Editing utils\n\nThere are also some methods for editing videos, which wrap the commands of ffmpeg.\n\n```python\n# cut a video clip\nmmcv.cut_video('test.mp4', 'clip1.mp4', start=3, end=10, vcodec='h264')\n\n# join a list of video clips\nmmcv.concat_video(['clip1.mp4', 'clip2.mp4'], 'joined.mp4', log_level='quiet')\n\n# resize a video with the specified size\nmmcv.resize_video('test.mp4', 'resized1.mp4', (360, 240))\n\n# resize a video with a scaling ratio of 2\nmmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2)\n```\n\n#### Optical flow\n\n`mmcv` provides the following methods to operate on optical flows.\n\n- IO\n- Visualization\n- Flow warping\n\nWe provide two options to dump optical flow files: uncompressed and compressed.\nThe uncompressed way just dumps the floating numbers to a binary file. It is\nlossless but the dumped file has a larger size.\nThe compressed way quantizes the optical flow to 0-255 and dumps it as a\njpeg image. The flow of x-dim and y-dim will be concatenated into a single image.\n\n1. 
IO\n\n```python\nflow = np.random.rand(800, 600, 2).astype(np.float32)\n# dump the flow to a flo file (~3.7M)\nmmcv.flowwrite(flow, 'uncompressed.flo')\n# dump the flow to a jpeg file (~230K)\n# the shape of the dumped image is (800, 1200)\nmmcv.flowwrite(flow, 'compressed.jpg', quantize=True, concat_axis=1)\n\n# read the flow file, the shape of loaded flow is (800, 600, 2) for both ways\nflow = mmcv.flowread('uncompressed.flo')\nflow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1)\n```\n\n2. Visualization\n\nIt is possible to visualize optical flows with `mmcv.flowshow()`.\n\n```python\nmmcv.flowshow(flow)\n```\n\n![progress](../_static/flow_visualization.png)\n\n3. Flow warping\n\n```python\nimg1 = mmcv.imread('img1.jpg')\nflow = mmcv.flowread('flow.flo')\nwarped_img2 = mmcv.flow_warp(img1, flow)\n```\n\nimg1 (left) and img2 (right)\n\n![raw images](../_static/flow_raw_images.png)\n\noptical flow (img2 -> img1)\n\n![optical flow](../_static/flow_img2toimg1.png)\n\nwarped image and difference with ground truth\n\n![warped image](../_static/flow_warp_diff.png)\n"
  },
  {
    "path": "docs/en/understand_mmcv/data_transform.md",
    "content": "# Data Transformation\n\nIn the OpenMMLab algorithm library, dataset construction and data preparation are decoupled. Usually, the construction of the dataset only parses the dataset and records the basic information of each sample, while the data preparation is a series of data transformations including data loading, preprocessing, formatting, and other operations performed according to the basic information of the sample.\n\n## Design of data transformation\n\nIn MMCV, we use various callable data transformation classes to manipulate data. These data transformation classes can accept several configuration parameters for the instantiation and then process the input data dictionary by `__call__` method. All data transformation methods accept a dictionary as the input and produce the output as a dictionary as well. A simple example is as follows:\n\n```python\n>>> import numpy as np\n>>> from mmcv.transforms import Resize\n>>>\n>>> transform = Resize(scale=(224, 224))\n>>> data_dict = {'img': np.random.rand(256, 256, 3)}\n>>> data_dict = transform(data_dict)\n>>> print(data_dict['img'].shape)\n(224, 224, 3)\n```\n\nThe data transformation class reads some fields of the input dictionary and may add or update some fields. The keys of these fields are mostly fixed. For example, `Resize` will always read fields such as `\"img\"` in the input dictionary. More information about the conventions for input and output fields could be found in the documentation of the corresponding class.\n\n```{note}\nBy convention, the order of image shape which is used as **initialization parameters** in data transformation (such as Resize, Pad) is (width, height). 
In the dictionary returned by the data transformation, the image related shape, such as `img_shape`, `ori_shape`, `pad_shape`, etc., is (height, width).\n```\n\nMMCV provides a unified base class called `BaseTransform` for all data transformation classes:\n\n```python\nclass BaseTransform(metaclass=ABCMeta):\n\n    def __call__(self, results: dict) -> dict:\n\n        return self.transform(results)\n\n    @abstractmethod\n    def transform(self, results: dict) -> dict:\n        pass\n```\n\nAll data transformation classes must inherit `BaseTransform` and implement the `transform` method. Both the input and output of the `transform` method are a dictionary. In the **Custom data transformation class** section, we will describe how to implement a data transformation class in more detail.\n\n## Data pipeline\n\nAs mentioned above, the inputs and outputs of all data transformations are dictionaries. Moreover, according to the \\[Convention on Datasets\\] (TODO) in OpenMMLab, the basic information of each sample in the dataset is also a dictionary. This way, we can connect all data transformation operations end to end and combine them into a data pipeline. This pipeline inputs the information dictionary of the samples in the dataset and outputs the information dictionary after a series of processing.\n\nTaking the classification task as an example, we show a typical data pipeline in the figure below. For each sample, the information stored in the dataset is a dictionary, as shown on the far left in the figure. 
After each data transformation operation represented by the blue block, a new field (marked in green) will be added to the data dictionary or an existing field (marked in orange) will be updated.\n\n<div align=center>\n<img src=\"https://user-images.githubusercontent.com/26739999/154197953-bf0b1a16-3f41-4bc7-9e67-b2b9b323d895.png\" width=\"90%\"/>\n</div>\n\nThe data pipeline is a list of several data transformation configuration dictionaries in the configuration file. Each dataset needs to set the parameter `pipeline` to define the data preparation operations the dataset needs to perform. The configuration of the above data pipeline in the configuration file is as follows:\n\n```python\npipeline = [\n    dict(type='LoadImageFromFile'),\n    dict(type='Resize', size=256, keep_ratio=True),\n    dict(type='CenterCrop', crop_size=224),\n    dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),\n    dict(type='ClsFormatBundle')\n]\n\ndataset = dict(\n    ...\n    pipeline=pipeline,\n    ...\n)\n```\n\n## Common data transformation classes\n\nThe commonly used data transformation classes can be roughly divided into data loading, data preprocessing and augmentation, and data formatting. In MMCV, we provide some commonly used classes as follows:\n\n### Data loading\n\nTo support the loading of large-scale datasets, data is usually not loaded when `Dataset` is initialized. Only the corresponding path is loaded. Therefore, it is necessary to load specific data in the data pipeline.\n\n|            Class            |                    Feature                     |\n| :-------------------------: | :--------------------------------------------: |\n| [`LoadImageFromFile`](TODO) |              Load from file path               |\n|  [`LoadAnnotations`](TODO)  | Load and organize the annotations (bbox, etc.) 
|\n\n### Data preprocessing and enhancement\n\nData preprocessing and augmentation usually involve transforming the image itself, such as cropping, padding, scaling, etc.\n\n|              Class               |                        Feature                         |\n| :------------------------------: | :----------------------------------------------------: |\n|          [`Pad`](TODO)           |                        Padding                         |\n|       [`CenterCrop`](TODO)       |                      Center crop                       |\n|       [`Normalize`](TODO)        |                  Image normalization                   |\n|         [`Resize`](TODO)         |         Resize to the specified size or ratio          |\n|      [`RandomResize`](TODO)      |  Scale the image randomly within the specified range   |\n| [`RandomMultiscaleResize`](TODO) | Scale the image to a random size from multiple options |\n|    [`RandomGrayscale`](TODO)     |                    Random grayscale                    |\n|       [`RandomFlip`](TODO)       |                      Random flip                       |\n|   [`MultiScaleFlipAug`](TODO)    |    Support scaling and flipping during the testing     |\n\n### Data formatting\n\nData formatting operations are type conversions performed on the data.\n\n|          Class          |                   Feature                    |\n| :---------------------: | :------------------------------------------: |\n|   [`ToTensor`](TODO)    | Convert the specified data to `torch.Tensor` |\n| [`ImageToTensor`](TODO) |     Convert the image to `torch.Tensor`      |\n\n## Customize data transformation classes\n\nTo implement a new data transformation class, you must inherit `BaseTransform` and implement the `transform` method. 
Here, we use a simple flip transform (`MyFlip`) as an example:\n\n```python\nimport random\nimport mmcv\nfrom mmcv.transforms import BaseTransform, TRANSFORMS\n\n@TRANSFORMS.register_module()\nclass MyFlip(BaseTransform):\n    def __init__(self, direction: str):\n        super().__init__()\n        self.direction = direction\n\n    def transform(self, results: dict) -> dict:\n        img = results['img']\n        results['img'] = mmcv.imflip(img, direction=self.direction)\n        return results\n```\n\nNow, we can instantiate `MyFlip` as a callable object to handle our data dictionary.\n\n```python\nimport numpy as np\n\ntransform = MyFlip(direction='horizontal')\ndata_dict = {'img': np.random.rand(224, 224, 3)}\ndata_dict = transform(data_dict)\nprocessed_img = data_dict['img']\n```\n\nAlternatively, use `MyFlip` transform in the `pipeline` of the config file.\n\n```python\npipeline = [\n    ...\n    dict(type='MyFlip', direction='horizontal'),\n    ...\n]\n```\n\nIt should be noted that if you want to use it in the configuration file, you must ensure that the file where the `MyFlip` class is located can be imported at the runtime.\n\n## Transform wrapper\n\nTransform wrappers are a special class of data transformations. They do not operate on images, labels or other information in the data dictionary by themselves. Instead, they enhance the behavior of data transformations defined in them.\n\n### KeyMapper\n\n`KeyMapper` is used to map fields in the data dictionary. For example, image processing transforms usually get their values from the `\"img\"` field in the data dictionary. 
But sometimes we want these transforms to handle images in other fields in the data dictionary, such as the `\"gt_img\"` field.\n\nWhen used with registry and configuration file, the field map wrapper should be used as follows:\n\n```python\npipeline = [\n    ...\n    dict(type='KeyMapper',\n        mapping={\n            'img': 'gt_img',  # map \"gt_img\" to \"img\"\n            'mask': ...,  # The \"mask\" field in the raw data is not used. That is, for wrapped data transformations, the \"mask\" field is not included in the data\n        },\n        auto_remap=True,  # remap \"img\" back to \"gt_img\" after the transformation\n        transforms=[\n            # only need to specify \"img\" in `RandomFlip`\n            dict(type='RandomFlip'),\n        ])\n    ...\n]\n```\n\nWith `KeyMapper`, we don't need to consider various possible input field names in the `transform` method when we implement the data transformation class. We only need to deal with the default fields.\n\n### RandomChoice and RandomApply\n\n`RandomChoice` is used to randomly select a data transformation pipeline from the given choices. With this wrapper, we can easily implement some data augmentation functions, such as AutoAugment.\n\nIn configuration file, you can use `RandomChoice` as follows:\n\n```python\npipeline = [\n    ...\n    dict(type='RandomChoice',\n        transforms=[\n            [\n                dict(type='Posterize', bits=4),\n                dict(type='Rotate', angle=30.)\n            ],  # the first combo option\n            [\n                dict(type='Equalize'),\n                dict(type='Rotate', angle=30)\n            ],  # the second combo option\n        ],\n        prob=[0.4, 0.6]  # the prob of each combo\n        )\n    ...\n]\n```\n\n`RandomApply` is used to randomly perform a combination of data transformations with a specified probability. 
For example:\n\n```python\npipeline = [\n    ...\n    dict(type='RandomApply',\n        transforms=[dict(type='Rotate', angle=30.)],\n        prob=0.3)  # perform the transformation with prob as 0.3\n    ...\n]\n```\n\n### TransformBroadcaster\n\nUsually, a data transformation class only reads the target of an operation from one field. While we can also use `KeyMapper` to change the fields read, there is no way to apply transformations to the data of multiple fields at once. To achieve this, we need to use the multi-target extension wrapper `TransformBroadcaster`.\n\n`TransformBroadcaster` has two uses, one is to apply data transformation to multiple specified fields, and the other is to apply data transformation to a group of targets under a field.\n\n1. Apply to multiple fields\n\n   Suppose we need to apply a data transformation to images in two fields `\"lq\"` (low-quality) and `\"gt\"` (ground-truth).\n\n   ```python\n   pipeline = [\n       dict(type='TransformBroadcaster',\n           # apply to the \"lq\" and \"gt\" fields respectively, and set the \"img\" field to both\n           mapping={'img': ['lq', 'gt']},\n           # remap the \"img\" field back to the original field after the transformation\n           auto_remap=True,\n           # whether to share random variables in the transformation of each target\n           # more introduction will be referred in the following chapters (random variable sharing)\n           share_random_params=True,\n           transforms=[\n               # only need to manipulate the \"img\" field in the `RandomFlip` class\n               dict(type='RandomFlip'),\n           ])\n   ]\n   ```\n\n   In the `mapping` setting of the multi-target extension, we can also use `...` to ignore the specified original field. As shown in the following example, the wrapped `RandomCrop` will crop the image in the field `\"img\"` and update the size of the cropped image if the field `\"img_shape\"` exists. 
If we want to do the same random cropping for both image fields `\"lq\"` and `\"gt\"` at the same time but update the `\"img_shape\"` field only once, we can do it as in the example:\n\n   ```python\n   pipeline = [\n       dict(type='TransformBroadcaster',\n           mapping={\n               'img': ['lq', 'gt'],\n               'img_shape': ['img_shape', ...],\n            },\n           # remap the \"img\" and \"img_shape\" fields back to their original fields after the transformation\n           auto_remap=True,\n           # whether to share random variables in the transformation of each target\n           # more introduction will be referred in the following chapters (random variable sharing)\n           share_random_params=True,\n           transforms=[\n               # \"img\" and \"img_shape\" fields are manipulated in the `RandomCrop` class\n               # if \"img_shape\" is missing, only operate on \"img\"\n               dict(type='RandomCrop'),\n           ])\n   ]\n   ```\n\n2. 
Apply to a set of targets under a field\n\n   Suppose we need to apply a data transformation to the `\"images\"` field, which is a list of images.\n\n   ```python\n   pipeline = [\n       dict(type='TransformBroadcaster',\n           # map each image under the \"images\" field to the \"img\" field\n           mapping={'img': 'images'},\n           # remap the images under the \"img\" field back to the list in the \"images\" field after the transformation\n           auto_remap=True,\n           # whether to share random variables in the transformation of each target\n           share_random_params=True,\n           transforms=[\n               # in the `RandomFlip` transformation class, we only need to manipulate the \"img\" field\n               dict(type='RandomFlip'),\n           ])\n   ]\n   ```\n\n#### Decorator `cache_randomness`\n\nIn `TransformBroadcaster`, we provide the `share_random_params` option to support sharing random states across multiple data transformations. For example, in a super-resolution task, we want to apply **the same** random transformations **simultaneously** to the low-resolution image and the original image. If we use this function in a custom data transformation class, we need to mark which random variables support sharing in the class. 
This can be achieved with the decorator `cache_randomness`.\n\nTaking `MyFlip` from the above example, we want to perform flipping randomly with a certain probability:\n\n```python\nfrom mmcv.transforms.utils import cache_randomness\n\n@TRANSFORMS.register_module()\nclass MyRandomFlip(BaseTransform):\n    def __init__(self, prob: float, direction: str):\n        super().__init__()\n        self.prob = prob\n        self.direction = direction\n\n    @cache_randomness  # label the output of the method as a shareable random variable\n    def do_flip(self):\n        flip = True if random.random() < self.prob else False\n        return flip\n\n    def transform(self, results: dict) -> dict:\n        img = results['img']\n        if self.do_flip():\n            results['img'] = mmcv.imflip(img, direction=self.direction)\n        return results\n```\n\nIn the above example, we decorate the `do_flip` method with `cache_randomness`, marking the method return value `flip` as a random variable that supports sharing. Therefore, in the transformation of `TransformBroadcaster` to multiple targets, the value of this variable will remain the same.\n\n#### Decorator `avoid_cache_randomness`\n\nIn some cases, we cannot separate the process of generating random variables in data transformation into a class method. For example, modules from third-party libraries used in data transformation encapsulate the relevant parts of random variables inside, making them impossible to be extracted as class methods for data transformation. 
Such data transformations cannot support shared random variables through the decorator `cache_randomness` annotation, and thus cannot share random variables during multi-target extension.\n\nTo avoid misuse of such data transformations in multi-target extensions, we provide another decorator, `avoid_cache_randomness`, to mark such data transformations:\n\n```python\nfrom mmcv.transforms.utils import avoid_cache_randomness\n\n@TRANSFORMS.register_module()\n@avoid_cache_randomness\nclass MyRandomTransform(BaseTransform):\n\n    def transform(self, results: dict) -> dict:\n        ...\n```\n\nData transformation classes marked with `avoid_cache_randomness` will throw an exception when their instance is wrapped by `TransformBroadcaster` and the parameter `share_random_params` is set to True. This reminds the user not to use it in this way.\n\nThere are a few things to keep in mind when using `avoid_cache_randomness`:\n\n1. `avoid_cache_randomness` is only used to decorate data transformation classes (subclasses of `BaseTransform`) and cannot be used to decorate other general classes, class methods, or functions\n2. When a data transformation decorated with `avoid_cache_randomness` is used as a base class, its subclasses **will not inherit** its feature. If the subclass is still unable to share random variables, `avoid_cache_randomness` should be used again.\n3. A data transformation needs to be decorated with `avoid_cache_randomness` only when a data transformation is random and cannot share its random parameters. Data transformations without randomness require no decoration\n"
  },
  {
    "path": "docs/en/understand_mmcv/ops.md",
    "content": "## ops\n\nWe implement common ops used in detection, segmentation, etc.\n\n| Device                       | CPU | CUDA | MLU | MPS | Ascend |\n| ---------------------------- | --- | ---- | --- | --- | ------ |\n| ActiveRotatedFilter          | √   | √    |     |     | √      |\n| AssignScoreWithK             |     | √    |     |     |        |\n| BallQuery                    |     | √    | √   |     | √      |\n| BBoxOverlaps                 |     | √    | √   | √   | √      |\n| BorderAlign                  |     | √    |     |     |        |\n| BoxIouRotated                | √   | √    | √   |     | √      |\n| BoxIouQuadri                 | √   | √    |     |     |        |\n| CARAFE                       |     | √    | √   |     |        |\n| ChamferDistance              |     | √    |     |     | √      |\n| CrissCrossAttention          |     | √    |     |     |        |\n| ContourExpand                | √   |      |     |     |        |\n| ConvexIoU                    |     | √    |     |     |        |\n| CornerPool                   |     | √    |     |     |        |\n| Correlation                  |     | √    |     |     |        |\n| Deformable Convolution v1/v2 | √   | √    | √   |     | √      |\n| Deformable RoIPool           |     | √    | √   |     | √      |\n| DiffIoURotated               |     | √    | √   |     |        |\n| DynamicScatter               |     | √    | √   |     |        |\n| FurthestPointSample          |     | √    |     |     |        |\n| FurthestPointSampleWithDist  |     | √    |     |     |        |\n| FusedBiasLeakyrelu           |     | √    |     |     | √      |\n| GatherPoints                 |     | √    |     |     | √      |\n| GroupPoints                  |     | √    |     |     |        |\n| Iou3d                        |     | √    | √   |     |        |\n| KNN                          |     | √    |     |     |        |\n| MaskedConv                   |     | √    | √   |     | √      |\n| 
MergeCells                   |     | √    |     |     |        |\n| MinAreaPolygon               |     | √    |     |     |        |\n| ModulatedDeformConv2d        | √   | √    | √   |     | √      |\n| MultiScaleDeformableAttn     |     | √    | √   |     | √      |\n| NMS                          | √   | √    | √   |     | √      |\n| NMSRotated                   | √   | √    | √   |     | √      |\n| NMSQuadri                    | √   | √    |     |     |        |\n| PixelGroup                   | √   |      |     |     |        |\n| PointsInBoxes                | √   | √    |     |     |        |\n| PointsInPolygons             |     | √    |     |     | √      |\n| PSAMask                      | √   | √    | √   |     | √      |\n| RotatedFeatureAlign          | √   | √    | √   |     | √      |\n| RoIPointPool3d               |     | √    | √   |     |        |\n| RoIPool                      |     | √    | √   |     | √      |\n| RoIAlignRotated              | √   | √    | √   |     | √      |\n| RiRoIAlignRotated            |     | √    |     |     |        |\n| RoIAlign                     | √   | √    | √   |     | √      |\n| RoIAwarePool3d               |     | √    | √   |     |        |\n| SAConv2d                     |     | √    |     |     |        |\n| SigmoidFocalLoss             |     | √    | √   |     | √      |\n| SoftmaxFocalLoss             |     | √    |     |     | √      |\n| SoftNMS                      |     | √    |     |     |        |\n| Sparse Convolution           |     | √    | √   |     |        |\n| Synchronized BatchNorm       |     | √    |     |     |        |\n| ThreeInterpolate             |     | √    |     |     |        |\n| ThreeNN                      |     | √    | √   |     |        |\n| TINShift                     |     | √    | √   |     |        |\n| UpFirDn2d                    |     | √    |     |     |        |\n| Voxelization                 | √   | √    | √   |     | √      |\n| PrRoIPool                   
 |     | √    |     |     |        |\n| BezierAlign                  | √   | √    |     |     |        |\n| BiasAct                      |     | √    |     |     |        |\n| FilteredLrelu                |     | √    |     |     |        |\n| Conv2dGradfix                |     | √    |     |     |        |\n"
  },
  {
    "path": "docs/en/understand_mmcv/visualization.md",
    "content": "## Visualization\n\n`mmcv` can show images and annotations (currently supported types include bounding boxes).\n\n```python\n# show an image file\nmmcv.imshow('a.jpg')\n\n# show a loaded image\nimg = np.random.rand(100, 100, 3)\nmmcv.imshow(img)\n\n# show image with bounding boxes\nimg = np.random.rand(100, 100, 3)\nbboxes = np.array([[0, 0, 50, 50], [20, 20, 60, 60]])\nmmcv.imshow_bboxes(img, bboxes)\n```\n\n`mmcv` can also visualize special images such as optical flows.\n\n```python\nflow = mmcv.flowread('test.flo')\nmmcv.flowshow(flow)\n```\n"
  },
  {
    "path": "docs/zh_cn/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nSOURCEDIR     = .\nBUILDDIR      = _build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/zh_cn/_static/css/readthedocs.css",
    "content": ".header-logo {\n    background-image: url(\"../image/mmcv-logo.png\");\n    background-size: 85px 40px;\n    height: 40px;\n    width: 85px;\n}\n\ntable.colwidths-auto td {\n    width: 50%\n}\n"
  },
  {
    "path": "docs/zh_cn/_static/version.json",
    "content": "{\n    \"Linux\": [\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.4.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.4.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                
\"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.5\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": 
\"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.0\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n       
     ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n      
  {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"9.2\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"9.2\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"9.2\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.4.x\",\n            \"mmcv\": [\n          
      \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                
\"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.5.x\",\n            \"mmcv\": [\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        }\n    ],\n    \"Windows\": [\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"12.1\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": 
\"12.1\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.8\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.7\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.6\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                
\"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.5\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.3\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": 
\"11.1\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"11.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\"\n            ]\n        },\n        
{\n            \"cuda\": \"10.2\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"10.1\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.3.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.2.x\",\n            \"mmcv\": [\n                \"2.2.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n         
   ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.2.0\",\n                \"2.1.0\",\n                \"2.0.1\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                
\"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0rc1\"\n            ]\n        }\n    ],\n    \"macOS\": [\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.1.x\",\n            \"mmcv\": [\n                \"2.1.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"2.0.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"mps\",\n            \"torch\": \"1.13.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.12.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.11.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.10.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        
},\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.9.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.8.x\",\n            \"mmcv\": [\n                \"2.1.0\",\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\",\n                \"2.0.0\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.7.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        },\n        {\n            \"cuda\": \"cpu\",\n            \"torch\": \"1.6.x\",\n            \"mmcv\": [\n                \"2.0.0rc4\",\n                \"2.0.0rc3\",\n                \"2.0.0rc2\"\n            ]\n        }\n    ]\n}\n"
  },
  {
    "path": "docs/zh_cn/_templates/classtemplate.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n.. currentmodule:: {{ module }}\n\n\n{{ name | underline}}\n\n.. autoclass:: {{ name }}\n    :members:\n\n\n..\n  autogenerated from source/_templates/classtemplate.rst\n  note it does not have :inherited-members:\n"
  },
  {
    "path": "docs/zh_cn/api/arraymisc.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.arraymisc\n===================================\n\n.. contents:: mmcv.arraymisc\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.arraymisc\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   quantize\n   dequantize\n"
  },
  {
    "path": "docs/zh_cn/api/cnn.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.cnn\n===================================\n\n.. contents:: mmcv.cnn\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.cnn\n\nModule\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   ContextBlock\n   Conv2d\n   Conv3d\n   ConvAWS2d\n   ConvModule\n   ConvTranspose2d\n   ConvTranspose3d\n   ConvWS2d\n   DepthwiseSeparableConvModule\n   GeneralizedAttention\n   HSigmoid\n   HSwish\n   LayerScale\n   Linear\n   MaxPool2d\n   MaxPool3d\n   NonLocal1d\n   NonLocal2d\n   NonLocal3d\n   Scale\n   Swish\n   Conv2dRFSearchOp\n\nBuild Function\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   build_activation_layer\n   build_conv_layer\n   build_norm_layer\n   build_padding_layer\n   build_plugin_layer\n   build_upsample_layer\n\nMiscellaneous\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   fuse_conv_bn\n   conv_ws_2d\n   is_norm\n   make_res_layer\n   make_vgg_layer\n   get_model_complexity_info\n"
  },
  {
    "path": "docs/zh_cn/api/image.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.image\n===================================\n\n.. contents:: mmcv.image\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.image\n\nIO\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   imfrombytes\n   imread\n   imwrite\n   use_backend\n\nColor Space\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   bgr2gray\n   bgr2hls\n   bgr2hsv\n   bgr2rgb\n   bgr2ycbcr\n   gray2bgr\n   gray2rgb\n   hls2bgr\n   hsv2bgr\n   imconvert\n   rgb2bgr\n   rgb2gray\n   rgb2ycbcr\n   ycbcr2bgr\n   ycbcr2rgb\n\nGeometric\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   cutout\n   imcrop\n   imflip\n   impad\n   impad_to_multiple\n   imrescale\n   imresize\n   imresize_like\n   imresize_to_multiple\n   imrotate\n   imshear\n   imtranslate\n   rescale_size\n\nPhotometric\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   adjust_brightness\n   adjust_color\n   adjust_contrast\n   adjust_hue\n   adjust_lighting\n   adjust_sharpness\n   auto_contrast\n   clahe\n   imdenormalize\n   imequalize\n   iminvert\n   imnormalize\n   lut_transform\n   posterize\n   solarize\n\nMiscellaneous\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   tensor2imgs\n"
  },
  {
    "path": "docs/zh_cn/api/ops.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.ops\n===================================\n\n.. contents:: mmcv.ops\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.ops\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   BorderAlign\n   CARAFE\n   CARAFENaive\n   CARAFEPack\n   Conv2d\n   ConvTranspose2d\n   CornerPool\n   Correlation\n   CrissCrossAttention\n   DeformConv2d\n   DeformConv2dPack\n   DeformRoIPool\n   DeformRoIPoolPack\n   DynamicScatter\n   FusedBiasLeakyReLU\n   GroupAll\n   Linear\n   MaskedConv2d\n   MaxPool2d\n   ModulatedDeformConv2d\n   ModulatedDeformConv2dPack\n   ModulatedDeformRoIPoolPack\n   MultiScaleDeformableAttention\n   PSAMask\n   PointsSampler\n   PrRoIPool\n   QueryAndGroup\n   RiRoIAlignRotated\n   RoIAlign\n   RoIAlignRotated\n   RoIAwarePool3d\n   RoIPointPool3d\n   RoIPool\n   SAConv2d\n   SigmoidFocalLoss\n   SimpleRoIAlign\n   SoftmaxFocalLoss\n   SparseConv2d\n   SparseConv3d\n   SparseConvTensor\n   SparseConvTranspose2d\n   SparseConvTranspose3d\n   SparseInverseConv2d\n   SparseInverseConv3d\n   SparseMaxPool2d\n   SparseMaxPool3d\n   SparseModule\n   SparseSequential\n   SubMConv2d\n   SubMConv3d\n   SyncBatchNorm\n   TINShift\n   Voxelization\n\n.. 
autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   active_rotated_filter\n   assign_score_withk\n   ball_query\n   batched_nms\n   bbox_overlaps\n   border_align\n   box_iou_rotated\n   boxes_iou3d\n   boxes_iou_bev\n   boxes_overlap_bev\n   carafe\n   carafe_naive\n   chamfer_distance\n   contour_expand\n   convex_giou\n   convex_iou\n   deform_conv2d\n   deform_roi_pool\n   diff_iou_rotated_2d\n   diff_iou_rotated_3d\n   dynamic_scatter\n   furthest_point_sample\n   furthest_point_sample_with_dist\n   fused_bias_leakyrelu\n   gather_points\n   grouping_operation\n   knn\n   masked_conv2d\n   min_area_polygons\n   modulated_deform_conv2d\n   nms\n   nms3d\n   nms3d_normal\n   nms_bev\n   nms_match\n   nms_normal_bev\n   nms_rotated\n   pixel_group\n   point_sample\n   points_in_boxes_all\n   points_in_boxes_cpu\n   points_in_boxes_part\n   points_in_polygons\n   prroi_pool\n   rel_roi_point_to_rel_img_point\n   riroi_align_rotated\n   roi_align\n   roi_align_rotated\n   roi_pool\n   rotated_feature_align\n   scatter_nd\n   sigmoid_focal_loss\n   soft_nms\n   softmax_focal_loss\n   three_interpolate\n   three_nn\n   tin_shift\n   upfirdn2d\n   voxelization\n"
  },
  {
    "path": "docs/zh_cn/api/transforms.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.transforms\n===================================\n\n.. currentmodule:: mmcv.transforms\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   BaseTransform\n   TestTimeAug\n\nLoading\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   LoadAnnotations\n   LoadImageFromFile\n\nProcessing\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   CenterCrop\n   MultiScaleFlipAug\n   Normalize\n   Pad\n   RandomChoiceResize\n   RandomFlip\n   RandomGrayscale\n   RandomResize\n   Resize\n   ToTensor\n   ImageToTensor\n\nWrapper\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   Compose\n   KeyMapper\n   RandomApply\n   RandomChoice\n   TransformBroadcaster\n"
  },
  {
    "path": "docs/zh_cn/api/utils.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.utils\n===================================\n\n.. contents:: mmcv.utils\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.utils\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   IS_CUDA_AVAILABLE\n   IS_MLU_AVAILABLE\n   IS_MPS_AVAILABLE\n   collect_env\n   jit\n   skip_no_elena\n"
  },
  {
    "path": "docs/zh_cn/api/video.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.video\n===================================\n\n.. contents:: mmcv.video\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.video\n\nIO\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   VideoReader\n   Cache\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   frames2video\n\nOptical Flow\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   dequantize_flow\n   flow_from_bytes\n   flow_warp\n   flowread\n   flowwrite\n   quantize_flow\n   sparse_flow_from_bytes\n\nVideo Processing\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   concat_video\n   convert_video\n   cut_video\n   resize_video\n"
  },
  {
    "path": "docs/zh_cn/api/visualization.rst",
    "content": ".. role:: hidden\n    :class: hidden-section\n\nmmcv.visualization\n===================================\n\n.. contents:: mmcv.visualization\n   :depth: 2\n   :local:\n   :backlinks: top\n\n.. currentmodule:: mmcv.visualization\n\nColor\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n   :template: classtemplate.rst\n\n   Color\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   color_val\n\nImage\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   imshow\n   imshow_bboxes\n   imshow_det_bboxes\n\nOptical Flow\n----------------\n\n.. autosummary::\n   :toctree: generated\n   :nosignatures:\n\n   flow2rgb\n   flowshow\n   make_color_wheel\n"
  },
  {
    "path": "docs/zh_cn/community/code_style.md",
    "content": "## 代码规范\n\n### 代码规范标准\n\n#### PEP 8 —— Python 官方代码规范\n\n[Python 官方的代码风格指南](https://www.python.org/dev/peps/pep-0008/)，包含了以下几个方面的内容：\n\n- 代码布局，介绍了 Python 中空行、断行以及导入相关的代码风格规范。比如一个常见的问题：当我的代码较长，无法在一行写下时，何处可以断行？\n\n- 表达式，介绍了 Python 中表达式空格相关的一些风格规范。\n\n- 尾随逗号相关的规范。当列表较长，无法一行写下而写成如下逐行列表时，推荐在末项后加逗号，从而便于追加选项、版本控制等。\n\n  ```python\n  # Correct:\n  FILES = ['setup.cfg', 'tox.ini']\n  # Correct:\n  FILES = [\n      'setup.cfg',\n      'tox.ini',\n  ]\n  # Wrong:\n  FILES = ['setup.cfg', 'tox.ini',]\n  # Wrong:\n  FILES = [\n      'setup.cfg',\n      'tox.ini'\n  ]\n  ```\n\n- 命名相关规范、注释相关规范、类型注解相关规范，我们将在后续章节中做详细介绍。\n\n  \"A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is the most important.\" PEP 8 -- Style Guide for Python Code\n\n:::{note}\nPEP 8 的代码规范并不是绝对的，项目内的一致性要优先于 PEP 8 的规范。OpenMMLab 各个项目都在 setup.cfg 设定了一些代码规范的设置，请遵照这些设置。一个例子是在 PEP 8 中有如下一个例子：\n\n```python\n# Correct:\nhypot2 = x*x + y*y\n# Wrong:\nhypot2 = x * x + y * y\n```\n\n这一规范是为了指示不同优先级，但 OpenMMLab 的设置中通常没有启用 yapf 的 `ARITHMETIC_PRECEDENCE_INDICATION` 选项，因而格式规范工具不会按照推荐样式格式化，以设置为准。\n:::\n\n#### Google 开源项目风格指南\n\n[Google 使用的编程风格指南](https://google.github.io/styleguide/pyguide.html)，包括了 Python 相关的章节。相较于 PEP 8，该指南提供了更为详尽的代码指南。该指南包括了语言规范和风格规范两个部分。\n\n其中，语言规范对 Python 中很多语言特性进行了优缺点的分析，并给出了使用指导意见，如异常、Lambda 表达式、列表推导式、metaclass 等。\n\n风格规范的内容与 PEP 8 较为接近，大部分约定建立在 PEP 8 的基础上，也有一些更为详细的约定，如函数长度、TODO 注释、文件与 socket 对象的访问等。\n\n推荐将该指南作为参考进行开发，但不必严格遵照，一来该指南存在一些 Python 2 兼容需求，例如指南中要求所有无基类的类应当显式地继承 Object, 而在仅使用 Python 3 的环境中，这一要求是不必要的，依本项目中的惯例即可。二来 OpenMMLab 的项目作为框架级的开源软件，不必对一些高级技巧过于避讳，尤其是 MMCV。但尝试使用这些技巧前应当认真考虑是否真的有必要，并寻求其他开发人员的广泛评估。\n\n另外需要注意的一处规范是关于包的导入，在该指南中，要求导入本地包时必须使用路径全称，且导入的每一个模块都应当单独成行，通常这是不必要的，而且也不符合目前项目的开发惯例，此处进行如下约定：\n\n```python\n# Correct\nfrom mmcv.cnn.bricks import (Conv2d, build_norm_layer, DropPath, MaxPool2d,\n                             Linear)\nfrom ..utils import 
ext_loader\n\n# Wrong\nfrom mmcv.cnn.bricks import Conv2d, build_norm_layer, DropPath, MaxPool2d, \\\n                            Linear  # 使用括号进行连接，而不是反斜杠\nfrom ...utils import is_str  # 最多向上回溯一层，过多的回溯容易导致结构混乱\n```\n\nOpenMMLab 项目使用 pre-commit 工具自动格式化代码，详情见[贡献代码](./contributing.md#代码风格)。\n\n### 命名规范\n\n#### 命名规范的重要性\n\n优秀的命名是良好代码可读的基础。基础的命名规范对各类变量的命名做了要求，使读者可以方便地根据代码名了解变量是一个类 / 局部变量 / 全局变量等。而优秀的命名则需要代码作者对于变量的功能有清晰的认识，以及良好的表达能力，从而使读者根据名称就能了解其含义，甚至帮助了解该段代码的功能。\n\n#### 基础命名规范\n\n| 类型            | 公有             | 私有               |\n| --------------- | ---------------- | ------------------ |\n| 模块            | lower_with_under | \\_lower_with_under |\n| 包              | lower_with_under |                    |\n| 类              | CapWords         | \\_CapWords         |\n| 异常            | CapWordsError    |                    |\n| 函数（方法）    | lower_with_under | \\_lower_with_under |\n| 函数 / 方法参数 | lower_with_under |                    |\n| 全局 / 类内常量 | CAPS_WITH_UNDER  | \\_CAPS_WITH_UNDER  |\n| 全局 / 类内变量 | lower_with_under | \\_lower_with_under |\n| 变量            | lower_with_under | \\_lower_with_under |\n| 局部变量        | lower_with_under |                    |\n\n注意：\n\n- 尽量避免变量名与保留字冲突，特殊情况下如不可避免，可使用一个后置下划线，如 class\\_\n- 尽量不要使用过于简单的命名，除了约定俗成的循环变量 i，文件变量 f，错误变量 e 等。\n- 不会被用到的变量可以命名为 \\_，逻辑检查器会将其忽略。\n\n#### 命名技巧\n\n良好的变量命名需要保证三点：\n\n1. 含义准确，没有歧义\n2. 长短适中\n3. 
前后统一\n\n```python\n# Wrong\nclass Masks(metaclass=ABCMeta):  # 命名无法表现基类；Instance or Semantic？\n    pass\n\n# Correct\nclass BaseInstanceMasks(metaclass=ABCMeta):\n    pass\n\n# Wrong，不同地方含义相同的变量尽量用统一的命名\ndef __init__(self, inplanes, planes):\n    pass\n\ndef __init__(self, in_channels, out_channels):\n    pass\n```\n\n常见的函数命名方法：\n\n- 动宾命名法：crop_img, init_weights\n- 动宾倒置命名法：imread, bbox_flip\n\n注意函数命名与参数的顺序，保证主语在前，符合语言习惯：\n\n- check_keys_exist(key, container)\n- check_keys_contain(container, key)\n\n注意避免非常规或统一约定的缩写，如 nb -> num_blocks，in_nc -> in_channels\n\n### docstring 规范\n\n#### 为什么要写 docstring\n\ndocstring 是对一个类、一个函数功能与 API 接口的详细描述，有两个功能，一是帮助其他开发者了解代码功能，方便 debug 和复用代码；二是在 Readthedocs 文档中自动生成相关的 API reference 文档，帮助不了解源代码的社区用户使用相关功能。\n\n#### 如何写 docstring\n\n与注释不同，一份规范的 docstring 有着严格的格式要求，以便于 Python 解释器以及 sphinx 进行文档解析，详细的 docstring 约定参见 [PEP 257](https://www.python.org/dev/peps/pep-0257/)。此处以例子的形式介绍各种文档的标准格式，参考格式为 [Google 风格](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)。\n\n1. 模块文档\n\n   代码风格规范推荐为每一个模块（即 Python 文件）编写一个 docstring，但目前 OpenMMLab 项目大部分没有此类 docstring，因此不做硬性要求。\n\n   ```python\n   \"\"\"A one line summary of the module or program, terminated by a period.\n\n   Leave one blank line. The rest of this docstring should contain an\n   overall description of the module or program. Optionally, it may also\n   contain a brief description of exported classes and functions and/or usage\n   examples.\n\n   Typical usage example:\n\n   foo = ClassFoo()\n   bar = foo.FunctionBar()\n   \"\"\"\n   ```\n\n2. 
类文档\n\n   类文档是我们最常需要编写的，此处，按照 OpenMMLab 的惯例，我们使用了与 Google 风格不同的写法。如下例所示，文档中没有使用 Attributes 描述类属性，而是使用 Args 描述 `__init__` 函数的参数。\n\n   在 Args 中，遵照 `parameter (type): Description.` 的格式，描述每一个参数类型和功能。其中，多种类型可使用 `(float or str)` 的写法，可以为 None 的参数可以写为 `(int, optional)`。\n\n   ```python\n   class BaseRunner(metaclass=ABCMeta):\n       \"\"\"The base class of Runner, a training helper for PyTorch.\n\n       All subclasses should implement the following APIs:\n\n       - ``run()``\n       - ``train()``\n       - ``val()``\n       - ``save_checkpoint()``\n\n       Args:\n           model (:obj:`torch.nn.Module`): The model to be run.\n           batch_processor (callable, optional): A callable method that processes\n               a data batch. The interface of this method should be\n               ``batch_processor(model, data, train_mode) -> dict``.\n               Defaults to None.\n           optimizer (dict or :obj:`torch.optim.Optimizer`, optional): It can be\n               either an optimizer (in most cases) or a dict of optimizers\n               (in models that require more than one optimizer, e.g., GAN).\n               Defaults to None.\n           work_dir (str, optional): The working directory to save checkpoints\n               and logs. Defaults to None.\n           logger (:obj:`logging.Logger`): Logger used during training.\n                Defaults to None. (The default value is just for backward\n                compatibility)\n           meta (dict, optional): A dict records some important information such as\n               environment info and seed, which will be logged in logger hook.\n               Defaults to None.\n           max_epochs (int, optional): Total training epochs. Defaults to None.\n           max_iters (int, optional): Total training iterations. 
Defaults to None.\n       \"\"\"\n\n       def __init__(self,\n                    model,\n                    batch_processor=None,\n                    optimizer=None,\n                    work_dir=None,\n                    logger=None,\n                    meta=None,\n                    max_iters=None,\n                    max_epochs=None):\n           ...\n   ```\n\n   另外，在一些算法实现的主体类中，建议加入原论文的链接；如果参考了其他开源代码的实现，则应加入 modified from，而如果是直接复制了其他代码库的实现，则应加入 copied from ，并注意源码的 License。如有必要，也可以通过 .. math:: 来加入数学公式\n\n   ```python\n   # 参考实现\n   # This func is modified from `detectron2\n   # <https://github.com/facebookresearch/detectron2/blob/ffff8acc35ea88ad1cb1806ab0f00b4c1c5dbfd9/detectron2/structures/masks.py#L387>`_.\n\n   # 复制代码\n   # This code was copied from the `ubelt\n   # library<https://github.com/Erotemic/ubelt>`_.\n\n   # 引用论文 & 添加公式\n   class LabelSmoothLoss(nn.Module):\n       r\"\"\"Initializer for the label smoothed cross entropy loss.\n\n       Refers to `Rethinking the Inception Architecture for Computer Vision\n       <https://arxiv.org/abs/1512.00567>`_.\n\n       This decreases gap between output scores and encourages generalization.\n       Labels provided to forward can be one-hot like vectors (NxC) or class\n       indices (Nx1).\n       And this accepts linear combination of one-hot like labels from mixup or\n       cutmix except multi-label task.\n\n       Args:\n           label_smooth_val (float): The degree of label smoothing.\n           num_classes (int, optional): Number of classes. Defaults to None.\n           mode (str): Refers to notes, Options are \"original\", \"classy_vision\",\n               \"multi_label\". Defaults to \"classy_vision\".\n           reduction (str): The method used to reduce the loss.\n               Options are \"none\", \"mean\" and \"sum\". Defaults to 'mean'.\n           loss_weight (float):  Weight of the loss. 
Defaults to 1.0.\n\n       Note:\n           if the ``mode`` is \"original\", this will use the same label smooth\n           method as the original paper as:\n\n           .. math::\n               (1-\\epsilon)\\delta_{k, y} + \\frac{\\epsilon}{K}\n\n           where :math:`\\epsilon` is the ``label_smooth_val``, :math:`K` is\n           the ``num_classes`` and :math:`\\delta_{k,y}` is Dirac delta,\n           which equals 1 for k=y and 0 otherwise.\n\n           if the ``mode`` is \"classy_vision\", this will use the same label\n           smooth method as the `facebookresearch/ClassyVision\n           <https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/losses/label_smoothing_loss.py>`_ repo as:\n\n           .. math::\n               \\frac{\\delta_{k, y} + \\epsilon/K}{1+\\epsilon}\n\n           if the ``mode`` is \"multi_label\", this will accept labels from\n           multi-label task and smoothing them as:\n\n           .. math::\n               (1-2\\epsilon)\\delta_{k, y} + \\epsilon\n   ```\n\n```{note}\n注意 \\`\\`here\\`\\`、\\`here\\`、\"here\" 三种引号功能是不同。\n\n在 reStructured 语法中，\\`\\`here\\`\\` 表示一段代码；\\`here\\` 表示斜体；\"here\" 无特殊含义，一般可用来表示字符串。其中 \\`here\\` 的用法与 Markdown 中不同，需要多加留意。\n另外还有 :obj:\\`type\\` 这种更规范的表示类的写法，但鉴于长度，不做特别要求，一般仅用于表示非常用类型。\n```\n\n3. 方法（函数）文档\n\n   函数文档与类文档的结构基本一致，但需要加入返回值文档。对于较为复杂的函数和类，可以使用 Examples 字段加入示例；如果需要对参数加入一些较长的备注，可以加入 Note 字段进行说明。\n\n   对于使用较为复杂的类或函数，比起看大段大段的说明文字和参数文档，添加合适的示例更能帮助用户迅速了解其用法。需要注意的是，这些示例最好是能够直接在 Python 交互式环境中运行的，并给出一些相对应的结果。如果存在多个示例，可以使用注释简单说明每段示例，也能起到分隔作用。\n\n   ```python\n   def import_modules_from_strings(imports, allow_failed_imports=False):\n       \"\"\"Import modules from the given list of strings.\n\n       Args:\n           imports (list | str | None): The given module names to be imported.\n           allow_failed_imports (bool): If True, the failed imports will return\n               None. Otherwise, an ImportError is raise. 
Defaults to False.\n\n       Returns:\n           List[module] | module | None: The imported modules.\n           All these three lines in docstring will be compiled into the same\n           line in readthedocs.\n\n       Examples:\n           >>> osp, sys = import_modules_from_strings(\n           ...     ['os.path', 'sys'])\n           >>> import os.path as osp_\n           >>> import sys as sys_\n           >>> assert osp == osp_\n           >>> assert sys == sys_\n       \"\"\"\n       ...\n   ```\n\n   如果函数接口在某个版本发生了变化，需要在 docstring 中加入相关的说明，必要时添加 Note 或者 Warning 进行说明，例如：\n\n   ```python\n   class CheckpointHook(Hook):\n       \"\"\"Save checkpoints periodically.\n\n       Args:\n           out_dir (str, optional): The root directory to save checkpoints. If\n               not specified, ``runner.work_dir`` will be used by default. If\n               specified, the ``out_dir`` will be the concatenation of\n               ``out_dir`` and the last level directory of ``runner.work_dir``.\n               Defaults to None. `Changed in version 1.3.15.`\n           file_client_args (dict, optional): Arguments to instantiate a\n               FileClient. See :class:`mmcv.fileio.FileClient` for details.\n               Defaults to None. `New in version 1.3.15.`\n\n       Warning:\n           Before v1.3.15, the ``out_dir`` argument indicates the path where the\n           checkpoint is stored. However, in v1.3.15 and later, ``out_dir``\n           indicates the root directory and the final path to save checkpoint is\n           the concatenation of out_dir and the last level directory of\n           ``runner.work_dir``. 
Suppose the value of ``out_dir`` is\n           \"/path/of/A\" and the value of ``runner.work_dir`` is \"/path/of/B\",\n           then the final path will be \"/path/of/A/B\".\n   ```\n\n   如果参数或返回值里带有需要展开描述字段的 dict，则应该采用如下格式：\n\n   ```python\n   def func(x):\n       r\"\"\"\n       Args:\n           x (None): A dict with 2 keys, ``padded_targets``, and ``targets``.\n\n               - ``targets`` (list[Tensor]): A list of tensors.\n                 Each tensor has the shape of :math:`(T_i)`. Each\n                 element is the index of a character.\n               - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`.\n                 Each item is the length of a word.\n\n       Returns:\n           dict: A dict with 2 keys, ``padded_targets``, and ``targets``.\n\n           - ``targets`` (list[Tensor]): A list of tensors.\n             Each tensor has the shape of :math:`(T_i)`. Each\n             element is the index of a character.\n           - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`.\n             Each item is the length of a word.\n       \"\"\"\n       return x\n   ```\n\n```{important}\n为了生成 readthedocs 文档，文档的编写需要按照 ReStructrued 文档格式，否则会产生文档渲染错误，在提交 PR 前，最好生成并预览一下文档效果。\n语法规范参考：\n\n- [reStructuredText Primer - Sphinx documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#)\n- [Example Google Style Python Docstrings ‒ napoleon 0.7 documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google)\n```\n\n### 注释规范\n\n#### 为什么要写注释\n\n对于一个开源项目，团队合作以及社区之间的合作是必不可少的，因而尤其要重视合理的注释。不写注释的代码，很有可能过几个月自己也难以理解，造成额外的阅读和修改成本。\n\n#### 如何写注释\n\n最需要写注释的是代码中那些技巧性的部分。如果你在下次代码审查的时候必须解释一下，那么你应该现在就给它写注释。对于复杂的操作，应该在其操作开始前写上若干行注释。对于不是一目了然的代码，应在其行尾添加注释。\n—— Google 开源项目风格指南\n\n```python\n# We use a weighted dictionary search to find out where i is in\n# the array. 
We extrapolate position based on the largest num\n# in the array and the array size and then do binary search to\n# get the exact number.\nif i & (i-1) == 0:  # True if i is 0 or a power of 2.\n```\n\n为了提高可读性, 注释应该至少离开代码2个空格.\n另一方面, 绝不要描述代码. 假设阅读代码的人比你更懂Python, 他只是不知道你的代码要做什么.\n—— Google 开源项目风格指南\n\n```python\n# Wrong:\n# Now go through the b array and make sure whenever i occurs\n# the next element is i+1\n\n# Wrong:\nif i & (i-1) == 0:  # True if i bitwise and i-1 is 0.\n```\n\n在注释中，可以使用 Markdown 语法，因为开发人员通常熟悉 Markdown 语法，这样可以便于交流理解，如可使用单反引号表示代码和变量（注意不要和 docstring 中的 ReStructured 语法混淆）\n\n```python\n# `_reversed_padding_repeated_twice` is the padding to be passed to\n# `F.pad` if needed (e.g., for non-zero padding types that are\n# implemented as two ops: padding + conv). `F.pad` accepts paddings in\n# reverse order than the dimension.\nself._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2)\n```\n\n#### 注释示例\n\n1. 出自 `mmcv/utils/registry.py`，对于较为复杂的逻辑结构，通过注释，明确了优先级关系。\n\n   ```python\n   # self.build_func will be set with the following priority:\n   # 1. build_func\n   # 2. parent.build_func\n   # 3. build_from_cfg\n   if build_func is None:\n       if parent is not None:\n           self.build_func = parent.build_func\n       else:\n           self.build_func = build_from_cfg\n   else:\n       self.build_func = build_func\n   ```\n\n2. 出自 `mmcv/runner/checkpoint.py`，对于 bug 修复中的一些特殊处理，可以附带相关的 issue 链接，帮助其他人了解 bug 背景。\n\n   ```python\n   def _save_ckpt(checkpoint, file):\n       # The 1.6 release of PyTorch switched torch.save to use a new\n       # zipfile-based file format. It will cause RuntimeError when a\n       # checkpoint was saved in high version (PyTorch version>=1.6.0) but\n       # loaded in low version (PyTorch version<1.6.0). 
More details at\n       # https://github.com/open-mmlab/mmpose/issues/904\n       if digit_version(TORCH_VERSION) >= digit_version('1.6.0'):\n           torch.save(checkpoint, file, _use_new_zipfile_serialization=False)\n       else:\n           torch.save(checkpoint, file)\n   ```\n\n### 类型注解\n\n#### 为什么要写类型注解\n\n类型注解是对函数中变量的类型做限定或提示，为代码的安全性提供保障、增强代码的可读性、避免出现类型相关的错误。\nPython 没有对类型做强制限制，类型注解只起到一个提示作用，通常你的 IDE 会解析这些类型注解，然后在你调用相关代码时对类型做提示。另外也有类型注解检查工具，这些工具会根据类型注解，对代码中可能出现的问题进行检查，减少 bug 的出现。\n需要注意的是，通常我们不需要注释模块中的所有函数：\n\n1. 公共的 API 需要注释\n2. 在代码的安全性，清晰性和灵活性上进行权衡是否注释\n3. 对于容易出现类型相关的错误的代码进行注释\n4. 难以理解的代码请进行注释\n5. 若代码中的类型已经稳定，可以进行注释. 对于一份成熟的代码，多数情况下，即使注释了所有的函数，也不会丧失太多的灵活性.\n\n#### 如何写类型注解\n\n1. 函数 / 方法类型注解，通常不对 self 和 cls 注释。\n\n   ```python\n   from typing import Optional, List, Tuple\n\n   # 全部位于一行\n   def my_method(self, first_var: int) -> int:\n       pass\n\n   # 另起一行\n   def my_method(\n           self, first_var: int,\n           second_var: float) -> Tuple[MyLongType1, MyLongType1, MyLongType1]:\n       pass\n\n   # 单独成行（具体的应用场合与行宽有关，建议结合 yapf 自动化格式使用）\n   def my_method(\n       self, first_var: int, second_var: float\n   ) -> Tuple[MyLongType1, MyLongType1, MyLongType1]:\n       pass\n\n   # 引用尚未被定义的类型\n   class MyClass:\n       def __init__(self,\n                    stack: List[\"MyClass\"]) -> None:\n           pass\n   ```\n\n   注：类型注解中的类型可以是 Python 内置类型，也可以是自定义类，还可以使用 Python 提供的 wrapper 类对类型注解进行装饰，一些常见的注解如下：\n\n   ```python\n   # 数值类型\n   from numbers import Number\n\n   # 可选类型，指参数可以为 None\n   from typing import Optional\n   def foo(var: Optional[int] = None):\n       pass\n\n   # 联合类型，指同时接受多种类型\n   from typing import Union\n   def foo(var: Union[float, str]):\n       pass\n\n   from typing import Sequence  # 序列类型\n   from typing import Iterable  # 可迭代类型\n   from typing import Any  # 任意类型\n   from typing import Callable  # 可调用类型\n\n   from typing import List, Dict  # 列表和字典的泛型类型\n   from typing import Tuple  # 元组的特殊格式\n   # 虽然在 Python 3.9 中，list, tuple 和 dict 
本身已支持泛型，但为了支持之前的版本\n   # 我们在进行类型注解时还是需要使用 List, Tuple, Dict 类型\n   # 另外，在对参数类型进行注解时，尽量使用 Sequence & Iterable & Mapping\n   # List, Tuple, Dict 主要用于返回值类型注解\n   # 参见 https://docs.python.org/3/library/typing.html#typing.List\n   ```\n\n2. 变量类型注解，一般用于难以直接推断其类型时\n\n   ```python\n   # Recommend: 带类型注解的赋值\n   a: Foo = SomeUndecoratedFunction()\n   a: List[int] = [1, 2, 3]        # List 只支持单一类型泛型，可使用 Union\n   b: Tuple[int, int] = (1, 2)     # 长度固定为 2\n   c: Tuple[int, ...] = (1, 2, 3)  # 变长\n   d: Dict[str, int] = {'a': 1, 'b': 2}\n\n   # Not Recommend：行尾类型注释\n   # 虽然这种方式被写在了 Google 开源指南中，但这是一种为了支持 Python 2.7 版本\n   # 而补充的注释方式，鉴于我们只支持 Python 3, 为了风格统一，不推荐使用这种方式。\n   a = SomeUndecoratedFunction()  # type: Foo\n   a = [1, 2, 3]  # type: List[int]\n   b = (1, 2, 3)  # type: Tuple[int, ...]\n   c = (1, \"2\", 3.5)  # type: Tuple[int, Text, float]\n   ```\n\n3. 泛型\n\n   上文中我们知道，typing 中提供了 list 和 dict 的泛型类型，那么我们自己是否可以定义类似的泛型呢？\n\n   ```python\n   from typing import Dict, Generic, TypeVar\n\n   KT = TypeVar('KT')\n   VT = TypeVar('VT')\n\n   class Mapping(Generic[KT, VT]):\n       def __init__(self, data: Dict[KT, VT]):\n           self._data = data\n\n       def __getitem__(self, key: KT) -> VT:\n           return self._data[key]\n   ```\n\n   使用上述方法，我们定义了一个拥有泛型能力的映射类，实际用法如下：\n\n   ```python\n   mapping = Mapping[str, float]({'a': 0.5})\n   value: float = mapping['a']\n   ```\n\n   另外，我们也可以利用 TypeVar 在函数签名中指定联动的多个类型：\n\n   ```python\n   from typing import TypeVar, List\n\n   T = TypeVar('T')  # Can be anything\n   A = TypeVar('A', str, bytes)  # Must be str or bytes\n\n\n   def repeat(x: T, n: int) -> List[T]:\n       \"\"\"Return a list containing n references to x.\"\"\"\n       return [x]*n\n\n\n   def longest(x: A, y: A) -> A:\n       \"\"\"Return the longest of two strings.\"\"\"\n       return x if len(x) >= len(y) else y\n   ```\n\n更多关于类型注解的写法请参考 [typing](https://docs.python.org/3/library/typing.html)。\n\n#### 类型注解检查工具\n\n[mypy](https://mypy.readthedocs.io/en/stable/) 是一个 
Python 静态类型检查工具。根据你的类型注解，mypy 会检查传参、赋值等操作是否符合类型注解，从而避免可能出现的 bug。\n\n例如如下的一个 Python 脚本文件 test.py:\n\n```python\ndef foo(var: int) -> float:\n    return float(var)\n\na: str = foo('2.0')\nb: int = foo('3.0')  # type: ignore\n```\n\n运行 mypy test.py 可以得到如下检查结果，分别指出了第 4 行在函数调用和返回值赋值两处类型错误。而第 5 行同样存在两个类型错误，由于使用了 type: ignore 而被忽略了，只有部分特殊情况可能需要此类忽略。\n\n```\ntest.py:4: error: Incompatible types in assignment (expression has type \"float\", variable has type \"str\")\ntest.py:4: error: Argument 1 to \"foo\" has incompatible type \"str\"; expected \"int\"\nFound 2 errors in 1 file (checked 1 source file)\n```\n"
  },
  {
    "path": "docs/zh_cn/community/contributing.md",
    "content": "## 贡献代码\n\n欢迎加入 MMCV 社区，我们致力于打造最前沿的计算机视觉基础库，我们欢迎任何类型的贡献，包括但不限于\n\n**修复错误**\n\n修复代码实现错误的步骤如下：\n\n1. 如果提交的代码改动较大，建议先提交 issue，并正确描述 issue 的现象、原因和复现方式，讨论后确认修复方案。\n2. 修复错误并补充相应的单元测试，提交拉取请求。\n\n**新增功能或组件**\n\n1. 如果新功能或模块涉及较大的代码改动，建议先提交 issue，确认功能的必要性。\n2. 实现新增功能并添单元测试，提交拉取请求。\n\n**文档补充**\n\n修复文档可以直接提交拉取请求\n\n添加文档或将文档翻译成其他语言步骤如下\n\n1. 提交 issue，确认添加文档的必要性。\n2. 添加文档，提交拉取请求。\n\n### 拉取请求工作流\n\n如果你对拉取请求不了解，没关系，接下来的内容将会从零开始，一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式，可以参考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)\n\n#### 1. 复刻仓库\n\n当你第一次提交拉取请求时，先复刻 OpenMMLab 原代码库，点击 GitHub 页面右上角的 **Fork** 按钮，复刻后的代码库将会出现在你的 GitHub 个人主页下。\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167305749-43c7f4e9-449b-4e98-ade5-0c9276d5c9ce.png\" width=\"1200\">\n\n将代码克隆到本地\n\n```shell\ngit clone git@github.com:{username}/mmcv.git\n```\n\n添加原代码库为上游代码库\n\n```bash\ngit remote add upstream git@github.com:open-mmlab/mmcv\n```\n\n检查 remote 是否添加成功，在终端输入 `git remote -v`\n\n```bash\norigin\tgit@github.com:{username}/mmcv.git (fetch)\norigin\tgit@github.com:{username}/mmcv.git (push)\nupstream\tgit@github.com:open-mmlab/mmcv (fetch)\nupstream\tgit@github.com:open-mmlab/mmcv (push)\n```\n\n```{note}\n这里对 origin 和 upstream 进行一个简单的介绍，当我们使用 git clone 来克隆代码时，会默认创建一个 origin 的 remote，它指向我们克隆的代码库地址，而 upstream 则是我们自己添加的，用来指向原始代码库地址。当然如果你不喜欢他叫 upstream，也可以自己修改，比如叫 open-mmlab。我们通常向 origin 提交代码（即 fork 下来的远程仓库），然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突，再从 upstream 拉取最新的代码，和本地分支解决冲突，再提交到 origin。\n```\n\n#### 2. 
配置 pre-commit\n\n在本地开发环境中，我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格，以确保代码风格的统一。在提交代码，需要先安装 pre-commit（需要在 MMCV 目录下执行）:\n\n```shell\npip install -U pre-commit\npre-commit install\n```\n\n检查 pre-commit 是否配置成功，并安装 `.pre-commit-config.yaml` 中的钩子：\n\n```shell\npre-commit run --all-files\n```\n\n<img src=\"https://user-images.githubusercontent.com/57566630/173660750-3df20a63-cb66-4d33-a986-1f643f1d8aaf.png\" width=\"1200\">\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png\" width=\"1200\">\n\n```{note}\n如果你是中国用户，由于网络原因，可能会出现安装失败的情况，这时可以使用国内源\n\npre-commit install -c .pre-commit-config-zh-cn.yaml\n\npre-commit run --all-files -c .pre-commit-config-zh-cn.yaml\n```\n\n如果安装过程被中断，可以重复执行 `pre-commit run ...` 继续安装。\n\n如果提交的代码不符合代码风格规范，pre-commit 会发出警告，并自动修复部分错误。\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png\" width=\"1200\">\n\n如果我们想临时绕开 pre-commit 的检查提交一次代码，可以在 `git commit` 时加上 `--no-verify`（需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查）。\n\n```shell\ngit commit -m \"xxx\" --no-verify\n```\n\n#### 3. 创建开发分支\n\n安装完 pre-commit 之后，我们需要基于 main 创建开发分支，建议的分支命名规则为 `username/pr_name`。\n\n```shell\ngit checkout -b yhc/refactor_contributing_doc\n```\n\n在后续的开发中，如果本地仓库的 main 分支落后于 upstream 的 main 分支，我们需要先拉取 upstream 的代码进行同步，再执行上面的命令\n\n```shell\ngit pull upstream main\n```\n\n#### 4. 提交代码并在本地通过单元测试\n\n- MMCV 引入了 mypy 来做静态类型检查，以增加代码的鲁棒性。因此我们在提交代码时，需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。\n\n- 提交的代码同样需要通过单元测试\n\n  ```shell\n  # 通过全量单元测试\n  pytest tests\n\n  # 我们需要保证提交的代码能够通过修改模块的单元测试，以 runner 为例\n  pytest tests/test_runner/test_runner.py\n  ```\n\n  如果你由于缺少依赖无法运行修改模块的单元测试，可以参考[指引-单元测试](#单元测试)\n\n- 如果修改/添加了文档，参考[指引](#文档渲染)确认文档渲染正常。\n\n#### 5. 
推送代码到远程\n\n代码通过单元测试和 pre-commit 检查后，将代码推送到远程仓库，如果是第一次推送，可以在 `git push` 后加上 `-u` 参数以关联远程分支\n\n```shell\ngit push -u origin {branch_name}\n```\n\n这样下次就可以直接使用 `git push` 命令推送代码了，而无需指定分支和远程仓库。\n\n#### 6. 提交拉取请求（PR）\n\n(1) 在 GitHub 的 Pull request 界面创建拉取请求\n<img src=\"https://user-images.githubusercontent.com/57566630/201533288-516f7ac4-0b14-4dc8-afbd-912475c368b5.png\" width=\"1200\">\n\n(2) 根据指引修改 PR 描述，以便于其他开发者更好地理解你的修改\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202242953-c91a18ff-e388-4ff9-8591-5fae0ead6c1e.png\" width=\"1200\">\n\n描述规范详见[拉取请求规范](#拉取请求规范)\n\n&#160;\n\n**注意事项**\n\n(a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响，并关联相关 Issue（具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)）\n\n(b) 如果是第一次为 OpenMMLab 做贡献，需要签署 CLA\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png\" width=\"1200\">\n\n(c) 检查提交的 PR 是否通过 CI（集成测试）\n\n<img src=\"https://user-images.githubusercontent.com/57566630/167307490-f9ebf9fa-63c0-4d83-8ba1-081ea169eb3a.png\" width=\"1200\">\n\nMMCV 会在不同的平台（Linux、Windows、Mac），基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试，以保证代码的正确性，如果有任何一个没有通过，我们可点击上图中的 `Details` 来查看具体的测试信息，以便于我们修改代码。\n\n(3) 如果 PR 通过了 CI，那么就可以等待其他开发者的 review，并根据 reviewer 的意见，修改代码，并重复 [4](#4-提交代码并在本地通过单元测试)-[5](#5-推送代码到远程) 步骤，直到 reviewer 同意合入 PR。\n\n<img src=\"https://user-images.githubusercontent.com/57566630/202145400-cc2cd8c4-10b0-472f-ba37-07e6f50acc67.png\" width=\"1200\">\n\n所有 reviewer 同意合入 PR 后，我们会尽快将 PR 合并到主分支。\n\n#### 7. 
解决冲突\n\n随着时间的推移，我们的代码库会不断更新，这时候，如果你的 PR 与主分支存在冲突，你需要解决冲突，解决冲突的方式有两种：\n\n```shell\ngit fetch --all --prune\ngit rebase upstream/main\n```\n\n或者\n\n```shell\ngit fetch --all --prune\ngit merge upstream/main\n```\n\n如果你非常善于处理冲突，那么可以使用 rebase 的方式来解决冲突，因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用，那么可以使用 `merge` 的方式来解决冲突。\n\n### 指引\n\n#### 单元测试\n\n如果你无法正常执行部分模块的单元测试，例如 [video](https://github.com/open-mmlab/mmcv/tree/main/mmcv/video) 模块，可能是你的当前环境没有安装以下依赖\n\n```shell\n# Linux\nsudo apt-get update -y\nsudo apt-get install -y libturbojpeg\nsudo apt-get install -y ffmpeg\n\n# Windows\nconda install ffmpeg\n```\n\n在提交修复代码错误或新增特性的拉取请求时，我们应该尽可能的让单元测试覆盖所有提交的代码，计算单元测试覆盖率的方法如下\n\n```shell\npython -m coverage run -m pytest /path/to/test_file\npython -m coverage html\n# check file in htmlcov/index.html\n```\n\n#### 文档渲染\n\n在提交修复代码错误或新增特性的拉取请求时，可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。\n本地生成渲染后的文档的方法如下\n\n```shell\npip install -r requirements/docs.txt\ncd docs/zh_cn/\n# or docs/en\nmake html\n# check file in ./docs/zh_cn/_build/html/index.html\n```\n\n### 代码风格\n\n#### Python\n\n[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范，我们使用以下工具检查和格式化代码\n\n- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具，是多个检查工具的封装\n- [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具\n- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具\n- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误\n- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具\n- [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具\n\nyapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到\n\n通过配置 [pre-commit hook](https://pre-commit.com/) ，我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`，\n修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`，调整 `requirments.txt` 的包顺序。\npre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 找到。\n\npre-commit 
具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。\n\n更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。\n\n#### C++ and CUDA\n\nC++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)\n\n### 拉取请求规范\n\n1. 使用 [pre-commit hook](https://pre-commit.com)，尽量减少代码风格相关问题\n\n2. 一个`拉取请求`对应一个短期分支\n\n3. 粒度要细，一个`拉取请求`只做一件事情，避免超大的`拉取请求`\n\n   - Bad：实现 Faster R-CNN\n   - Acceptable：给 Faster R-CNN 添加一个 box head\n   - Good：给 box head 增加一个参数来支持自定义的 conv 层数\n\n4. 每次 Commit 时需要提供清晰且有意义的 commit 信息\n\n5. 提供清晰且有意义的`拉取请求`描述\n\n   - 标题写明白任务名称，一般格式:\\[Prefix\\] Short description of the pull request (Suffix)\n   - prefix: 新增功能 \\[Feature\\], 修 bug \\[Fix\\], 文档相关 \\[Docs\\], 开发中 \\[WIP\\] (暂时不会被 review)\n   - 描述里介绍`拉取请求`的主要修改内容，结果，以及对其他部分的影响, 参考`拉取请求`模板\n   - 关联相关的`议题` (issue) 和其他`拉取请求`\n\n6. 如果引入了其他三方库，或借鉴了三方库的代码，请确认他们的许可证和 mmcv 兼容，并在借鉴的代码上补充 `This code is inspired from http://`\n"
  },
  {
    "path": "docs/zh_cn/community/pr.md",
    "content": "## 拉取请求\n\n本文档的内容已迁移到[贡献指南](contributing.md)。\n"
  },
  {
    "path": "docs/zh_cn/compatibility.md",
    "content": "### v2.0.0\n\nOpenMMLab 团队于 2022 年 9 月 1 日在世界人工智能大会发布了新一代训练引擎 [MMEngine](https://github.com/open-mmlab/mmengine)，它是一个用于训练深度学习模型的基础库。相比于 MMCV，它提供了更高级且通用的训练器、接口更加统一的开放架构以及可定制化程度更高的训练流程。\n\nOpenMMLab 团队于 2023 年 4 月 6 日发布 MMCV [v2.0.0](https://github.com/open-mmlab/mmcv/releases/tag/v2.0.0)。在 2.x 版本中，它有以下重大变化：\n\n（1）删除了以下组件：\n\n- `mmcv.fileio` 模块，删除于 PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179)。在需要使用 FileIO 的地方使用 mmengine 中的 FileIO 模块\n- `mmcv.runner`、`mmcv.parallel`、`mmcv.engine` 和 `mmcv.device`，删除于 PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216)\n- `mmcv.utils` 的所有类（例如 `Config` 和 `Registry`）和大部分函数，删除于 PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217)，只保留少数和 mmcv 相关的函数\n- `mmcv.onnx`、`mmcv.tensorrt` 模块以及相关的函数，删除于 PR [#2225](https://github.com/open-mmlab/mmcv/pull/2225)\n- 删除 MMCV 所有的根注册器并将类或者函数注册到 MMEngine 的[根注册器](https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py)\n\n（2）新增了 [`mmcv.transforms`](https://github.com/open-mmlab/mmcv/tree/main/mmcv/transforms) 数据变换模块\n\n（3）在 PR [#2235](https://github.com/open-mmlab/mmcv/pull/2235) 中将包名 **mmcv** 重命名为 **mmcv-lite**、 **mmcv-full** 重命名为 **mmcv**。此外，将环境变量 `MMCV_WITH_OPS` 的默认值从 0 改为 1\n\n<table class=\"docutils\">\n<thead>\n  <tr>\n    <th align=\"center\">MMCV < 2.0</th>\n    <th align=\"center\">MMCV >= 2.0 </th>\n<tbody>\n  <tr>\n  <td valign=\"top\">\n\n```bash\n# 包含算子，因为 mmcv-full 的最高版本小于 2.0.0，所以无需加版本限制\npip install openmim\nmim install mmcv-full\n\n# 不包含算子\npip install openmim\nmim install \"mmcv < 2.0.0\"\n```\n\n</td>\n  <td valign=\"top\">\n\n```bash\n# 包含算子\npip install openmim\nmim install mmcv\n\n# 不包含算子，因为 mmcv-lite 的起始版本为 2.0.0，所以无需加版本限制\npip install openmim\nmim install mmcv-lite\n```\n\n</td>\n</tr>\n</thead>\n</table>\n\n### v1.3.18\n\n部分自定义算子对于不同的设备有不同实现，为此添加的大量宏命令与类型检查使得代码变得难以维护。例如：\n\n```c++\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(rois);\n    CHECK_CUDA_INPUT(output);\n    
CHECK_CUDA_INPUT(argmax_y);\n    CHECK_CUDA_INPUT(argmax_x);\n\n    roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,\n                           aligned_height, aligned_width, spatial_scale,\n                           sampling_ratio, pool_mode, aligned);\n#else\n    AT_ERROR(\"RoIAlign is not compiled with GPU support\");\n#endif\n  } else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(rois);\n    CHECK_CPU_INPUT(output);\n    CHECK_CPU_INPUT(argmax_y);\n    CHECK_CPU_INPUT(argmax_x);\n    roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,\n                          aligned_height, aligned_width, spatial_scale,\n                          sampling_ratio, pool_mode, aligned);\n  }\n```\n\n为此我们设计了注册与分发的机制以更好的管理这些算子实现。\n\n```c++\n\nvoid ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned);\n\nvoid roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignForwardCUDAKernelLauncher(\n      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,\n      spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\n// 注册算子的cuda实现\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int 
pool_mode, bool aligned);\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);\n\n// roi_align.cpp\n// 使用dispatcher根据参数中的Tensor device类型对实现进行分发\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,\n                       argmax_x, aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, pool_mode, aligned);\n}\n\n```\n\n### v1.3.11\n\n为了灵活地支持更多的后端和硬件，例如 `NVIDIA GPUs` 、`AMD GPUs`，我们重构了 `mmcv/ops/csrc` 目录。注意，这次重构不会影响 API 的使用。更多相关信息，请参考 [PR1206](https://github.com/open-mmlab/mmcv/pull/1206)。\n\n原始的目录结构如下所示\n\n```\n.\n├── common_cuda_helper.hpp\n├── ops_cuda_kernel.cuh\n├── pytorch_cpp_helper.hpp\n├── pytorch_cuda_helper.hpp\n├── parrots_cpp_helper.hpp\n├── parrots_cuda_helper.hpp\n├── parrots_cudawarpfunction.cuh\n├── onnxruntime\n│   ├── onnxruntime_register.h\n│   ├── onnxruntime_session_options_config_keys.h\n│   ├── ort_mmcv_utils.h\n│   ├── ...\n│   ├── onnx_ops.h\n│   └── cpu\n│       ├── onnxruntime_register.cpp\n│       ├── ...\n│       └── onnx_ops_impl.cpp\n├── parrots\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_cuda.cu\n│   ├── ops_parrots.cpp\n│   └── ops_pytorch.h\n├── pytorch\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_cuda.cu\n│   ├── pybind.cpp\n└── tensorrt\n    ├── trt_cuda_helper.cuh\n    ├── trt_plugin_helper.hpp\n    ├── trt_plugin.hpp\n    ├── trt_serialize.hpp\n    ├── ...\n    ├── trt_ops.hpp\n    └── plugins\n        ├── trt_cuda_helper.cu\n        ├── trt_plugin.cpp\n        ├── ...\n        ├── trt_ops.cpp\n        └── trt_ops_kernel.cu\n```\n\n重构之后，它的结构如下所示\n\n```\n.\n├── common\n│   ├── box_iou_rotated_utils.hpp\n│   ├── 
parrots_cpp_helper.hpp\n│   ├── parrots_cuda_helper.hpp\n│   ├── pytorch_cpp_helper.hpp\n│   ├── pytorch_cuda_helper.hpp\n│   └── cuda\n│       ├── common_cuda_helper.hpp\n│       ├── parrots_cudawarpfunction.cuh\n│       ├── ...\n│       └── ops_cuda_kernel.cuh\n├── onnxruntime\n│   ├── onnxruntime_register.h\n│   ├── onnxruntime_session_options_config_keys.h\n│   ├── ort_mmcv_utils.h\n│   ├── ...\n│   ├── onnx_ops.h\n│   └── cpu\n│       ├── onnxruntime_register.cpp\n│       ├── ...\n│       └── onnx_ops_impl.cpp\n├── parrots\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_parrots.cpp\n│   └── ops_pytorch.h\n├── pytorch\n│   ├── info.cpp\n│   ├── pybind.cpp\n│   ├── ...\n│   ├── ops.cpp\n│   └── cuda\n│       ├── ...\n│       └── ops_cuda.cu\n└── tensorrt\n    ├── trt_cuda_helper.cuh\n    ├── trt_plugin_helper.hpp\n    ├── trt_plugin.hpp\n    ├── trt_serialize.hpp\n    ├── ...\n    ├── trt_ops.hpp\n    └── plugins\n        ├── trt_cuda_helper.cu\n        ├── trt_plugin.cpp\n        ├── ...\n        ├── trt_ops.cpp\n        └── trt_ops_kernel.cu\n```\n"
  },
  {
    "path": "docs/zh_cn/conf.py",
    "content": "#\n# Configuration file for the Sphinx documentation builder.\n#\n# This file does only contain a selection of the most common options. For a\n# full list see the documentation:\n# http://www.sphinx-doc.org/en/master/config\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport sys\n\nimport pytorch_sphinx_theme\nfrom sphinx.builders.html import StandaloneHTMLBuilder\n\nsys.path.insert(0, os.path.abspath('../..'))\n\nversion_file = '../../mmcv/version.py'\nwith open(version_file) as f:\n    exec(compile(f.read(), version_file, 'exec'))\n__version__ = locals()['__version__']\n\n# -- Project information -----------------------------------------------------\n\nproject = 'mmcv'\ncopyright = '2018-2022, OpenMMLab'\nauthor = 'MMCV Authors'\n\n# The short X.Y version\nversion = __version__\n# The full version, including alpha/beta/rc tags\nrelease = __version__\n\n# -- General configuration ---------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\n#\n# needs_sphinx = '1.0'\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\n\nextensions = [\n    'sphinx.ext.autodoc',\n    'sphinx.ext.autosummary',\n    'sphinx.ext.intersphinx',\n    'sphinx.ext.napoleon',\n    'sphinx.ext.viewcode',\n    'sphinx.ext.autosectionlabel',\n    'sphinx_markdown_tables',\n    'myst_parser',\n    'sphinx_copybutton',\n]  # yapf: disable\n\nmyst_heading_anchors = 4\n\nmyst_enable_extensions = ['colon_fence']\n\n# Configuration for intersphinx\nintersphinx_mapping = {\n    'python': ('https://docs.python.org/3', None),\n    'numpy': ('https://numpy.org/doc/stable', None),\n    'torch': ('https://pytorch.org/docs/stable/', None),\n    'mmengine': ('https://mmengine.readthedocs.io/en/latest', None),\n}\n\nautodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision']\nautosectionlabel_prefix_document = True\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix(es) of source filenames.\n# You can specify multiple suffix as a list of string:\n#\nsource_suffix = {\n    '.rst': 'restructuredtext',\n    '.md': 'markdown',\n}\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = 'zh_CN'\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  
See the documentation for\n# a list of builtin themes.\n#\n# html_theme = 'sphinx_rtd_theme'\nhtml_theme = 'pytorch_sphinx_theme'\nhtml_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  For a list of options available for each theme, see the\n# documentation.\n#\nhtml_theme_options = {\n    'menu': [\n        {\n            'name': 'GitHub',\n            'url': 'https://github.com/open-mmlab/mmcv'\n        },\n    ],\n    # Specify the language of shared menu\n    'menu_lang': 'cn',\n}\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = ['_static']\nhtml_css_files = ['css/readthedocs.css']\n\n# Custom sidebar templates, must be a dictionary that maps document names\n# to template names.\n#\n# The default sidebars (for documents that don't match any pattern) are\n# defined by theme itself.  Builtin themes are using these templates by\n# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',\n# 'searchbox.html']``.\n#\n# html_sidebars = {}\n\n# -- Options for HTMLHelp output ---------------------------------------------\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = 'mmcvdoc'\n\n# -- Options for LaTeX output ------------------------------------------------\n\nlatex_elements = {\n    # The paper size ('letterpaper' or 'a4paper').\n    #\n    # 'papersize': 'letterpaper',\n\n    # The font size ('10pt', '11pt' or '12pt').\n    #\n    # 'pointsize': '10pt',\n\n    # Additional stuff for the LaTeX preamble.\n    #\n    # 'preamble': '',\n\n    # Latex figure (float) alignment\n    #\n    # 'figure_align': 'htbp',\n}\n\n# Grouping the document tree into LaTeX files. 
List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n    (master_doc, 'mmcv.tex', 'mmcv Documentation', 'MMCV Contributors',\n     'manual'),\n]\n\n# -- Options for manual page output ------------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [(master_doc, 'mmcv', 'mmcv Documentation', [author], 1)]\n\n# -- Options for Texinfo output ----------------------------------------------\n\n# Grouping the document tree into Texinfo files. List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    (master_doc, 'mmcv', 'mmcv Documentation', author, 'mmcv',\n     'One line description of project.', 'Miscellaneous'),\n]\n\n# -- Options for Epub output -------------------------------------------------\n\n# Bibliographic Dublin Core info.\nepub_title = project\n\n# The unique identifier of the text. This can be a ISBN number\n# or the project homepage.\n#\n# epub_identifier = ''\n\n# A unique identification for the text.\n#\n# epub_uid = ''\n\n# A list of files that should not be packed into the epub file.\nepub_exclude_files = ['search.html']\n\n# set priority when building html\nStandaloneHTMLBuilder.supported_image_types = [\n    'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'\n]\n# -- Extension configuration -------------------------------------------------\n# Ignore >>> when copying code\ncopybutton_prompt_text = r'>>> |\\.\\.\\. '\ncopybutton_prompt_is_regexp = True\n"
  },
  {
    "path": "docs/zh_cn/docutils.conf",
    "content": "[html writers]\ntable_style: colwidths-auto\n"
  },
  {
    "path": "docs/zh_cn/faq.md",
    "content": "## 常见问题\n\n在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题，并且知道可以帮到大家的解决办法，\n欢迎随时丰富这个列表。\n\n### 安装问题\n\n- KeyError: \"xxx: 'yyy is not in the zzz registry'\"\n\n  只有模块所在的文件被导入时，注册机制才会被触发，所以您需要在某处导入该文件，更多详情请查看 [KeyError: \"MaskRCNN: 'RefineRoIHead is not in the models registry'\"](https://github.com/open-mmlab/mmdetection/issues/5974)。\n\n- \"No module named 'mmcv.ops'\"; \"No module named 'mmcv.\\_ext'\"\n\n  1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv\n  2. 参考 [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) 或者 [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) 安装 mmcv-full\n\n- \"invalid device function\" 或者 \"no kernel image is available for execution\"\n\n  1. 检查 GPU 的 CUDA 计算能力\n  2. 运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的，您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV。兼容性问题可能会出现在使用旧版的 GPUs，如：colab 上的 Tesla K80 (3.7)\n  3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如，您可能使用 CUDA 10.0 编译 mmcv，但在 CUDA 9.0 的环境中运行它\n\n- \"undefined symbol\" 或者 \"cannot open xxx.so\"\n\n  1. 如果符号和 CUDA/C++ 相关（例如：libcudart.so 或者 GLIBCXX），请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致\n  2. 如果符号和 PyTorch 相关（例如：符号包含 caffe、aten 和 TH），请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致\n  3. 运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同\n\n- \"RuntimeError: CUDA error: invalid configuration argument\"\n\n  这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低 [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10)\n  的值并重新编译 mmcv。\n\n- \"RuntimeError: nms is not compiled with GPU support\"\n\n  这个错误是由于您的 CUDA 环境没有正确安装。\n  您可以尝试重新安装您的 CUDA 环境，然后删除 mmcv/build 文件夹并重新编译 mmcv。\n\n- \"Segmentation fault\"\n\n  1. 检查 GCC 的版本，通常是因为 PyTorch 版本与 GCC 版本不匹配 （例如 GCC \\< 4.9 )，我们推荐用户使用 GCC 5.4，我们也不推荐使用 GCC 5.5， 因为有反馈 GCC 5.5 会导致 \"segmentation fault\" 并且切换到 GCC 5.4 就可以解决问题\n  2. 
检查是否正确安装 CUDA 版本的 PyTorch。输入以下命令并检查是否返回 True\n     ```shell\n     python -c 'import torch; print(torch.cuda.is_available())'\n     ```\n  3. 如果 `torch` 安装成功，那么检查 MMCV 是否安装成功。输入以下命令，如果没有报错说明 mmcv-full 安装成功。\n     ```shell\n     python -c 'import mmcv; import mmcv.ops'\n     ```\n  4. 如果 MMCV 与 PyTorch 都安装成功了，则可以使用 `ipdb` 设置断点或者使用 `print` 函数，分析是哪一部分的代码导致了 `segmentation fault`\n\n- \"libtorch_cuda_cu.so: cannot open shared object file\"\n\n  `mmcv-full` 依赖 `libtorch_cuda_cu.so` 文件，但程序运行时没能找到该文件。我们可以检查该文件是否存在 `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` 也可以尝试重装 PyTorch。\n\n- \"fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version!\"\n\n  如果您在 Windows 上编译 mmcv-full 并且 CUDA 的版本是 9.2，您很可能会遇到这个问题 `\"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2\\include\\crt/host_config.h(133): fatal error C1189: #error:  -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!\"`，您可以尝试使用低版本的 Microsoft Visual Studio，例如 vs2017。\n\n- \"error: member \"torch::jit::detail::ModulePolicy::all_slots\" may not be initialized\"\n\n  如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.5.0，您很可能会遇到这个问题 `- torch/csrc/jit/api/module.h(474): error: member \"torch::jit::detail::ModulePolicy::all_slots\" may not be initialized`。解决这个问题的方法是将 `torch/csrc/jit/api/module.h` 文件中所有 `static constexpr bool all_slots = false;` 替换为 `static bool all_slots = false;`。更多细节可以查看 [member \"torch::jit::detail::AttributePolicy::all_slots\" may not be initialized](https://github.com/pytorch/pytorch/issues/39394)。\n\n- \"error: a member with an in-class initializer must be const\"\n\n  如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.6.0，您很可能会遇到这个问题 `\"- torch/include\\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const\"`. 
解决这个问题的方法是将 `torch/include\\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA ` 替换为 `const`。更多细节可以查看 [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575)。\n\n- \"error: member \"torch::jit::ProfileOptionalOp::Kind\" may not be initialized\"\n\n  如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.7.0，您很可能会遇到这个问题 `torch/include\\torch/csrc/jit/ir/ir.h(1347): error: member \"torch::jit::ProfileOptionalOp::Kind\" may not be initialized`. 解决这个问题的方法是修改 PyTorch 中的几个文件：\n\n  - 删除 `torch/include\\torch/csrc/jit/ir/ir.h` 文件中的 `static constexpr Symbol Kind = ::c10::prim::profile;` 和 `static constexpr Symbol Kind = ::c10::prim::profile_optional;`\n  - 将 `torch\\include\\pybind11\\cast.h` 文件中的 `explicit operator type&() { return *(this->value); }` 替换为 `explicit operator type&() { return *((type*)this->value); }`\n  - 将 `torch/include\\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA` 替换为 `const`\n\n  更多细节可以查看 [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956)。\n\n- MMCV 和 MMDetection 的兼容性问题；\"ConvWS is already registered in conv layer\"\n\n  请参考 [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 为您的 MMDetection 版本安装正确版本的 MMCV。\n\n### 使用问题\n\n- \"RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one\"\n\n  1. 这个错误是因为有些参数没有参与 loss 的计算，可能是代码中存在多个分支，导致有些分支没有参与 loss 的计算。更多细节见 [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582)。\n  2. 你可以设置 DDP 中的 `find_unused_parameters` 为 `True`，或者手动查找哪些参数没有用到。\n\n- \"RuntimeError: Trying to backward through the graph a second time\"\n\n  不能同时设置 `GradientCumulativeOptimizerHook` 和 `OptimizerHook`，这会导致 `loss.backward()` 被调用两次，于是程序抛出 `RuntimeError`。我们只需设置其中的一个。更多细节见 [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379)。\n"
  },
  {
    "path": "docs/zh_cn/get_started/api_reference.md",
    "content": "# 接口对照表\n\n由于 MMCV v1.x 升级到 MMCV v2.x 时移除了 `mmcv.fileio`，`mmcv.runner`，`mmcv.parallel`，`mmcv.engine`，`mmcv.device` 模块，以及 `mmcv.utils` 中的所有类和大部分函数，分别删除于 PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179)，PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216)，PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217)。因此我们提供了如下的接口对照表，以便于大家快速查找迁移后的接口。\n\n## 相关讨论\n\n- [Remove runner, parallel, engine and device](https://github.com/open-mmlab/mmcv/pull/2216)\n- [ImportError: cannot import name 'is_list_of' from 'mmcv.utils'](https://github.com/open-mmlab/mmcv/issues/2282)\n- [Could not find the files in MMengine which are removed in MMCV_v2x parallel. example, for DataContainer](https://github.com/open-mmlab/mmcv/issues/2934)\n- [mmcv.cnn.bricks.registry](https://github.com/open-mmlab/mmengine/discussions/1356)\n- [Replace mmcv's function and modules imported with mmengine's](https://github.com/open-mmlab/mmdetection/pull/8594)\n\n## `mmcv.fileio`\n\n| MMCV                                              | MMCV URL                                                                              | MMEngine                                                    | MMEngine URL                                                                                   |\n| ------------------------------------------------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |\n| mmcv.fileio.file_client.BaseStorageBackend        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.base.BaseStorageBackend            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/base.py              |\n| mmcv.fileio.file_client.CephBackend               | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             |                                                             |                                                                                                |\n| mmcv.fileio.file_client.PetrelBackend             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.petrel_backend.PetrelBackend       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/petrel_backend.py    |\n| mmcv.fileio.file_client.MemcachedBackend          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.memcached_backend.MemcachedBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/memcached_backend.py |\n| mmcv.fileio.file_client.LmdbBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.lmdb_backend.LmdbBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/lmdb_backend.py      |\n| mmcv.fileio.file_client.HardDiskBackend           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.HardDiskBackend                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |\n| mmcv.fileio.file_client.HTTPBackend               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.backends.http_backend.HTTPBackend           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/http_backend.py      |\n| mmcv.fileio.file_client.FileClient                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py             | mmengine.fileio.file_client.FileClient                      | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py                |\n| mmcv.fileio.io.load                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.load                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |\n| mmcv.fileio.io.dump                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.io.dump                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py                         |\n| mmcv.fileio.io.\\_register_handler                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.\\_register_handler                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |\n| mmcv.fileio.io.register_handler                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py                      | mmengine.fileio.handlers.register_handler                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py    |\n| mmcv.fileio.parse.list_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.list_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |\n| mmcv.fileio.parse.dict_from_file                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py                   | mmengine.fileio.parse.dict_from_file                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py                      |\n| mmcv.fileio.handlers.base.BaseFileHandler         | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/base.py           | mmengine.fileio.handlers.base.BaseFileHandler               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/base.py              |\n| mmcv.fileio.handlers.json_handler.set_default     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.set_default           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |\n| mmcv.fileio.handlers.json_handler.JsonHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py   | mmengine.fileio.handlers.json_handler.JsonHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py      |\n| mmcv.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/pickle_handler.py | mmengine.fileio.handlers.pickle_handler.PickleHandler       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/pickle_handler.py    |\n| mmcv.fileio.handlers.yaml_handler.YamlHandler     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/yaml_handler.py   | mmengine.fileio.handlers.yaml_handler.YamlHandler           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/yaml_handler.py      |\n\n## `mmcv.runner`\n\n| MMCV                                                                  | MMCV URL                                                                                    | MMEngine                                                                                                                                | MMEngine URL                                                                                                                                                                                           |\n| 
--------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |\n| mmcv.runner.hooks.logger.base.LoggerHook                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/base.py             | mmengine.hooks.logger_hook.LoggerHook                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py                                                                                                                         |\n| mmcv.runner.hooks.logger.clearml.ClearMLLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/clearml.py          | 相似功能：mmengine.visualization.vis_backend.ClearMLVisBackend                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.dvclive.DvcliveLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/dvclive.py          | 相似功能：mmengine.visualization.vis_backend.DVCLiveVisBackend                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                
                 |\n| mmcv.runner.hooks.logger.mlflow.MlflowLoggerHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/mlflow.py           | 相似功能：mmengine.visualization.vis_backend.MLflowVisBackend                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.neptune.NeptuneLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/neptune.py          | 相似功能：mmengine.visualization.vis_backend.NeptuneVisBackend                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.pavi.PaviLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/pavi.py             |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.logger.segmind.SegmindLoggerHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/segmind.py          |                                                                                                                                         |                                                                                                                                                                 
                                       |\n| mmcv.runner.hooks.logger.tensorboard.TensorboardLoggerHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/tensorboard.py      | 相似功能：mmengine.visualization.vis_backend.TensorboardVisBackend                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.logger.text.TextLoggerHook                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/text.py             |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.logger.wandb.WandbLoggerHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/wandb.py            | 相似功能：mmengine.visualization.vis_backend.WandbVisBackend                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py                                                                                                                 |\n| mmcv.runner.hooks.checkpoint.CheckpointHook                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/checkpoint.py              | mmengine.hooks.checkpoint_hook.CheckpointHook                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                        
                                                             |\n| mmcv.runner.hooks.closure.ClosureHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/closure.py                 |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.ema.EMAHook                                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/ema.py                     | mmengine.hooks.ema_hook.EMAHook                                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py                                                                                                                            |\n| mmcv.runner.hooks.evaluation.EvalHook                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | 部分功能被移至 mmengine.hooks.checkpoint_hook.CheckpointHook                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                                                                                                     |\n| mmcv.runner.hooks.evaluation.DistEvalHook                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py              | 部分功能被移至 mmengine.hooks.checkpoint_hook.CheckpointHook                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py                                      
                                                                               |\n| mmcv.runner.hooks.hook.HOOKS                                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.registry.root.HOOKS                                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.hooks.hook.Hook                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py                    | mmengine.hooks.hook.Hook                                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py                                                                                                                                |\n| mmcv.runner.hooks.iter_timer.IterTimerHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/iter_timer.py              | mmengine.hooks.iter_timer_hook.IterTimerHook                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py                                                                                                                     |\n| mmcv.runner.hooks.lr_updater.LrUpdaterHook                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.LRSchedulerMixin                                                                                  | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.FixedLrUpdaterHook                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ConstantLR                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.StepLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.StepLR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.ExpLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.ExponentialLR                                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.PolyLrUpdaterHook                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.PolyLR                                                              
                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.InvLrUpdaterHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.CosineAnnealingUpdaterHook               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineAnnealingLR                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.FlatCosineAnnealingUpdaterHook           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.CosineRestartLrUpdaterHook               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR                     
                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.get_position_from_periods                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR.get_position_from_periods                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.CyclicLrUpdaterHook                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.OneCycleLrUpdaterHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.LinearAnnealingLrUpdaterHook             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              |                                           
                                                                                              |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.lr_updater.annealing_cos                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\\_annealing_cos                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.annealing_linear                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\\_annealing_linear                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.lr_updater.format_param                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py              | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\\_format_param                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py                                                                                                              |\n| mmcv.runner.hooks.memory.EmptyCacheHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/memory.py                  | 
mmengine.hooks.empty_cache_hook.EmptyCacheHook                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py                                                                                                                    |\n| mmcv.runner.hooks.momentum_updater.MomentumUpdaterHook                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.MomentumSchedulerMixin                                                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.StepMomentumUpdaterHook            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.StepMomentum                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.CosineAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        | mmengine.optim.scheduler.momentum_scheduler.CosineAnnealingMomentum                                                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.LinearAnnealingMomentumUpdaterHook | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.CyclicMomentumUpdaterHook          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.momentum_updater.OneCycleMomentumUpdaterHook        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py        |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.optimizer.OptimizerHook                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | mmengine.optim.optimizer.optimizer_wrapper.OptimWrapper                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py                                                                                                         |\n| 
mmcv.runner.hooks.optimizer.GradientCumulativeOptimizerHook           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | 相关功能被移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.optimizer.Fp16OptimizerHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               | 相关功能被移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py               |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.hooks.profiler.ProfilerHook                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/profiler.py                | mmengine.hooks.profiler_hook.ProfilerHook                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/profiler_hook.py                                                                                                                       |\n| mmcv.runner.hooks.sampler_seed.DistSamplerSeedHook                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sampler_seed.py            | mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook                                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py                                                                                                                   |\n| mmcv.runner.hooks.sync_buffer.SyncBuffersHook                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sync_buffer.py             | mmengine.hooks.sync_buffer_hook.SyncBuffersHook                                                                                         | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py                                                                                                                    |\n| mmcv.runner.optimizer.builder.OPTIMIZERS                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.registry.root.OPTIMIZERS                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.optimizer.builder.OPTIMIZER_BUILDERS                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.builder.register_torch_optimizers               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             | mmengine.optim.optimizer.builder.register_torch_optimizers                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/builder.py                                                                                                                   |\n| mmcv.runner.optimizer.builder.TORCH_OPTIMIZERS                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                           
                              |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.builder.build_optimizer_constructor             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.builder.build_optimizer                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py             |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.optimizer.default_constructor.DefaultOptimizerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/default_constructor.py |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.base_module.BaseModule                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.BaseModule                                     
                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_module.Sequential                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.Sequential                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_module.ModuleList                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleList                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_module.ModuleDict                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py                   | mmengine.model.base_module.ModuleDict                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py                                                                                                                         |\n| mmcv.runner.base_runner.BaseRunner                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_runner.py                   | mmengine.runner.runner.Runner             
                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py                                                                                                                             |\n| mmcv.runner.builder.RUNNERS                                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNERS                                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.builder.RUNNER_BUILDERS                                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       | mmengine.registry.root.RUNNER_CONSTRUCTORS                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                                                                                                                             |\n| mmcv.runner.builder.build_runner_constructor                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.builder.build_runner                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py                       |           
                                                                                                                              |                                                                                                                                                                                                        |\n| mmcv.runner.checkpoint.ENV_MMCV_HOME                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_MMENGINE_HOME                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.ENV_XDG_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.ENV_XDG_CACHE_HOME                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.DEFAULT_CACHE_HOME                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.DEFAULT_CACHE_DIR                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_get_mmcv_home                                | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_get_mmengine_home                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_state_dict                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_state_dict                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_torchvision_models                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_torchvision_models                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_external_models                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_external_models                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| 
mmcv.runner.checkpoint.get_mmcls_models                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_mmcls_models                                                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_deprecated_model_names                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_deprecated_model_names                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_process_mmcls_checkpoint                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_process_mmcls_checkpoint                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.CheckpointLoader                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.CheckpointLoader                                                                                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                            
                             |\n| mmcv.runner.checkpoint.load_from_local                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_local                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_http                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_http                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_pavi                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_pavi                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_ceph                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_ceph                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                            
                                                             |\n| mmcv.runner.checkpoint.load_from_torchvision                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_torchvision                                                                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_openmmlab                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_openmmlab                                                                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_from_mmcls                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_from_mmcls                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_load_checkpoint                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_load_checkpoint                                                                                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                          
                                                                                               |\n| mmcv.runner.checkpoint.\\_load_checkpoint_with_prefix                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_load_checkpoint_with_prefix                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.load_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.load_checkpoint                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.weights_to_cpu                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.weights_to_cpu                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.\\_save_to_state_dict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.\\_save_to_state_dict                                                                                         | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.get_state_dict                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.get_state_dict                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.checkpoint.save_checkpoint                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py                    | mmengine.runner.checkpoint.save_checkpoint                                                                                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py                                                                                                                         |\n| mmcv.runner.default_constructor.DefaultRunnerConstructor              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/default_constructor.py           |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.\\_find_free_port                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                          
                               |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.\\_is_free_port                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.init_dist                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.init_dist                                                                                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.\\_init_dist_pytorch                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\\_init_dist_pytorch                                                                                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.\\_init_dist_mpi                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\\_init_dist_mpi                                 
                                                                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.\\_init_dist_slurm                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.\\_init_dist_slurm                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.get_dist_info                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.get_dist_info                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.master_only                                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    | mmengine.dist.utils.master_only                                                                                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py                                                                                                                                |\n| mmcv.runner.dist_utils.allreduce_params                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                   
                                                                                                      |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.dist_utils.\\_allreduce_coalesced                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py                    |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.epoch_based_runner.EpochBasedRunner                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            | mmengine.runner.loops.EpochBasedTrainLoop                                                                                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |\n| mmcv.runner.epoch_based_runner.Runner                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py            |  
                                                                                                                                       |                                                                                                                                                                                                        |\n| mmcv.runner.fp16_utils.cast_tensor_type                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.auto_fp16                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.force_fp32                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.allreduce_grads                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py         
           | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.wrap_fp16_model                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.patch_norm_fp32                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.patch_forward_method                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py                    | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.fp16_utils.LossScaler                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py 
                   | fp16 相关移至 mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py |\n| mmcv.runner.iter_based_runner.IterLoader                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.iter_based_runner.IterBasedRunner                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py             | mmengine.runner.loops.IterBasedTrainLoop                                                                                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py                                                                                                                              |\n| mmcv.runner.log_buffer.LogBuffer                                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/log_buffer.py                    |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.priority.Priority                                         | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.Priority                                                                                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |\n| mmcv.runner.priority.get_priority                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py                      | mmengine.runner.priority.get_priority                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py                                                                                                                           |\n| mmcv.runner.utils.get_host_info                                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.utils.get_time_str                                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.utils.obj_from_dict         
                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         |                                                                                                                                         |                                                                                                                                                                                                        |\n| mmcv.runner.utils.set_random_seed                                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py                         | mmengine.runner.utils.set_random_seed                                                                                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/utils.py                                                                                                                              |\n\n## `mmcv.parallel`\n\n| MMCV                                                           | MMCV URL                                                                       | MMEngine                                                        | MMEngine URL                                                                              |\n| -------------------------------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |\n| mmcv.parallel.\\_functions.scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |\n| mmcv.parallel.\\_functions.synchronize_stream                   | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |\n| mmcv.parallel.\\_functions.get_input_device                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |\n| mmcv.parallel.\\_functions.Scatter                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py     |                                                                 |                                                                                           |\n| mmcv.parallel.collate.collate                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/collate.py        |                                                                 |                                                                                           |\n| mmcv.parallel.data_container.assert_tensor_type                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py |                                                                 |                                                                                           |\n| mmcv.parallel.data_container.DataContainer                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | 相似功能：mmengine/structures/base_data_element.BaseDataElement | https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/base_data_element.py |\n| mmcv.parallel.data_parallel.MMDataParallel                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_parallel.py  |                                                                 |               
                                                                            |\n| mmcv.parallel.distributed.MMDistributedDataParallel            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py    | mmengine.model.wrappers.distributed.MMDistributedDataParallel   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |\n| mmcv.parallel.distributed_deprecated.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py    | mmengine.model.wrappers.distributed.MMDistributedDataParallel   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py   |\n| mmcv.parallel.registry.MODULE_WRAPPERS                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/registry.py       | mmengine.registry.root.MODEL_WRAPPERS                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py                |\n| mmcv.parallel.scatter_gather.scatter                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                 |                                                                                           |\n| mmcv.parallel.scatter_gather.scatter_kwargs                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py |                                                                 |                                                                                           |\n| mmcv.parallel.utils.is_module_wrapper                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/utils.py          | mmengine.model.wrappers.utils.is_model_wrapper                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/utils.py         |\n\n## `mmcv.engine`\n\n| MMCV                                 | MMCV URL        
                                                   | MMEngine | MMEngine URL |\n| ------------------------------------ | ------------------------------------------------------------------ | -------- | ------------ |\n| mmcv.engine.test.single_gpu_test     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n| mmcv.engine.test.multi_gpu_test      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n| mmcv.engine.test.collect_results_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n| mmcv.engine.test.collect_results_gpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py |          |              |\n\n## `mmcv.device`\n\n| MMCV                                      | MMCV URL                                                                     | MMEngine                         | MMEngine URL                                                              |\n| ----------------------------------------- | ---------------------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------- |\n| mmcv.device.ipu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/ipu               |                                  |                                                                           |\n| mmcv.device.mlu                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mlu               |                                  |                                                                           |\n| mmcv.device.mps                           | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mps               |                                  |                                                                           |\n| mmcv.device.npu           
                | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/npu               |                                  |                                                                           |\n| mmcv.device.\\_functions.scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |\n| mmcv.device.\\_functions.Scatter           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py     |                                  |                                                                           |\n| mmcv.device.scatter_gather.scatter        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |\n| mmcv.device.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py |                                  |                                                                           |\n| mmcv.device.utils.get_device              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/utils.py          | mmengine.device.utils.get_device | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py |\n\n## `mmcv.utils`\n\n| MMCV                                                   | MMCV URL                                                                     | MMEngine                                                            | MMEngine URL                                                                                |\n| ------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------- | 
------------------------------------------------------------------------------------------- |\n| mmcv.utils.config.BASE_KEY                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.BASE_KEY                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.DELETE_KEY                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DELETE_KEY                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.DEPRECATION_KEY                      | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DEPRECATION_KEY                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.ConfigDict                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.ConfigDict                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.add_args                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.add_args                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.Config                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.Config                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.config.DictAction                           | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py          | mmengine.config.config.DictAction                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py                  |\n| mmcv.utils.device_type.is_ipu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |\n| mmcv.utils.device_type.IS_IPU_AVAILABLE                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     |                                                                     |                                                                                             |\n| mmcv.utils.device_type.is_mlu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mlu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.device_type.is_mps_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_mps_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.device_type.is_npu_available                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py     | mmengine.device.utils.is_npu_available                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.hub.\\_is_legacy_zip_format                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\\_is_legacy_zip_format                  | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |\n| mmcv.utils.hub.\\_legacy_zip_load                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.\\_legacy_zip_load                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |\n| mmcv.utils.hub.load_url                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py             | mmengine.utils.dl_utils.hub.load_url                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py             |\n| mmcv.utils.logging.logger_initialized                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |\n| mmcv.utils.logging.get_logger                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |\n| mmcv.utils.logging.print_log                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py         |                                                                     |                                                                                             |\n| mmcv.utils.misc.\\_ntuple                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\\_ntuple                                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_1tuple                              | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_1tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_2tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_2tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_3tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_3tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_4tuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_4tuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.to_ntuple                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.to_ntuple                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_str                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_str                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.import_modules_from_strings            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.import_modules_from_strings                     | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.iter_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.iter_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.list_cast                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.list_cast                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.tuple_cast                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.tuple_cast                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_seq_of                              | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_seq_of                                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_list_of                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_list_of                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_tuple_of                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_tuple_of                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.slice_list                             | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.slice_list                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.concat_list                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.concat_list                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.check_prerequisites                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.check_prerequisites                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.\\_check_py_package                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\\_check_py_package                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.\\_check_executable                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.\\_check_executable                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.requires_package                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_package                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.requires_executable                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.requires_executable                             | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.deprecated_api_warning                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.deprecated_api_warning                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.is_method_overridden                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.is_method_overridden                            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.misc.has_method                             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py            | mmengine.utils.misc.has_method                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py                     |\n| mmcv.utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.TORCH_VERSION               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.is_cuda_available           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.device.utils.is_cuda_available                             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py                   |\n| mmcv.utils.parrots_wrapper.IS_CUDA_AVAILABLE           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py |                                                                     |                                                                                             |\n| mmcv.utils.parrots_wrapper.is_rocm_pytorch             | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.is_rocm_pytorch             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_cuda_home             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_cuda_home             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.get_build_config            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_conv                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_conv                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_dataloader            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_dataloader            | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_extension             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_extension             | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_pool                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_pool                  | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.\\_get_norm                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\\_get_norm                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.SyncBatchNorm               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py |\n| mmcv.utils.path.is_filepath                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.is_filepath                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.fopen                                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.fopen                                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.check_file_exist                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.check_file_exist                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.mkdir_or_exist                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.mkdir_or_exist                                  | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.symlink                                | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.symlink                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.scandir                                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.scandir                                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.path.find_vcs_root                          | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py            | mmengine.utils.path.find_vcs_root                                   | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py                     |\n| mmcv.utils.progressbar.ProgressBar                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.ProgressBar                              | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.track_progress                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_progress                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.init_pool                       | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.init_pool                                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.track_parallel_progress         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_parallel_progress                  | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.progressbar.track_iter_progress             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py     | mmengine.utils.progressbar.track_iter_progress                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py              |\n| mmcv.utils.registry.build_from_cfg                     | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.build_functions.build_from_cfg                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/build_functions.py       |\n| mmcv.utils.registry.Registry                           | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py        | mmengine.registry.registry.Registry                                 | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/registry.py              |\n| mmcv.utils.seed.worker_init_fn                         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/seed.py            | mmengine.dataset.utils.worker_init_fn                               | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/utils.py                  |\n| mmcv.utils.testing.check_python_script                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.check_python_script                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.\\_any                               | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.\\_any                                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_dict_contains_subset         | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_contains_subset                | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_attrs_equal                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_attrs_equal                         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_dict_has_keys                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_dict_has_keys                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_keys_equal                   | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_keys_equal                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_is_norm_layer                | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_is_norm_layer                       | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.testing.assert_params_all_zeros             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py         | mmengine.testing.compare.assert_params_all_zeros                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py                |\n| mmcv.utils.timer.TimerError                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.TimerError                                     | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.timer.Timer                                 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.Timer                                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.timer.\\_g_timers                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.\\_g_timers                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.timer.check_time                            | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py           | mmengine.utils.timer.check_time                                     | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py                    |\n| mmcv.utils.torch_ops.\\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.\\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |\n| mmcv.utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py       | mmengine.utils.dl_utils.torch_ops.torch_meshgrid                    | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py       |\n| mmcv.utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/trace.py           | mmengine.utils.dl_utils.trace.is_jit_tracing                        | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/trace.py           |\n| mmcv.utils.version_utils.digit_version                 | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.digit_version                          | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |\n| mmcv.utils.version_utils.\\_minimal_ext_cmd             | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.\\_minimal_ext_cmd                      | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |\n| mmcv.utils.version_utils.get_git_hash                  | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py   | mmengine.utils.version_utils.get_git_hash                           | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py            |\n\n## `mmcv.cnn`\n\n| MMCV                                         | MMCV URL                                                                 | MMEngine                                   | MMEngine URL                                                             |\n| -------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------------------ |\n| mmcv.cnn.utils.sync_bn.\\_BatchNormXd         | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.\\_BatchNormXd         | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |\n| mmcv.cnn.utils.sync_bn.revert_sync_batchnorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.revert_sync_batchnorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py |\n\n## `mmcv.model_zoo`\n\n| MMCV                                 | MMCV URL                                                                         
   | MMEngine                           | MMEngine URL                                                                        |\n| ------------------------------------ | ----------------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------- |\n| mmcv.model_zoo.deprecated.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/deprecated.json       | mmengine.hub.deprecated.json       | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/deprecated.json       |\n| mmcv.model_zoo.mmcls.json            | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/mmcls.json            | mmengine.hub.mmcls.json            | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/mmcls.json            |\n| mmcv.model_zoo.open_mmlab.json       | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/open_mmlab.json       | mmengine.hub.openmmlab.json        | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/openmmlab.json        |\n| mmcv.model_zoo.torchvision_0.12.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/torchvision_0.12.json | mmengine.hub.torchvision_0.12.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/torchvision_0.12.json |\n"
  },
  {
    "path": "docs/zh_cn/get_started/article.md",
    "content": "## 解读文章汇总\n\n这篇文章汇总了 [OpenMMLab](https://www.zhihu.com/people/openmmlab) 解读的部分文章（更多文章和视频见 [OpenMMLabCourse](https://github.com/open-mmlab/OpenMMLabCourse)），如果您有推荐的文章（不一定是 OpenMMLab 发布的文章，可以是自己写的文章），非常欢迎提 [Pull Request](../community/pr.md) 添加到这里。\n\n### MMCV 解读文章\n\n#### 框架解读\n\n- [MMCV 核心组件分析(一)：整体概述](https://zhuanlan.zhihu.com/p/336081587)\n- [MMCV 核心组件分析(二)：FileHandler](https://zhuanlan.zhihu.com/p/336097883)\n- [MMCV 核心组件分析(三): FileClient](https://zhuanlan.zhihu.com/p/339190576)\n- [MMCV 核心组件分析(四): Config](https://zhuanlan.zhihu.com/p/346203167)\n- [MMCV 核心组件分析(五): Registry](https://zhuanlan.zhihu.com/p/355271993)\n- [MMCV 核心组件分析(六): Hook](https://zhuanlan.zhihu.com/p/355272220)\n- [MMCV 核心组件分析(七): Runner](https://zhuanlan.zhihu.com/p/355272459)\n- [MMCV Hook 食用指南](https://zhuanlan.zhihu.com/p/448600739)\n- [PyTorch & MMCV Dispatcher 机制解析](https://zhuanlan.zhihu.com/p/451671838)\n\n#### 工具解读\n\n- [训练可视化工具哪款是你的菜？MMCV一行代码随你挑](https://zhuanlan.zhihu.com/p/387078211)\n\n#### 安装指南\n\n- [久等了！Windows 平台 MMCV 的预编译包终于来了！](https://zhuanlan.zhihu.com/p/441653536)\n- [Windows 环境从零安装 mmcv-full](https://zhuanlan.zhihu.com/p/434491590)\n\n#### 知乎问答\n\n- [深度学习科研，如何高效进行代码和实验管理？](https://www.zhihu.com/question/269707221/answer/2480772257)\n- [深度学习方面的科研工作中的实验代码有什么规范和写作技巧？如何妥善管理实验数据？](https://www.zhihu.com/question/268193800/answer/2586000037)\n\n### 下游算法库解读文章\n\n- [MMDetection](https://mmdetection.readthedocs.io/zh_CN/latest/article.html)\n\n### PyTorch 解读文章\n\n- [PyTorch1.11 亮点一览：TorchData、functorch、DDP 静态图](https://zhuanlan.zhihu.com/p/486222256)\n- [PyTorch1.12 亮点一览：DataPipe + TorchArrow 新的数据加载与处理范式](https://zhuanlan.zhihu.com/p/537868554)\n- [PyTorch 源码解读之 nn.Module：核心网络模块接口详解](https://zhuanlan.zhihu.com/p/340453841)\n- [PyTorch 源码解读之 torch.autograd：梯度计算详解](https://zhuanlan.zhihu.com/p/321449610)\n- [PyTorch 源码解读之 torch.utils.data：解析数据处理全流程](https://zhuanlan.zhihu.com/p/337850513)\n- [PyTorch 源码解读之 
torch.optim：优化算法接口详解](https://zhuanlan.zhihu.com/p/346205754)\n- [PyTorch 源码解读之 DP & DDP：模型并行和分布式训练解析](https://zhuanlan.zhihu.com/p/343951042)\n- [PyTorch 源码解读之 BN & SyncBN：BN 与 多卡同步 BN 详解](https://zhuanlan.zhihu.com/p/337732517)\n- [PyTorch 源码解读之 torch.cuda.amp: 自动混合精度详解](https://zhuanlan.zhihu.com/p/348554267)\n- [PyTorch 源码解读之 cpp_extension：揭秘 C++/CUDA 算子实现和调用全流程](https://zhuanlan.zhihu.com/p/348555597)\n- [PyTorch 源码解读之即时编译篇](https://zhuanlan.zhihu.com/p/361101354)\n- [PyTorch 源码解读之分布式训练了解一下？](https://zhuanlan.zhihu.com/p/361314953)\n- [PyTorch 源码解读之 torch.serialization & torch.hub](https://zhuanlan.zhihu.com/p/364239544)\n\n### 其他\n\n- [困扰我 48 小时的深拷贝，今天终于...](https://zhuanlan.zhihu.com/p/470892209)\n- [拿什么拯救我的 4G 显卡](https://zhuanlan.zhihu.com/p/430123077)\n- [是谁偷偷动了我的 logger](https://zhuanlan.zhihu.com/p/481383590)\n- [三句话，让 logger 言听计从](https://zhuanlan.zhihu.com/p/487524917)\n- [Logging 不为人知的二三事](https://zhuanlan.zhihu.com/p/502610682)\n- [Type Hints 入门教程，让代码更加规范整洁](https://zhuanlan.zhihu.com/p/519335398)\n- [手把手教你如何高效地在 MMCV 中贡献算子](https://zhuanlan.zhihu.com/p/464492627)\n- [OpenMMLab 支持 IPU 训练芯片](https://zhuanlan.zhihu.com/p/517527926)\n- [基于 MMCV 走上开源大佬之路？](https://zhuanlan.zhihu.com/p/391144979)\n"
  },
  {
    "path": "docs/zh_cn/get_started/build.md",
    "content": "## 从源码编译 MMCV\n\n### 编译 mmcv\n\n在编译 mmcv 之前，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。可使用以下命令验证\n\n```bash\npython -c 'import torch;print(torch.__version__)'\n```\n\n:::{note}\n\n- 如果克隆代码仓库的速度过慢，可以使用以下命令克隆（注意：gitee 的 mmcv 不一定和 github 的保持一致，因为每天只同步一次）\n\n```bash\ngit clone https://gitee.com/open-mmlab/mmcv.git\n```\n\n- 如果打算使用 `opencv-python-headless` 而不是 `opencv-python`，例如在一个很小的容器环境或者没有图形用户界面的服务器中，你可以先安装 `opencv-python-headless`，这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。\n\n- 如果编译过程安装依赖库的时间过长，可以[设置 pypi 源](https://mirrors.tuna.tsinghua.edu.cn/help/pypi/)\n\n```bash\npip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n```\n\n:::\n\n#### 在 Linux 上编译 mmcv\n\n| TODO: 视频教程\n\n1. 克隆代码仓库\n\n   ```bash\n   git clone https://github.com/open-mmlab/mmcv.git\n   cd mmcv\n   ```\n\n2. 安装 `ninja` 和 `psutil` 以加快编译速度\n\n   ```bash\n   pip install -r requirements/optional.txt\n   ```\n\n3. 检查 nvcc 的版本（要求大于等于 9.2，如果没有 GPU，可以跳过）\n\n   ```bash\n   nvcc --version\n   ```\n\n   上述命令如果输出以下信息，表示 nvcc 的设置没有问题，否则需要设置 CUDA_HOME\n\n   ```\n   nvcc: NVIDIA (R) Cuda compiler driver\n   Copyright (c) 2005-2020 NVIDIA Corporation\n   Built on Mon_Nov_30_19:08:53_PST_2020\n   Cuda compilation tools, release 11.2, V11.2.67\n   Build cuda_11.2.r11.2/compiler.29373293_0\n   ```\n\n   :::{note}\n   如果想要支持 ROCm，可以参考 [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) 安装 ROCm。\n   :::\n\n4. 检查 gcc 的版本（要求大于等于**5.4**）\n\n   ```bash\n   gcc --version\n   ```\n\n5. 开始编译（预估耗时 10 分钟）\n\n   ```bash\n   pip install -e . -v\n   ```\n\n6. 
验证安装\n\n   ```bash\n   python .dev_scripts/check_installation.py\n   ```\n\n   如果上述命令没有报错，说明安装成功。如有报错，请查看[问题解决页面](../faq.md)是否已经有解决方案。\n\n   如果没有找到解决方案，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。\n\n#### 在 macOS 上编译 mmcv\n\n| TODO: 视频教程\n\n```{note}\n如果你使用的是搭载 apple silicon 的 mac 设备，请安装 PyTorch 1.13+ 的版本，否则会遇到 [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218) 中的问题。\n```\n\n1. 克隆代码仓库\n\n   ```bash\n   git clone https://github.com/open-mmlab/mmcv.git\n   cd mmcv\n   ```\n\n2. 安装 `ninja` 和 `psutil` 以加快编译速度\n\n   ```bash\n   pip install -r requirements/optional.txt\n   ```\n\n3. 开始编译\n\n   ```bash\n   pip install -e .\n   ```\n\n4. 验证安装\n\n   ```bash\n   python .dev_scripts/check_installation.py\n   ```\n\n   如果上述命令没有报错，说明安装成功。如有报错，请查看[问题解决页面](../faq.md)是否已经有解决方案。\n\n   如果没有找到解决方案，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。\n\n#### 在 Windows 上编译 mmcv\n\n| TODO: 视频教程\n\n在 Windows 上编译 mmcv 比 Linux 复杂，本节将一步步介绍如何在 Windows 上编译 mmcv。\n\n##### 依赖项\n\n请先安装以下的依赖项：\n\n- [Git](https://git-scm.com/download/win)：安装期间，请选择 **add git to Path**\n- [Visual Studio Community 2019](https://visualstudio.microsoft.com)：用于编译 C++ 和 CUDA 代码\n- [Miniconda](https://docs.conda.io/en/latest/miniconda.html)：包管理工具\n- [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive)：如果只需要 CPU 版本可以不安装 CUDA，安装 CUDA 时，可根据需要进行自定义安装。如果已经安装新版本的显卡驱动，建议取消驱动程序的安装\n\n```{note}\n如果不清楚如何安装以上依赖，请参考[Windows 环境从零安装 mmcv](https://zhuanlan.zhihu.com/p/434491590)。\n另外，你需要知道如何在 Windows 上设置环境变量，尤其是 \"PATH\" 的设置，以下安装过程都会用到。\n```\n\n##### 通用步骤\n\n1. 从 Windows 菜单启动 Anaconda 命令行\n\n   如 Miniconda 安装程序建议，不要使用原始的 `cmd.exe` 或是 `powershell.exe`。命令行有两个版本，一个基于 PowerShell，一个基于传统的 `cmd.exe`。请注意以下说明都是使用的基于 PowerShell\n\n2. 创建一个新的 Conda 环境\n\n   ```powershell\n   (base) PS C:\\Users\\xxx> conda create --name mmcv python=3.7\n   (base) PS C:\\Users\\xxx> conda activate mmcv  # 确保做任何操作前先激活环境\n   ```\n\n3. 
安装 PyTorch 时，可以根据需要安装支持 CUDA 或不支持 CUDA 的版本\n\n   ```powershell\n   # CUDA version\n   (mmcv) PS C:\\Users\\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch\n   # CPU version\n   (mmcv) PS C:\\Users\\xxx> conda install pytorch torchvision cpuonly -c pytorch\n   ```\n\n4. 克隆代码仓库\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx> git clone https://github.com/open-mmlab/mmcv.git\n   (mmcv) PS C:\\Users\\xxx> cd mmcv\n   ```\n\n5. 安装 `ninja` 和 `psutil` 以加快编译速度\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> pip install -r requirements/optional.txt\n   ```\n\n6. 设置 MSVC 编译器\n\n   设置环境变量。添加 `C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\MSVC\\14.27.29110\\bin\\Hostx86\\x64` 到 `PATH`，则 `cl.exe` 可以在命令行中运行，如下所示。\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> cl\n   Microsoft (R) C/C++ Optimizing  Compiler Version 19.27.29111 for x64\n   Copyright (C) Microsoft Corporation.   All rights reserved.\n\n   usage: cl [ option... ] filename... [ / link linkoption... ]\n   ```\n\n   为了兼容性，我们使用 x86-hosted 以及 x64-targeted 版本，即路径中的 `Hostx86\\x64` 。\n\n   因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本，只有 utf-8 将会被识别，你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。\n\n##### 编译与安装 mmcv\n\nmmcv 有两个版本：\n\n- 只包含 CPU 算子的版本\n\n  编译 CPU 算子，但只有 x86 将会被编译，并且编译版本只能在 CPU only 情况下运行\n\n- 既包含 CPU 算子，又包含 CUDA 算子的版本\n\n  同时编译 CPU 和 CUDA 算子，`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU\n\n###### CPU 版本\n\n编译安装\n\n```powershell\n(mmcv) PS C:\\Users\\xxx\\mmcv> python setup.py build_ext  # 如果成功, cl 将被启动用于编译算子\n(mmcv) PS C:\\Users\\xxx\\mmcv> python setup.py develop  # 安装\n```\n\n###### GPU 版本\n\n1. 
检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量已经存在在 `envs` 之中\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> ls env:\n\n   Name                           Value\n   ----                           -----\n   CUDA_PATH                      C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\n   CUDA_PATH_V10_1                C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.1\n   CUDA_PATH_V10_2                C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\n   ```\n\n   如果没有，你可以按照下面的步骤设置\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> $env:CUDA_HOME = \"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\"\n   # 或者\n   (mmcv) PS C:\\Users\\xxx\\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2  # CUDA_PATH_V10_2 已经在环境变量中\n   ```\n\n2. 设置 CUDA 的目标架构\n\n   ```powershell\n   # 这里需要改成你的显卡对应的目标架构\n   (mmcv) PS C:\\Users\\xxx\\mmcv> $env:TORCH_CUDA_ARCH_LIST=\"7.5\"\n   ```\n\n   :::{note}\n   可以点击 [cuda-gpus](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力，也可以通过 CUDA 目录下的 deviceQuery.exe 工具查看\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> &\"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\extras\\demo_suite\\deviceQuery.exe\"\n   Device 0: \"NVIDIA GeForce GTX 1660 SUPER\"\n   CUDA Driver Version / Runtime Version          11.7 / 11.1\n   CUDA Capability Major/Minor version number:    7.5\n   ```\n\n   上面的 7.5 表示目标架构。注意：需把上面命令的 v10.2 换成你的 CUDA 版本。\n   :::\n\n3. 
编译安装\n\n   ```powershell\n   (mmcv) PS C:\\Users\\xxx\\mmcv> python setup.py build_ext  # 如果成功, cl 将被启动用于编译算子\n   (mmcv) PS C:\\Users\\xxx\\mmcv> python setup.py develop # 安装\n   ```\n\n   ```{note}\n   如果你的 PyTorch 版本是 1.6.0，你可能会遇到一些 [issue](https://github.com/pytorch/pytorch/issues/42467) 提到的错误，你可以参考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改本地环境的 PyTorch 源代码\n   ```\n\n##### 验证安装\n\n```powershell\n(mmcv) PS C:\\Users\\xxx\\mmcv> python .dev_scripts/check_installation.py\n```\n\n如果上述命令没有报错，说明安装成功。如有报错，请查看[问题解决页面](../faq.md)是否已经有解决方案。\n如果没有找到解决方案，欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。\n\n### 编译 mmcv-lite\n\n如果你需要使用和 PyTorch 相关的模块，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。\n\n1. 克隆代码仓库\n\n   ```bash\n   git clone https://github.com/open-mmlab/mmcv.git\n   cd mmcv\n   ```\n\n2. 开始编译\n\n   ```bash\n   MMCV_WITH_OPS=0 pip install -e . -v\n   ```\n\n3. 验证安装\n\n   ```bash\n   python -c 'import mmcv;print(mmcv.__version__)'\n   ```\n\n### 在寒武纪 MLU 机器编译 mmcv-full\n\n#### 安装 torch_mlu\n\n##### 选项1: 基于寒武纪 docker image 安装\n\n首先请下载并且拉取寒武纪 docker (请向 service@cambricon.com 发邮件以获得最新的寒武纪 pytorch 发布 docker)。\n\n```\ndocker pull ${docker image}\n```\n\n进入 docker, [编译 MMCV MLU](#编译mmcv-mlu) 并[进行验证](#验证是否成功安装)。\n\n##### 选项2：基于 cambricon pytorch 源码编译安装\n\n请向 service@cambricon.com 发送邮件或联系 Cambricon 工程师以获取合适版本的 CATCH 软件包，在您获得合适版本的 CATCH 软件包后，请参照 ${CATCH-path}/CONTRIBUTING.md 中的步骤安装 CATCH。\n\n#### 编译 MMCV\n\n克隆代码仓库\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git\n```\n\n算子库 mlu-ops 在编译 MMCV 时自动下载到默认路径(mmcv/mlu-ops)，你也可以在编译前设置环境变量 MMCV_MLU_OPS_PATH 指向已经存在的 mlu-ops 算子库路径。\n\n```bash\nexport MMCV_MLU_OPS_PATH=/xxx/xxx/mlu-ops\n```\n\n开始编译\n\n```bash\ncd mmcv\nexport MMCV_WITH_OPS=1\nexport FORCE_MLU=1\npython setup.py install\n```\n\n#### 验证是否成功安装\n\n完成上述安装步骤之后，您可以尝试运行下面的 Python 代码以测试您是否成功在 MLU 设备上安装了 mmcv-full\n\n```python\nimport torch\nimport torch_mlu\nfrom mmcv.ops 
import sigmoid_focal_loss\nx = torch.randn(3, 10).mlu()\nx.requires_grad = True\ny = torch.tensor([1, 5, 3]).mlu()\nw = torch.ones(10).float().mlu()\noutput = sigmoid_focal_loss(x, y, 2.0, 0.25, w, 'none')\n```\n\n### 在昇腾 NPU 机器编译 mmcv\n\n在编译 mmcv 前，需要安装 torch_npu，完整安装教程详见 [PyTorch 安装指南](https://gitee.com/ascend/pytorch/blob/master/docs/zh/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97.md#pytorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97)\n\n#### 选项 1: 使用 NPU 设备源码编译安装 mmcv (推荐方式)\n\n- 拉取 [MMCV 源码](https://github.com/open-mmlab/mmcv.git)\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git\ncd mmcv\n```\n\n- 编译\n\n```bash\nMMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_NPU=1 python setup.py build_ext\n```\n\n- 安装\n\n```bash\nMMCV_WITH_OPS=1 FORCE_NPU=1 python setup.py develop\n```\n\n#### 选项 2: 使用 pip 安装 Ascend 编译版本的 mmcv\n\nAscend 编译版本的 mmcv 在 mmcv >= 1.7.0 时已经支持直接 pip 安装\n\n```bash\npip install mmcv -f https://download.openmmlab.com/mmcv/dist/ascend/torch1.8.0/index.html\n```\n\n#### 验证\n\n```python\nimport torch\nimport torch_npu\nfrom mmcv.ops import softmax_focal_loss\n\n# Init tensor to the NPU\nx = torch.randn(3, 10).npu()\ny = torch.tensor([1, 5, 3]).npu()\nw = torch.ones(10).float().npu()\n\noutput = softmax_focal_loss(x, y, 2.0, 0.25, w, 'none')\nprint(output)\n```\n"
  },
  {
    "path": "docs/zh_cn/get_started/installation.md",
    "content": "## 安装 MMCV\n\nMMCV 有两个版本：\n\n- **mmcv**: 完整版，包含所有的特性以及丰富的开箱即用的 CPU 和 CUDA 算子。注意，完整版本可能需要更长时间来编译。\n- **mmcv-lite**: 精简版，不包含 CPU 和 CUDA 算子但包含其余所有特性和功能，类似 MMCV 1.0 之前的版本。如果你不需要使用算子的话，精简版可以作为一个考虑选项。\n\n```{warning}\n请不要在同一个环境中安装两个版本，否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前，需要先卸载另一个。`如果 CUDA 可用，强烈推荐安装 mmcv`。\n```\n\n### 安装 mmcv\n\n在安装 mmcv 之前，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。可使用以下命令验证\n\n```bash\npython -c 'import torch;print(torch.__version__)'\n```\n\n如果输出版本信息，则表示 PyTorch 已安装。\n\n#### 使用 mim 安装（推荐）\n\n[mim](https://github.com/open-mmlab/mim) 是 OpenMMLab 项目的包管理工具，使用它可以很方便地安装 mmcv。\n\n```bash\npip install -U openmim\nmim install mmcv\n```\n\n如果发现上述的安装命令没有使用预编译包（以 `.whl` 结尾）而是使用源码包（以 `.tar.gz` 结尾）安装，则有可能是我们没有提供和当前环境的 PyTorch 版本、CUDA 版本相匹配的 mmcv 预编译包，此时，你可以[源码安装 mmcv](build.md)。\n\n<details>\n<summary>使用预编译包的安装日志</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv<br />\n<b>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</b>\n\n</details>\n\n<details>\n<summary>使用源码包的安装日志</summary>\n\nLooking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />\nCollecting mmcv==2.0.0<br />\n<b>Downloading mmcv-2.0.0.tar.gz</b>\n\n</details>\n\n如需安装指定版本的 mmcv，例如安装 2.0.0 版本的 mmcv，可使用以下命令\n\n```bash\nmim install mmcv==2.0.0\n```\n\n:::{note}\n如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`，例如在一个很小的容器环境或者没有图形用户界面的服务器中，你可以先安装 `opencv-python-headless`，这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。\n\n另外，如果安装依赖库的时间过长，可以指定 pypi 源\n\n```bash\nmim install \"mmcv>=2.0.0rc1\" -i https://pypi.tuna.tsinghua.edu.cn/simple\n```\n\n:::\n\n安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) 脚本检查 mmcv 是否安装成功。\n\n#### 使用 pip 安装\n\n使用以下命令查看 CUDA 和 PyTorch 的版本\n\n```bash\npython -c 'import 
torch;print(torch.__version__);print(torch.version.cuda)'\n```\n\n根据系统的类型、CUDA 版本、PyTorch 版本以及 MMCV 版本选择相应的安装命令\n\n<html>\n<body>\n<style>\n    select {\n        /*z-index: 1000;*/\n        position: absolute;\n        top: 10px;\n        width: 6.7rem;\n    }\n    #select-container {\n        position: relative;\n        height: 30px;\n    }\n    #select-cmd {\n        background-color: #f5f6f7;\n        font-size: 14px;\n        margin-top: 20px;\n    }\n    /* 让每一个都间隔1.3rem */\n    #select-os {\n        /* left: 1.375rem; */\n        left: 0;\n    }\n    #select-cuda {\n        /* left: 9.375rem;    9.375 = 1.375 + 6.7 + 1.3 */\n        left: 8rem;\n    }\n    #select-torch {\n        /* left: 17.375rem;    17.375 = 9.375 + 6.7 + 1.3 */\n        left: 16rem;\n    }\n    #select-mmcv {\n        /* left: 25.375rem;    25.375 = 17.375 + 6.7 + 1.3 */\n        left: 24rem;\n    }\n</style>\n<div id=\"select-container\">\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeOS(this.value)\"\n            id=\"select-os\">\n    </select>\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeCUDA(this.value)\"\n            id=\"select-cuda\">\n    </select>\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeTorch(this.value)\"\n            id=\"select-torch\">\n    </select>\n    <select\n            size=\"1\"\n            onmousedown=\"handleSelectMouseDown(this.id)\"\n            onclick=\"clickOutside(this, () => handleSelectBlur(this.id))\"\n            onchange=\"changeMMCV(this.value)\"\n            
id=\"select-mmcv\">\n    </select>\n</div>\n<pre id=\"select-cmd\"></pre>\n</body>\n<script>\n    // 各个select当前的值\n    let osVal, cudaVal, torchVal, mmcvVal;\n    function clickOutside(targetDom, handler) {\n        const clickHandler = (e) => {\n            if (!targetDom || targetDom.contains(e.target)) return;\n            handler?.();\n            document.removeEventListener('click', clickHandler, false);\n        };\n        document.addEventListener('click', clickHandler, false);\n    }\n    function changeMMCV(val) {\n        mmcvVal = val;\n        change(\"select-mmcv\");\n    }\n    function changeTorch(val) {\n        torchVal = val;\n        change(\"select-torch\");\n    }\n    function changeCUDA(val) {\n        cudaVal = val;\n        change(\"select-cuda\");\n    }\n    function changeOS(val) {\n        osVal = val;\n        change(\"select-os\");\n    }\n    // 控制size大小相关的几个方法\n    function handleSelectMouseDown(id) {\n        const dom = document.getElementById(id);\n        if (!dom) return;\n        const len = dom?.options?.length;\n        if (len >= 10) {\n            dom.size = 10;\n            dom.style.zIndex = 100;\n        }\n    }\n    function handleSelectClick() {\n        const selects = Array.from(document.getElementsByTagName(\"select\"));\n        selects.forEach(select => {\n            select.size = 1;\n        });\n    }\n    function handleSelectBlur(id) {\n        const dom = document.getElementById(id);\n        if (!dom) {\n            // 如果没有指定特定的id，那就直接把所有的select都设置成size = 1\n            handleSelectClick();\n            return;\n        }\n        dom.size = 1;\n        dom.style.zIndex = 1;\n    }\n    function changeCmd() {\n        const cmd = document.getElementById(\"select-cmd\");\n        let cmdString = \"pip install mmcv=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html\";\n        // e.g: pip install mmcv==2.0.0rc1 -f 
https://download.openmmlab.com/mmcv/dist/cu111/torch1.9/index.html\n        let cudaVersion;\n        if (cudaVal === \"cpu\" || cudaVal === \"mps\") {\n            cudaVersion = \"cpu\";\n        } else {\n            cudaVersion = `cu${cudaVal.split(\".\").join(\"\")}`;\n        }\n        const torchVersion = `torch${torchVal.substring(0, torchVal.length - 2)}`;\n        cmdString = cmdString.replace(\"{cu_version}\", cudaVersion).replace(\"{mmcv_version}\", mmcvVal).replace(\"{torch_version}\", torchVersion);\n        cmd.textContent = cmdString;\n    }\n    // string数组去重\n    function unique(arr) {\n        if (!arr || !Array.isArray(arr)) return [];\n        return [...new Set(arr)];\n    }\n    // 根据string数组生成option的DocumentFragment\n    function genOptionFragment(data, id) {\n        const name = id.includes(\"-\")? id.split(\"-\")[1] : id;\n        const fragment = new DocumentFragment();\n        data.forEach(option => {\n            const ele = document.createElement(\"option\");\n            let text = `${name} ${option}`;\n            if (name === \"os\" || option.toUpperCase() === \"CPU\" || option.toUpperCase() === \"MPS\") {\n                text = `${option}`;\n            }\n            ele.textContent = text;\n            // 添加value属性，方便下拉框选择时直接读到数据\n            ele.value = option;\n            // 添加点击事件监听\n            ele.addEventListener('click', handleSelectClick);\n            fragment.appendChild(ele);\n        });\n        return fragment;\n    }\n    // 在dom树中找到id对应的dom（select元素），并将生成的options添加到元素内\n    function findAndAppend(data, id) {\n        const fragment = genOptionFragment(data, id);\n        const dom = document.getElementById(id);\n        if (dom) dom.replaceChildren(fragment);\n    }\n    /**\n     * change方法的重点在于\n     * 1. 各个下拉框数据的联动\n     *      OS ==> cuda ==> torch ==> mmcv\n     * 2. 
命令行的修改\n     */\n    function change(id) {\n        const order = [\"select-mmcv\", \"select-torch\", \"select-cuda\", \"select-os\"];\n        const idx = order.indexOf(id);\n        if (idx === -1) return;\n        const versionDetail = version[osVal];\n        if (idx >= 3) {\n            // 根据os修改cuda\n            let cuda = [];\n            versionDetail.forEach(v => {\n                cuda.push(v.cuda);\n            });\n            cuda = unique(cuda);\n            cudaVal = cuda[0];\n            findAndAppend(cuda, \"select-cuda\");\n        }\n        if (idx >= 2) {\n            // 根据cuda修改torch\n            const torch = [];\n            versionDetail.forEach(v => {\n                if (v.cuda === cudaVal) torch.push(v.torch);\n            });\n            torchVal = torch[0];\n            findAndAppend(torch, \"select-torch\");\n        }\n        if (idx >= 1) {\n            // 根据torch修改mmcv\n            let mmcv = [];\n            versionDetail.forEach(v => {\n                if (v.cuda === cudaVal && v.torch === torchVal) mmcv = v.mmcv;\n            });\n            mmcvVal = mmcv[0];\n            findAndAppend(mmcv, \"select-mmcv\");\n        }\n        changeCmd();\n    }\n    // 初始化，处理version数据，并调用findAndAppend\n    function init() {\n        // 增加一个全局的click事件监听，作为select onBlur事件失效的兜底\n        // document.addEventListener(\"click\", handleSelectBlur);\n        const version = window.version;\n        // OS\n        const os = Object.keys(version);\n        osVal = os[0];\n        findAndAppend(os, \"select-os\");\n        change(\"select-os\");\n        changeCmd();\n    }\n    // 利用xhr获取本地version数据，如果作为html直接浏览的话需要使用本地服务器打开，否则会有跨域问题\n    window.onload = function () {\n        const url = \"../_static/version.json\"\n        // 申明一个XMLHttpRequest\n        const request = new XMLHttpRequest();\n        // 设置请求方法与路径\n        request.open(\"get\", url);\n        // 不发送数据到服务器\n        request.send(null);\n        //XHR对象获取到返回信息后执行\n        
request.onload = function () {\n            // 返回状态为200，即为数据获取成功\n            if (request.status !== 200) return;\n            const data = JSON.parse(request.responseText);\n            window.version = data;\n            init();\n        }\n    }\n</script>\n</html>\n\n如果在上面的下拉框中没有找到对应的版本，则可能是没有对应 PyTorch 或者 CUDA 或者 mmcv 版本的预编译包，此时，你可以[源码安装 mmcv](build.md)。\n\n:::{note}\nPyTorch 在 1.x.0 和 1.x.1 之间通常是兼容的，故 mmcv 只提供 1.x.0 的编译包。如果你\n的 PyTorch 版本是 1.x.1，你可以放心地安装在 1.x.0 版本编译的 mmcv。例如，如果你的\nPyTorch 版本是 1.8.1，你可以放心选择 1.8.x。\n:::\n\n:::{note}\n如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`，例如在一个很小的容器环境或者没有图形用户界面的服务器中，你可以先安装 `opencv-python-headless`，这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。\n\n另外，如果安装依赖库的时间过长，可以指定 pypi 源\n\n```bash\npip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -i https://pypi.tuna.tsinghua.edu.cn/simple\n```\n\n:::\n\n安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) 脚本检查 mmcv 是否安装成功。\n\n#### 使用 docker 镜像\n\n先将算法库克隆到本地再构建镜像\n\n```bash\ngit clone https://github.com/open-mmlab/mmcv.git && cd mmcv\ndocker build -t mmcv -f docker/release/Dockerfile .\n```\n\n也可以直接使用下面的命令构建镜像\n\n```bash\ndocker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release\n```\n\n[Dockerfile](release/Dockerfile) 默认安装最新的 mmcv，如果你想要指定版本，可以使用下面的命令\n\n```bash\ndocker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0 .\n```\n\n如果你想要使用其他版本的 PyTorch 和 CUDA，你可以在构建镜像时指定它们的版本。\n\n例如指定 PyTorch 的版本是 1.11，CUDA 的版本是 11.3\n\n```bash\ndocker build -t mmcv -f docker/release/Dockerfile \\\n    --build-arg PYTORCH=1.11.0 \\\n    --build-arg CUDA=11.3 \\\n    --build-arg CUDNN=8 \\\n    --build-arg MMCV=2.0.0 .\n```\n\n更多 PyTorch 和 CUDA 镜像可以点击 [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags) 查看。\n\n### 安装 mmcv-lite\n\n如果你需要使用和 PyTorch 相关的模块，请确保 PyTorch 已经成功安装在环境中，可以参考 [PyTorch 
官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。\n\n```bash\npip install mmcv-lite\n```\n"
  },
  {
    "path": "docs/zh_cn/get_started/introduction.md",
    "content": "## 介绍 MMCV\n\nMMCV 是一个面向计算机视觉的基础库，它提供了以下功能：\n\n- [图像和视频处理](../understand_mmcv/data_process.md)\n- [图像和标注结果可视化](../understand_mmcv/visualization.md)\n- [图像变换](../understand_mmcv/data_transform.md)\n- [多种 CNN 网络结构](../understand_mmcv/cnn.md)\n- [高质量实现的常见 CUDA 算子](../understand_mmcv/ops.md)\n\nMMCV 支持多种平台，包括：\n\n- Linux\n- Windows\n- macOS\n\n它支持的 OpenMMLab 项目：\n\n- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱\n- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱\n- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台\n- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准\n- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准\n- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱\n- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱\n- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱\n- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准\n- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准\n- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准\n- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准\n- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱\n- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台\n- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准\n- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱\n- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱\n- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架\n"
  },
  {
    "path": "docs/zh_cn/get_started/previous_versions.md",
    "content": "## 其他版本的 PyTorch\n\n我们不再提供在较低的 `PyTorch` 版本下编译的 `mmcv-full` 包，但为了您的方便，您可以在下面找到它们。\n\n### PyTorch 1.4\n\n| 1.0.0 \\<= mmcv_version \\<= 1.2.1\n\n#### CUDA 10.1\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.4.0/index.html\n```\n\n#### CUDA 9.2\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.4.0/index.html\n```\n\n#### CPU\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.4.0/index.html\n```\n\n### PyTorch v1.3\n\n| 1.0.0 \\<= mmcv_version \\<= 1.3.16\n\n#### CUDA 10.1\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.3.0/index.html\n```\n\n#### CUDA 9.2\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.3.0/index.html\n```\n\n#### CPU\n\n```bash\npip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.3.0/index.html\n```\n"
  },
  {
    "path": "docs/zh_cn/index.rst",
    "content": "欢迎来到 MMCV 的中文文档！\n=============================\n\n您可以在页面左下角切换中英文文档。\n\n.. toctree::\n   :maxdepth: 2\n   :caption: 介绍与安装\n\n   get_started/introduction.md\n   get_started/installation.md\n   get_started/build.md\n   get_started/article.md\n   get_started/api_reference.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: 深入理解 MMCV\n\n   understand_mmcv/data_process.md\n   understand_mmcv/data_transform.md\n   understand_mmcv/visualization.md\n   understand_mmcv/cnn.md\n   understand_mmcv/ops.md\n\n.. toctree::\n   :caption: 语言切换\n\n   switch_language.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: 兼容性\n\n   compatibility.md\n\n.. toctree::\n\n   faq.md\n\n.. toctree::\n   :maxdepth: 2\n   :caption: 社区\n\n   community/contributing.md\n   community/pr.md\n   community/code_style.md\n\n.. toctree::\n   :maxdepth: 1\n   :caption: API 文档\n\n   mmcv.image <api/image>\n   mmcv.video <api/video>\n   mmcv.visualization <api/visualization>\n   mmcv.cnn <api/cnn>\n   mmcv.ops <api/ops>\n   mmcv.transforms <api/transforms>\n   mmcv.arraymisc <api/arraymisc>\n   mmcv.utils <api/utils>\n\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`search`\n"
  },
  {
    "path": "docs/zh_cn/make.bat",
    "content": "@ECHO OFF\n\npushd %~dp0\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-build\n)\nset SOURCEDIR=.\nset BUILDDIR=_build\n\nif \"%1\" == \"\" goto help\n\n%SPHINXBUILD% >NUL 2>NUL\nif errorlevel 9009 (\n\techo.\n\techo.The 'sphinx-build' command was not found. Make sure you have Sphinx\n\techo.installed, then set the SPHINXBUILD environment variable to point\n\techo.to the full path of the 'sphinx-build' executable. Alternatively you\n\techo.may add the Sphinx directory to PATH.\n\techo.\n\techo.If you don't have Sphinx installed, grab it from\n\techo.http://sphinx-doc.org/\n\texit /b 1\n)\n\n%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%\ngoto end\n\n:help\n%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%\n\n:end\npopd\n"
  },
  {
    "path": "docs/zh_cn/switch_language.md",
    "content": "## <a href='https://mmcv.readthedocs.io/en/2.x/'>English</a>\n\n## <a href='https://mmcv.readthedocs.io/zh_CN/2.x/'>简体中文</a>\n"
  },
  {
    "path": "docs/zh_cn/understand_mmcv/cnn.md",
    "content": "## 卷积神经网络\n\n我们为卷积神经网络提供了一些构建模块，包括层构建、模块组件和权重初始化。\n\n### 网络层的构建\n\n在运行实验时，我们可能需要尝试同属一种类型但不同配置的层，但又不希望每次都修改代码。于是我们提供一些层构建方法，可以从字典构建层，字典可以在配置文件中配置，也可以通过命令行参数指定。\n\n#### 用法\n\n一个简单的例子：\n\n```python\nfrom mmcv.cnn import build_conv_layer\n\ncfg = dict(type='Conv3d')\nlayer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)\n```\n\n- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv（Conv是Conv2d的别名）\n- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN（BN是BN2d的别名）、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN（IN是IN2d的别名）\n- `build_activation_layer`：支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU\n- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle\n- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate\n\n#### 拓展\n\n我们还允许自定义层和算子来扩展构建方法。\n\n1. 编写和注册自己的模块：\n\n   ```python\n   from mmengine.registry import MODELS\n\n   @MODELS.register_module()\n   class MyUpsample:\n\n       def __init__(self, scale_factor):\n           pass\n\n       def forward(self, x):\n           pass\n   ```\n\n2. 在某处导入 `MyUpsample` （例如 `__init__.py` ）然后使用它：\n\n   ```python\n   from mmcv.cnn import build_upsample_layer\n\n   cfg = dict(type='MyUpsample', scale_factor=2)\n   layer = build_upsample_layer(cfg)\n   ```\n\n### 模块组件\n\n我们还提供了常用的模块组件，以方便网络构建。\n卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成，更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。\n\n```python\nfrom mmcv.cnn import ConvModule\n\n# conv + bn + relu\nconv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n# conv + gn + relu\nconv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))\n# conv + relu\nconv = ConvModule(3, 8, 2)\n# conv\nconv = ConvModule(3, 8, 2, act_cfg=None)\n# conv + leaky relu\nconv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))\n# bn + conv + relu\nconv = ConvModule(\n    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))\n```\n"
  },
  {
    "path": "docs/zh_cn/understand_mmcv/data_process.md",
    "content": "## 数据处理\n\n### 图像\n\n图像模块提供了一些图像预处理的函数，该模块依赖 `opencv` 。\n\n#### 读取/保存/显示\n\n使用 `imread` 和 `imwrite` 函数可以读取和保存图像。\n\n```python\nimport mmcv\n\nimg = mmcv.imread('test.jpg')\nimg = mmcv.imread('test.jpg', flag='grayscale')\nimg_ = mmcv.imread(img)  # 相当于什么也没做\nmmcv.imwrite(img, 'out.jpg')\n```\n\n从二进制中读取图像\n\n```python\nwith open('test.jpg', 'rb') as f:\n    data = f.read()\nimg = mmcv.imfrombytes(data)\n```\n\n显示图像文件或已读取的图像\n\n```python\nmmcv.imshow('tests/data/color.jpg')\n\nfor i in range(10):\n    img = np.random.randint(256, size=(100, 100, 3), dtype=np.uint8)\n    mmcv.imshow(img, win_name='test image', wait_time=200)\n```\n\n#### 色彩空间转换\n\n支持的转换函数：\n\n- bgr2gray\n- gray2bgr\n- bgr2rgb\n- rgb2bgr\n- bgr2hsv\n- hsv2bgr\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\nimg1 = mmcv.bgr2rgb(img)\nimg2 = mmcv.rgb2gray(img1)\nimg3 = mmcv.bgr2hsv(img)\n```\n\n#### 缩放\n\n有三种缩放图像的方法。所有以 `imresize_*` 开头的函数都有一个 `return_scale` 参数，如果\n该参数为 `False` ，函数的返回值只有调整之后的图像，否则是一个元组 `(resized_img, scale)` 。\n\n```python\n# 缩放图像至给定的尺寸\nmmcv.imresize(img, (1000, 600), return_scale=True)\n\n# 缩放图像至与给定的图像同样的尺寸\nmmcv.imresize_like(img, dst_img, return_scale=False)\n\n# 以一定的比例缩放图像\nmmcv.imrescale(img, 0.5)\n\n# 缩放图像至最长的边不大于1000、最短的边不大于800并且没有改变图像的长宽比\nmmcv.imrescale(img, (1000, 800))\n```\n\n#### 旋转\n\n我们可以使用 `imrotate` 旋转图像一定的角度。旋转的中心需要指定，默认值是原始图像的中心。有\n两种旋转的模式，一种保持图像的尺寸不变，因此旋转后原始图像中的某些部分会被裁剪，另一种是扩大\n图像的尺寸进而保留完整的原始图像。\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\n\n# 顺时针旋转图像30度\nimg_ = mmcv.imrotate(img, 30)\n\n# 逆时针旋转图像90度\nimg_ = mmcv.imrotate(img, -90)\n\n# 顺时针旋转图像30度并且缩放图像为原始图像的1.5倍\nimg_ = mmcv.imrotate(img, 30, scale=1.5)\n\n# 以坐标(100, 100)为中心顺时针旋转图像30度\nimg_ = mmcv.imrotate(img, 30, center=(100, 100))\n\n# 顺时针旋转图像30度并扩大图像的尺寸\nimg_ = mmcv.imrotate(img, 30, auto_bound=True)\n```\n\n#### 翻转\n\n我们可以使用 `imflip` 翻转图像。\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\n\n# 水平翻转图像\nmmcv.imflip(img)\n\n# 垂直翻转图像\nmmcv.imflip(img, 
direction='vertical')\n```\n\n#### 裁剪\n\n`imcrop` 可以裁剪图像的一个或多个区域，每个区域用左上角和右下角坐标表示，形如(x1, y1, x2, y2)\n\n```python\nimport mmcv\nimport numpy as np\n\nimg = mmcv.imread('tests/data/color.jpg')\n\n# 裁剪区域 (10, 10, 100, 120)\nbboxes = np.array([10, 10, 100, 120])\npatch = mmcv.imcrop(img, bboxes)\n\n# 裁剪两个区域，分别是 (10, 10, 100, 120) 和 (0, 0, 50, 50)\nbboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]])\npatches = mmcv.imcrop(img, bboxes)\n\n# 裁剪两个区域并且缩放区域1.2倍\npatches = mmcv.imcrop(img, bboxes, scale=1.2)\n```\n\n#### 填充\n\n`impad` and `impad_to_multiple` 可以用给定的值将图像填充至给定的尺寸。\n\n```python\nimg = mmcv.imread('tests/data/color.jpg')\n\n# 用给定值将图像填充至 (1000, 1200)\nimg_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0)\n\n# 用给定值分别填充图像的3个通道至 (1000, 1200)\nimg_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200))\n\n# 用给定值填充图像的左、右、上、下四条边\nimg_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0)\n\n# 用3个值分别填充图像的左、右、上、下四条边的3个通道\nimg_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200))\n\n# 将图像的四条边填充至能够被给定值整除\nimg_ = mmcv.impad_to_multiple(img, 32)\n```\n\n### 视频\n\n视频模块提供了以下的功能：\n\n- 一个 `VideoReader` 类，具有友好的 API 接口可以读取和转换视频\n- 一些编辑视频的方法，包括 `cut` ， `concat` ， `resize`\n- 光流的读取/保存/变换\n\n#### VideoReader\n\n`VideoReader` 类提供了和序列一样的接口去获取视频帧。该类会缓存所有被访问过的帧。\n\n```python\nvideo = mmcv.VideoReader('test.mp4')\n\n# 获取基本的信息\nprint(len(video))\nprint(video.width, video.height, video.resolution, video.fps)\n\n# 遍历所有的帧\nfor frame in video:\n    print(frame.shape)\n\n# 读取下一帧\nimg = video.read()\n\n# 使用索引获取帧\nimg = video[100]\n\n# 获取指定范围的帧\nimg = video[5:10]\n```\n\n将视频切成帧并保存至给定目录或者从给定目录中生成视频。\n\n```python\n# 将视频切成帧并保存至目录\nvideo = mmcv.VideoReader('test.mp4')\nvideo.cvt2frames('out_dir')\n\n# 从给定目录中生成视频\nmmcv.frames2video('out_dir', 'test.avi')\n```\n\n#### 编辑函数\n\n有几个用于编辑视频的函数，这些函数是对 `ffmpeg` 的封装。\n\n```python\n# 裁剪视频\nmmcv.cut_video('test.mp4', 'clip1.mp4', start=3, end=10, vcodec='h264')\n\n# 将多个视频拼接成一个视频\nmmcv.concat_video(['clip1.mp4', 'clip2.mp4'], 
'joined.mp4', log_level='quiet')\n\n# 将视频缩放至给定的尺寸\nmmcv.resize_video('test.mp4', 'resized1.mp4', (360, 240))\n\n# 将视频缩放至给定的倍率\nmmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2)\n```\n\n#### 光流\n\n`mmcv` 提供了以下用于操作光流的函数：\n\n- 读取/保存\n- 可视化\n- 流变换\n\n我们提供了两种将光流dump到文件的方法，分别是非压缩和压缩的方法。非压缩的方法直接将浮点数值的光流\n保存至二进制文件，虽然光流无损但文件会比较大。而压缩的方法先量化光流至 0-255 整型数值再保存为\njpeg图像。光流的x维度和y维度会被拼接到图像中。\n\n1. 读取/保存\n\n```python\nflow = np.random.rand(800, 600, 2).astype(np.float32)\n# 保存光流到flo文件 (~3.7M)\nmmcv.flowwrite(flow, 'uncompressed.flo')\n# 保存光流为jpeg图像 (~230K)，图像的尺寸为 (800, 1200)\nmmcv.flowwrite(flow, 'compressed.jpg', quantize=True, concat_axis=1)\n\n# 读取光流文件，以下两种方式读取的光流尺寸均为 (800, 600, 2)\nflow = mmcv.flowread('uncompressed.flo')\nflow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1)\n```\n\n2. 可视化\n\n使用 `mmcv.flowshow()` 可视化光流\n\n```python\nmmcv.flowshow(flow)\n```\n\n![progress](../../en/_static/flow_visualization.png)\n\n3. 流变换\n\n```python\nimg1 = mmcv.imread('img1.jpg')\nflow = mmcv.flowread('flow.flo')\nwarped_img2 = mmcv.flow_warp(img1, flow)\n```\n\nimg1 (左) 和 img2 (右)\n\n![raw images](../../en/_static/flow_raw_images.png)\n\n光流 (img2 -> img1)\n\n![optical flow](../../en/_static/flow_img2toimg1.png)\n\n变换后的图像和真实图像的差异\n\n![warped image](../../en/_static/flow_warp_diff.png)\n"
  },
  {
    "path": "docs/zh_cn/understand_mmcv/data_transform.md",
    "content": "# 数据变换\n\n在 OpenMMLab 算法库中，数据集的构建和数据的准备是相互解耦的。通常，数据集的构建只对数据集进行解析，记录每个样本的基本信息；而数据的准备则是通过一系列的数据变换，根据样本的基本信息进行数据加载、预处理、格式化等操作。\n\n## 数据变换的设计\n\n在 MMCV 中，我们使用各种可调用的数据变换类来进行数据的操作。这些数据变换类可以接受若干配置参数进行实例化，之后通过调用的方式对输入的数据字典进行处理。同时，我们约定所有数据变换都接受一个字典作为输入，并将处理后的数据输出为一个字典。一个简单的例子如下：\n\n```python\n>>> import numpy as np\n>>> from mmcv.transforms import Resize\n>>>\n>>> transform = Resize(scale=(224, 224))\n>>> data_dict = {'img': np.random.rand(256, 256, 3)}\n>>> data_dict = transform(data_dict)\n>>> print(data_dict['img'].shape)\n(224, 224, 3)\n```\n\n数据变换类会读取输入字典的某些字段，并且可能添加、或者更新某些字段。这些字段的键大部分情况下是固定的，如 `Resize` 会固定地读取输入字典中的 `\"img\"` 等字段。我们可以在对应类的文档中了解对输入输出字段的约定。\n\n```{note}\n默认情况下，在需要图像尺寸作为**初始化参数**的数据变换 (如Resize, Pad) 中，图像尺寸的顺序均为 (width, height)。在数据变换**返回的字典**中，图像相关的尺寸， 如 `img_shape`、`ori_shape`、`pad_shape` 等，均为 (height, width)。\n```\n\nMMCV 为所有的数据变换类提供了一个统一的基类 (`BaseTransform`)：\n\n```python\nclass BaseTransform(metaclass=ABCMeta):\n\n    def __call__(self, results: dict) -> dict:\n\n        return self.transform(results)\n\n    @abstractmethod\n    def transform(self, results: dict) -> dict:\n        pass\n```\n\n所有的数据变换类都需要继承 `BaseTransform`，并实现 `transform` 方法。`transform` 方法的输入和输出均为一个字典。在**自定义数据变换类**一节中，我们会更详细地介绍如何实现一个数据变换类。\n\n## 数据流水线\n\n如上所述，所有数据变换的输入和输出都是一个字典，而且根据 OpenMMLab 中 [有关数据集的约定](TODO)，数据集中每个样本的基本信息都是一个字典。这样一来，我们可以将所有的数据变换操作首尾相接，组合成为一条数据流水线（data pipeline），输入数据集中样本的信息字典，输出完成一系列处理后的信息字典。\n\n以分类任务为例，我们在下图展示了一个典型的数据流水线。对每个样本，数据集中保存的基本信息是一个如图中最左侧所示的字典，之后每经过一个由蓝色块代表的数据变换操作，数据字典中都会加入新的字段（标记为绿色）或更新现有的字段（标记为橙色）。\n\n<div align=center>\n<img src=\"https://user-images.githubusercontent.com/26739999/154197953-bf0b1a16-3f41-4bc7-9e67-b2b9b323d895.png\" width=\"90%\"/>\n</div>\n\n在配置文件中，数据流水线是一个若干数据变换配置字典组成的列表，每个数据集都需要设置参数 `pipeline` 来定义该数据集需要进行的数据准备操作。如上数据流水线在配置文件中的配置如下：\n\n```python\npipeline = [\n    dict(type='LoadImageFromFile'),\n    dict(type='Resize', size=256, keep_ratio=True),\n    dict(type='CenterCrop', crop_size=224),\n    dict(type='Normalize', 
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),\n    dict(type='ClsFormatBundle')\n]\n\ndataset = dict(\n    ...\n    pipeline=pipeline,\n    ...\n)\n```\n\n## 常用的数据变换类\n\n按照功能，常用的数据变换类可以大致分为数据加载、数据预处理与增强、数据格式化。在 MMCV 中，我们提供了一些常用的数据变换类如下：\n\n### 数据加载\n\n为了支持大规模数据集的加载，通常在 `Dataset` 初始化时不加载数据，只加载相应的路径。因此需要在数据流水线中进行具体数据的加载。\n\n|            class            |                   功能                    |\n| :-------------------------: | :---------------------------------------: |\n| [`LoadImageFromFile`](TODO) |             根据路径加载图像              |\n|  [`LoadAnnotations`](TODO)  | 加载和组织标注信息，如 bbox、语义分割图等 |\n\n### 数据预处理及增强\n\n数据预处理和增强通常是对图像本身进行变换，如裁剪、填充、缩放等。\n\n|              class               |                功能                |\n| :------------------------------: | :--------------------------------: |\n|          [`Pad`](TODO)           |            填充图像边缘            |\n|       [`CenterCrop`](TODO)       |              居中裁剪              |\n|       [`Normalize`](TODO)        |          对图像进行归一化          |\n|         [`Resize`](TODO)         |     按照指定尺寸或比例缩放图像     |\n|      [`RandomResize`](TODO)      |    缩放图像至指定范围的随机尺寸    |\n| [`RandomMultiscaleResize`](TODO) | 缩放图像至多个尺寸中的随机一个尺寸 |\n|    [`RandomGrayscale`](TODO)     |             随机灰度化             |\n|       [`RandomFlip`](TODO)       |            图像随机翻转            |\n|   [`MultiScaleFlipAug`](TODO)    |   支持缩放和翻转的测试时数据增强   |\n\n### 数据格式化\n\n数据格式化操作通常是对数据进行的类型转换。\n\n|          class          |               功能                |\n| :---------------------: | :-------------------------------: |\n|   [`ToTensor`](TODO)    | 将指定的数据转换为 `torch.Tensor` |\n| [`ImageToTensor`](TODO) |    将图像转换为 `torch.Tensor`    |\n\n## 自定义数据变换类\n\n要实现一个新的数据变换类，需要继承 `BaseTransform`，并实现 `transform` 方法。这里，我们使用一个简单的翻转变换（`MyFlip`）作为示例：\n\n```python\nimport random\nimport mmcv\nfrom mmcv.transforms import BaseTransform, TRANSFORMS\n\n@TRANSFORMS.register_module()\nclass MyFlip(BaseTransform):\n    def __init__(self, direction: str):\n     
   super().__init__()\n        self.direction = direction\n\n    def transform(self, results: dict) -> dict:\n        img = results['img']\n        results['img'] = mmcv.imflip(img, direction=self.direction)\n        return results\n```\n\n从而，我们可以实例化一个 `MyFlip` 对象，并将之作为一个可调用对象，来处理我们的数据字典。\n\n```python\nimport numpy as np\n\ntransform = MyFlip(direction='horizontal')\ndata_dict = {'img': np.random.rand(224, 224, 3)}\ndata_dict = transform(data_dict)\nprocessed_img = data_dict['img']\n```\n\n又或者，在配置文件的 pipeline 中使用 `MyFlip` 变换\n\n```python\npipeline = [\n    ...\n    dict(type='MyFlip', direction='horizontal'),\n    ...\n]\n```\n\n需要注意的是，如需在配置文件中使用，需要保证 `MyFlip` 类所在的文件在运行时能够被导入。\n\n## 变换包装\n\n变换包装是一种特殊的数据变换类，他们本身并不操作数据字典中的图像、标签等信息，而是对其中定义的数据变换的行为进行增强。\n\n### 字段映射（KeyMapper）\n\n字段映射包装（`KeyMapper`）用于对数据字典中的字段进行映射。例如，一般的图像处理变换都从数据字典中的 `\"img\"` 字段获得值。但有些时候，我们希望这些变换处理数据字典中其他字段中的图像，比如 `\"gt_img\"` 字段。\n\n如果配合注册器和配置文件使用的话，在配置文件中数据集的 `pipeline` 中如下例使用字段映射包装：\n\n```python\npipeline = [\n    ...\n    dict(type='KeyMapper',\n        mapping={\n            'img': 'gt_img',  # 将 \"gt_img\" 字段映射至 \"img\" 字段\n            'mask': ...,  # 不使用原始数据中的 \"mask\" 字段。即对于被包装的数据变换，数据中不包含 \"mask\" 字段\n        },\n        auto_remap=True,  # 在完成变换后，将 \"img\" 重映射回 \"gt_img\" 字段\n        transforms=[\n            # 在 `RandomFlip` 变换类中，我们只需要操作 \"img\" 字段即可\n            dict(type='RandomFlip'),\n        ])\n    ...\n]\n```\n\n利用字段映射包装，我们在实现数据变换类时，不需要考虑在 `transform` 方法中考虑各种可能的输入字段名，只需要处理默认的字段即可。\n\n### 随机选择（RandomChoice）和随机执行（RandomApply）\n\n随机选择包装（`RandomChoice`）用于从一系列数据变换组合中随机应用一个数据变换组合。利用这一包装，我们可以简单地实现一些数据增强功能，比如 AutoAugment。\n\n如果配合注册器和配置文件使用的话，在配置文件中数据集的 `pipeline` 中如下例使用随机选择包装：\n\n```python\npipeline = [\n    ...\n    dict(type='RandomChoice',\n        transforms=[\n            [\n                dict(type='Posterize', bits=4),\n                dict(type='Rotate', angle=30.)\n            ],  # 第一种随机变化组合\n            [\n                dict(type='Equalize'),\n                
dict(type='Rotate', angle=30)\n            ],  # 第二种随机变换组合\n        ],\n        prob=[0.4, 0.6]  # 两种随机变换组合各自的选用概率\n        )\n    ...\n]\n```\n\n随机执行包装（`RandomApply`）用于以指定概率随机执行数据变换组合。例如：\n\n```python\npipeline = [\n    ...\n    dict(type='RandomApply',\n        transforms=[dict(type='Rotate', angle=30.)],\n        prob=0.3)  # 以 0.3 的概率执行被包装的数据变换\n    ...\n]\n```\n\n### 多目标扩展（TransformBroadcaster）\n\n通常，一个数据变换类只会从一个固定的字段读取操作目标。虽然我们也可以使用 `KeyMapper` 来改变读取的字段，但无法将变换一次性应用于多个字段的数据。为了实现这一功能，我们需要借助多目标扩展包装（`TransformBroadcaster`）。\n\n多目标扩展包装（`TransformBroadcaster`）有两个用法，一是将数据变换作用于指定的多个字段，二是将数据变换作用于某个字段下的一组目标中。\n\n1. 应用于多个字段\n\n   假设我们需要将数据变换应用于 `\"lq\"` (low-quality) 和 `\"gt\"` (ground-truth) 两个字段中的图像上。\n\n   ```python\n   pipeline = [\n       dict(type='TransformBroadcaster',\n           # 分别应用于 \"lq\" 和 \"gt\" 两个字段，并将二者应设置 \"img\" 字段\n           mapping={'img': ['lq', 'gt']},\n           # 在完成变换后，将 \"img\" 字段重映射回原先的字段\n           auto_remap=True,\n           # 是否在对各目标的变换中共享随机变量\n           # 更多介绍参加后续章节（随机变量共享）\n           share_random_params=True,\n           transforms=[\n               # 在 `RandomFlip` 变换类中，我们只需要操作 \"img\" 字段即可\n               dict(type='RandomFlip'),\n           ])\n   ]\n   ```\n\n   在多目标扩展的 `mapping` 设置中，我们同样可以使用 `...` 来忽略指定的原始字段。如以下例子中，被包裹的 `RandomCrop` 会对字段 `\"img\"` 中的图像进行裁剪，并且在字段 `\"img_shape\"` 存在时更新剪裁后的图像大小。如果我们希望同时对两个图像字段 `\"lq\"` 和 `\"gt\"` 进行相同的随机裁剪，但只更新一次 `\"img_shape\"` 字段，可以通过例子中的方式实现：\n\n   ```python\n   pipeline = [\n       dict(type='TransformBroadcaster',\n           mapping={\n               'img': ['lq', 'gt'],\n               'img_shape': ['img_shape', ...],\n            },\n           # 在完成变换后，将 \"img\" 和 \"img_shape\" 字段重映射回原先的字段\n           auto_remap=True,\n           # 是否在对各目标的变换中共享随机变量\n           # 更多介绍参加后续章节（随机变量共享）\n           share_random_params=True,\n           transforms=[\n               # `RandomCrop` 类中会操作 \"img\" 和 \"img_shape\" 字段。若 \"img_shape\" 空缺，\n               # 则只操作 \"img\"\n               
dict(type='RandomCrop'),\n           ])\n   ]\n   ```\n\n2. 应用于一个字段的一组目标\n\n   假设我们需要将数据变换应用于 `\"images\"` 字段，该字段为一个图像组成的 list。\n\n   ```python\n   pipeline = [\n       dict(type='TransformBroadcaster',\n           # 将 \"images\" 字段下的每张图片映射至 \"img\" 字段\n           mapping={'img': 'images'},\n           # 在完成变换后，将 \"img\" 字段下的图片重映射回 \"images\" 字段的列表中\n           auto_remap=True,\n           # 是否在对各目标的变换中共享随机变量\n           share_random_params=True,\n           transforms=[\n               # 在 `RandomFlip` 变换类中，我们只需要操作 \"img\" 字段即可\n               dict(type='RandomFlip'),\n           ])\n   ]\n   ```\n\n#### 装饰器 `cache_randomness`\n\n在 `TransformBroadcaster` 中，我们提供了 `share_random_params` 选项来支持在多次数据变换中共享随机状态。例如，在超分辨率任务中，我们希望将随机变换**同步**作用于低分辨率图像和原始图像。如果我们希望在自定义的数据变换类中使用这一功能，需要在类中标注哪些随机变量是支持共享的。这可以通过装饰器 `cache_randomness` 来实现。\n\n以上文中的 `MyFlip` 为例，我们希望以一定的概率随机执行翻转：\n\n```python\nfrom mmcv.transforms.utils import cache_randomness\n\n@TRANSFORMS.register_module()\nclass MyRandomFlip(BaseTransform):\n    def __init__(self, prob: float, direction: str):\n        super().__init__()\n        self.prob = prob\n        self.direction = direction\n\n    @cache_randomness  # 标注该方法的输出为可共享的随机变量\n    def do_flip(self):\n        flip = True if random.random() > self.prob else False\n        return flip\n\n    def transform(self, results: dict) -> dict:\n        img = results['img']\n        if self.do_flip():\n            results['img'] = mmcv.imflip(img, direction=self.direction)\n        return results\n```\n\n在上面的例子中，我们用`cache_randomness` 装饰 `do_flip`方法，即将该方法返回值 `flip` 标注为一个支持共享的随机变量。进而，在 `TransformBroadcaster` 对多个目标的变换中，这一变量的值都会保持一致。\n\n#### 装饰器 `avoid_cache_randomness`\n\n在一些情况下，我们无法将数据变换中产生随机变量的过程单独放在类方法中。例如数据变换中使用的来自第三方库的模块，这些模块将随机变量相关的部分封装在了内部，导致无法将其抽出为数据变换的类方法。这样的数据变换无法通过装饰器 `cache_randomness` 标注支持共享的随机变量，进而无法在多目标扩展时共享随机变量。\n\n为了避免在多目标扩展中误用此类数据变换，我们提供了另一个装饰器 `avoid_cache_randomness`，用来对此类数据变换进行标记：\n\n```python\nfrom mmcv.transforms.utils import 
avoid_cache_randomness\n\n@TRANSFORMS.register_module()\n@avoid_cache_randomness\nclass MyRandomTransform(BaseTransform):\n\n    def transform(self, results: dict) -> dict:\n        ...\n```\n\n用 `avoid_cache_randomness` 标记的数据变换类，当其实例被 `TransformBroadcaster` 包装且将参数 `share_random_params` 设置为 True 时，会抛出异常，以此提醒用户不能这样使用。\n\n在使用 `avoid_cache_randomness` 时需要注意以下几点：\n\n1. `avoid_cache_randomness` 只用于装饰数据变换类（BaseTransform 的子类），而不能用于装饰其他一般的类、类方法或函数\n2. 被 `avoid_cache_randomness` 修饰的数据变换作为基类时，其子类将**不会继承**这一特性。如果子类仍无法共享随机变量，则应再次使用 `avoid_cache_randomness` 修饰\n3. 只有当一个数据变换具有随机性，且无法共享随机参数时，才需要以 `avoid_cache_randomness` 修饰。无随机性的数据变换不需要修饰\n"
  },
  {
    "path": "docs/zh_cn/understand_mmcv/ops.md",
    "content": "## 算子\n\nMMCV 提供了检测、分割等任务中常用的算子\n\n| Device                       | CPU | CUDA | MLU | MPS | Ascend |\n| ---------------------------- | --- | ---- | --- | --- | ------ |\n| ActiveRotatedFilter          | √   | √    |     |     | √      |\n| AssignScoreWithK             |     | √    |     |     |        |\n| BallQuery                    |     | √    | √   |     | √      |\n| BBoxOverlaps                 |     | √    | √   | √   | √      |\n| BorderAlign                  |     | √    |     |     |        |\n| BoxIouRotated                | √   | √    | √   |     | √      |\n| BoxIouQuadri                 | √   | √    |     |     |        |\n| CARAFE                       |     | √    | √   |     |        |\n| ChamferDistance              |     | √    |     |     | √      |\n| CrissCrossAttention          |     | √    |     |     |        |\n| ContourExpand                | √   |      |     |     |        |\n| ConvexIoU                    |     | √    |     |     |        |\n| CornerPool                   |     | √    |     |     |        |\n| Correlation                  |     | √    |     |     |        |\n| Deformable Convolution v1/v2 | √   | √    | √   |     | √      |\n| Deformable RoIPool           |     | √    | √   |     | √      |\n| DiffIoURotated               |     | √    | √   |     |        |\n| DynamicScatter               |     | √    | √   |     |        |\n| FurthestPointSample          |     | √    |     |     | √      |\n| FurthestPointSampleWithDist  |     | √    |     |     | √      |\n| FusedBiasLeakyrelu           |     | √    |     |     | √      |\n| GatherPoints                 |     | √    |     |     | √      |\n| GroupPoints                  |     | √    |     |     |        |\n| Iou3d                        |     | √    | √   |     |        |\n| KNN                          |     | √    |     |     |        |\n| MaskedConv                   |     | √    | √   |     | √      |\n| MergeCells                   |     | √    
|     |     |        |\n| MinAreaPolygon               |     | √    |     |     |        |\n| ModulatedDeformConv2d        | √   | √    | √   |     | √      |\n| MultiScaleDeformableAttn     |     | √    | √   |     | √      |\n| NMS                          | √   | √    | √   |     | √      |\n| NMSRotated                   | √   | √    | √   |     | √      |\n| NMSQuadri                    | √   | √    |     |     |        |\n| PixelGroup                   | √   |      |     |     |        |\n| PointsInBoxes                | √   | √    |     |     |        |\n| PointsInPolygons             |     | √    |     |     |        |\n| PSAMask                      | √   | √    | √   |     | √      |\n| RotatedFeatureAlign          | √   | √    | √   |     | √      |\n| RoIPointPool3d               |     | √    | √   |     |        |\n| RoIPool                      |     | √    | √   |     | √      |\n| RoIAlignRotated              | √   | √    | √   |     | √      |\n| RiRoIAlignRotated            |     | √    |     |     |        |\n| RoIAlign                     | √   | √    | √   |     | √      |\n| RoIAwarePool3d               |     | √    | √   |     |        |\n| SAConv2d                     |     | √    |     |     |        |\n| SigmoidFocalLoss             |     | √    | √   |     | √      |\n| SoftmaxFocalLoss             |     | √    |     |     | √      |\n| SoftNMS                      |     | √    |     |     |        |\n| Sparse Convolution           |     | √    | √   |     |        |\n| Synchronized BatchNorm       |     | √    |     |     |        |\n| ThreeInterpolate             |     | √    |     |     |        |\n| ThreeNN                      |     | √    | √   |     |        |\n| TINShift                     |     | √    | √   |     |        |\n| UpFirDn2d                    |     | √    |     |     |        |\n| Voxelization                 | √   | √    | √   |     | √      |\n| PrRoIPool                    |     | √    |     |     |        |\n| 
BezierAlign                  | √   | √    |     |     |        |\n| BiasAct                      |     | √    |     |     |        |\n| FilteredLrelu                |     | √    |     |     |        |\n| Conv2dGradfix                |     | √    |     |     |        |\n"
  },
  {
    "path": "docs/zh_cn/understand_mmcv/visualization.md",
    "content": "## 可视化\n\n`mmcv` 可以展示图像以及标注（目前只支持标注框）\n\n```python\n# 展示图像文件\nmmcv.imshow('a.jpg')\n\n# 展示已加载的图像\nimg = np.random.rand(100, 100, 3)\nmmcv.imshow(img)\n\n# 展示带有标注框的图像\nimg = np.random.rand(100, 100, 3)\nbboxes = np.array([[0, 0, 50, 50], [20, 20, 60, 60]])\nmmcv.imshow_bboxes(img, bboxes)\n```\n\n`mmcv` 也可以展示特殊的图像，例如光流\n\n```python\nflow = mmcv.flowread('test.flo')\nmmcv.flowshow(flow)\n```\n"
  },
  {
    "path": "mmcv/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n# flake8: noqa\nfrom .arraymisc import *\nfrom .image import *\nfrom .transforms import *\nfrom .version import *\nfrom .video import *\nfrom .visualization import *\n\n# The following modules are not imported to this level, so mmcv may be used\n# without PyTorch.\n# - ops\n# - utils\n"
  },
  {
    "path": "mmcv/arraymisc/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .quantization import dequantize, quantize\n\n__all__ = ['quantize', 'dequantize']\n"
  },
  {
    "path": "mmcv/arraymisc/quantization.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Union\n\nimport numpy as np\n\n\ndef quantize(arr: np.ndarray,\n             min_val: Union[int, float],\n             max_val: Union[int, float],\n             levels: int,\n             dtype=np.int64) -> tuple:\n    \"\"\"Quantize an array of (-inf, inf) to [0, levels-1].\n\n    Args:\n        arr (ndarray): Input array.\n        min_val (int or float): Minimum value to be clipped.\n        max_val (int or float): Maximum value to be clipped.\n        levels (int): Quantization levels.\n        dtype (np.type): The type of the quantized array.\n\n    Returns:\n        tuple: Quantized array.\n    \"\"\"\n    if not (isinstance(levels, int) and levels > 1):\n        raise ValueError(\n            f'levels must be a positive integer, but got {levels}')\n    if min_val >= max_val:\n        raise ValueError(\n            f'min_val ({min_val}) must be smaller than max_val ({max_val})')\n\n    arr = np.clip(arr, min_val, max_val) - min_val\n    quantized_arr = np.minimum(\n        np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)\n\n    return quantized_arr\n\n\ndef dequantize(arr: np.ndarray,\n               min_val: Union[int, float],\n               max_val: Union[int, float],\n               levels: int,\n               dtype=np.float64) -> tuple:\n    \"\"\"Dequantize an array.\n\n    Args:\n        arr (ndarray): Input array.\n        min_val (int or float): Minimum value to be clipped.\n        max_val (int or float): Maximum value to be clipped.\n        levels (int): Quantization levels.\n        dtype (np.type): The type of the dequantized array.\n\n    Returns:\n        tuple: Dequantized array.\n    \"\"\"\n    if not (isinstance(levels, int) and levels > 1):\n        raise ValueError(\n            f'levels must be a positive integer, but got {levels}')\n    if min_val >= max_val:\n        raise ValueError(\n            f'min_val ({min_val}) must be 
smaller than max_val ({max_val})')\n\n    dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -\n                                                   min_val) / levels + min_val\n\n    return dequantized_arr\n"
  },
  {
    "path": "mmcv/cnn/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .alexnet import AlexNet\n# yapf: disable\nfrom .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,\n                     ConvTranspose2d, ConvTranspose3d, ConvWS2d,\n                     DepthwiseSeparableConvModule, GeneralizedAttention,\n                     HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,\n                     NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,\n                     build_activation_layer, build_conv_layer,\n                     build_norm_layer, build_padding_layer, build_plugin_layer,\n                     build_upsample_layer, conv_ws_2d, is_norm)\n# yapf: enable\nfrom .resnet import ResNet, make_res_layer\nfrom .rfsearch import Conv2dRFSearchOp, RFSearchHook\nfrom .utils import fuse_conv_bn, get_model_complexity_info\nfrom .vgg import VGG, make_vgg_layer\n\n__all__ = [\n    'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',\n    'ConvModule', 'build_activation_layer', 'build_conv_layer',\n    'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',\n    'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d',\n    'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention',\n    'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',\n    'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d',\n    'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn',\n    'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook'\n]\n"
  },
  {
    "path": "mmcv/cnn/alexnet.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport logging\nfrom typing import Optional\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.runner import load_checkpoint\n\n\nclass AlexNet(nn.Module):\n    \"\"\"AlexNet backbone.\n\n    Args:\n        num_classes (int): number of classes for classification.\n    \"\"\"\n\n    def __init__(self, num_classes: int = -1):\n        super().__init__()\n        self.num_classes = num_classes\n        self.features = nn.Sequential(\n            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),\n            nn.ReLU(inplace=True),\n            nn.MaxPool2d(kernel_size=3, stride=2),\n            nn.Conv2d(64, 192, kernel_size=5, padding=2),\n            nn.ReLU(inplace=True),\n            nn.MaxPool2d(kernel_size=3, stride=2),\n            nn.Conv2d(192, 384, kernel_size=3, padding=1),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(384, 256, kernel_size=3, padding=1),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(256, 256, kernel_size=3, padding=1),\n            nn.ReLU(inplace=True),\n            nn.MaxPool2d(kernel_size=3, stride=2),\n        )\n        if self.num_classes > 0:\n            self.classifier = nn.Sequential(\n                nn.Dropout(),\n                nn.Linear(256 * 6 * 6, 4096),\n                nn.ReLU(inplace=True),\n                nn.Dropout(),\n                nn.Linear(4096, 4096),\n                nn.ReLU(inplace=True),\n                nn.Linear(4096, num_classes),\n            )\n\n    def init_weights(self, pretrained: Optional[str] = None) -> None:\n        if isinstance(pretrained, str):\n            logger = logging.getLogger()\n            load_checkpoint(self, pretrained, strict=False, logger=logger)\n        elif pretrained is None:\n            # use default initializer\n            pass\n        else:\n            raise TypeError('pretrained must be a str or None')\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n\n        
x = self.features(x)\n        if self.num_classes > 0:\n            x = x.view(x.size(0), 256 * 6 * 6)\n            x = self.classifier(x)\n\n        return x\n"
  },
  {
    "path": "mmcv/cnn/bricks/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .activation import build_activation_layer\nfrom .context_block import ContextBlock\nfrom .conv import build_conv_layer\nfrom .conv2d_adaptive_padding import Conv2dAdaptivePadding\nfrom .conv_module import ConvModule\nfrom .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d\nfrom .depthwise_separable_conv_module import DepthwiseSeparableConvModule\nfrom .drop import Dropout, DropPath\nfrom .generalized_attention import GeneralizedAttention\nfrom .hsigmoid import HSigmoid\nfrom .hswish import HSwish\nfrom .non_local import NonLocal1d, NonLocal2d, NonLocal3d\nfrom .norm import build_norm_layer, is_norm\nfrom .padding import build_padding_layer\nfrom .plugin import build_plugin_layer\nfrom .scale import LayerScale, Scale\nfrom .swish import Swish\nfrom .upsample import build_upsample_layer\nfrom .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,\n                       Linear, MaxPool2d, MaxPool3d)\n\n__all__ = [\n    'ConvModule', 'build_activation_layer', 'build_conv_layer',\n    'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',\n    'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',\n    'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',\n    'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d',\n    'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding',\n    'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d',\n    'Conv3d', 'Dropout', 'DropPath', 'LayerScale'\n]\n"
  },
  {
    "path": "mmcv/cnn/bricks/activation.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Dict\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nfor module in [\n        nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,\n        nn.Sigmoid, nn.Tanh\n]:\n    MODELS.register_module(module=module)\n\nif digit_version(torch.__version__) >= digit_version('1.7.0'):\n    MODELS.register_module(module=nn.SiLU, name='SiLU')\nelse:\n\n    class SiLU(nn.Module):\n        \"\"\"Sigmoid Weighted Liner Unit.\"\"\"\n\n        def __init__(self, inplace=False):\n            super().__init__()\n            self.inplace = inplace\n\n        def forward(self, inputs) -> torch.Tensor:\n            if self.inplace:\n                return inputs.mul_(torch.sigmoid(inputs))\n            else:\n                return inputs * torch.sigmoid(inputs)\n\n    MODELS.register_module(module=SiLU, name='SiLU')\n\n\n@MODELS.register_module(name='Clip')\n@MODELS.register_module()\nclass Clamp(nn.Module):\n    \"\"\"Clamp activation layer.\n\n    This activation function is to clamp the feature map value within\n    :math:`[min, max]`. 
More details can be found in ``torch.clamp()``.\n\n    Args:\n        min (Number | optional): Lower-bound of the range to be clamped to.\n            Default to -1.\n        max (Number | optional): Upper-bound of the range to be clamped to.\n            Default to 1.\n    \"\"\"\n\n    def __init__(self, min: float = -1., max: float = 1.):\n        super().__init__()\n        self.min = min\n        self.max = max\n\n    def forward(self, x) -> torch.Tensor:\n        \"\"\"Forward function.\n\n        Args:\n            x (torch.Tensor): The input tensor.\n\n        Returns:\n            torch.Tensor: Clamped tensor.\n        \"\"\"\n        return torch.clamp(x, min=self.min, max=self.max)\n\n\nclass GELU(nn.Module):\n    r\"\"\"Applies the Gaussian Error Linear Units function:\n\n    .. math::\n        \\text{GELU}(x) = x * \\Phi(x)\n    where :math:`\\Phi(x)` is the Cumulative Distribution Function for\n    Gaussian Distribution.\n\n    Shape:\n        - Input: :math:`(N, *)` where `*` means, any number of additional\n          dimensions\n        - Output: :math:`(N, *)`, same shape as the input\n\n    .. image:: scripts/activation_images/GELU.png\n\n    Examples::\n\n        >>> m = nn.GELU()\n        >>> input = torch.randn(2)\n        >>> output = m(input)\n    \"\"\"\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        return F.gelu(input)\n\n\nif (TORCH_VERSION == 'parrots'\n        or digit_version(TORCH_VERSION) < digit_version('1.4')):\n    MODELS.register_module(module=GELU)\nelse:\n    MODELS.register_module(module=nn.GELU)\n\n\ndef build_activation_layer(cfg: Dict) -> nn.Module:\n    \"\"\"Build activation layer.\n\n    Args:\n        cfg (dict): The activation layer config, which should contain:\n\n            - type (str): Layer type.\n            - layer args: Args needed to instantiate an activation layer.\n\n    Returns:\n        nn.Module: Created activation layer.\n    \"\"\"\n    return MODELS.build(cfg)\n"
  },
  {
    "path": "mmcv/cnn/bricks/context_block.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Union\n\nimport torch\nfrom mmengine.model import constant_init, kaiming_init\nfrom mmengine.registry import MODELS\nfrom torch import nn\n\n\ndef last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None:\n    if isinstance(m, nn.Sequential):\n        constant_init(m[-1], val=0)\n    else:\n        constant_init(m, val=0)\n\n\n@MODELS.register_module()\nclass ContextBlock(nn.Module):\n    \"\"\"ContextBlock module in GCNet.\n\n    See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'\n    (https://arxiv.org/abs/1904.11492) for details.\n\n    Args:\n        in_channels (int): Channels of the input feature map.\n        ratio (float): Ratio of channels of transform bottleneck\n        pooling_type (str): Pooling method for context modeling.\n            Options are 'att' and 'avg', stand for attention pooling and\n            average pooling respectively. Default: 'att'.\n        fusion_types (Sequence[str]): Fusion method for feature fusion,\n            Options are 'channels_add', 'channel_mul', stand for channelwise\n            addition and multiplication respectively. 
Default: ('channel_add',)\n    \"\"\"\n\n    _abbr_ = 'context_block'\n\n    def __init__(self,\n                 in_channels: int,\n                 ratio: float,\n                 pooling_type: str = 'att',\n                 fusion_types: tuple = ('channel_add', )):\n        super().__init__()\n        assert pooling_type in ['avg', 'att']\n        assert isinstance(fusion_types, (list, tuple))\n        valid_fusion_types = ['channel_add', 'channel_mul']\n        assert all([f in valid_fusion_types for f in fusion_types])\n        assert len(fusion_types) > 0, 'at least one fusion should be used'\n        self.in_channels = in_channels\n        self.ratio = ratio\n        self.planes = int(in_channels * ratio)\n        self.pooling_type = pooling_type\n        self.fusion_types = fusion_types\n        if pooling_type == 'att':\n            self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)\n            self.softmax = nn.Softmax(dim=2)\n        else:\n            self.avg_pool = nn.AdaptiveAvgPool2d(1)\n        if 'channel_add' in fusion_types:\n            self.channel_add_conv = nn.Sequential(\n                nn.Conv2d(self.in_channels, self.planes, kernel_size=1),\n                nn.LayerNorm([self.planes, 1, 1]),\n                nn.ReLU(inplace=True),  # yapf: disable\n                nn.Conv2d(self.planes, self.in_channels, kernel_size=1))\n        else:\n            self.channel_add_conv = None\n        if 'channel_mul' in fusion_types:\n            self.channel_mul_conv = nn.Sequential(\n                nn.Conv2d(self.in_channels, self.planes, kernel_size=1),\n                nn.LayerNorm([self.planes, 1, 1]),\n                nn.ReLU(inplace=True),  # yapf: disable\n                nn.Conv2d(self.planes, self.in_channels, kernel_size=1))\n        else:\n            self.channel_mul_conv = None\n        self.reset_parameters()\n\n    def reset_parameters(self):\n        if self.pooling_type == 'att':\n            kaiming_init(self.conv_mask, 
mode='fan_in')\n            self.conv_mask.inited = True\n\n        if self.channel_add_conv is not None:\n            last_zero_init(self.channel_add_conv)\n        if self.channel_mul_conv is not None:\n            last_zero_init(self.channel_mul_conv)\n\n    def spatial_pool(self, x: torch.Tensor) -> torch.Tensor:\n        batch, channel, height, width = x.size()\n        if self.pooling_type == 'att':\n            input_x = x\n            # [N, C, H * W]\n            input_x = input_x.view(batch, channel, height * width)\n            # [N, 1, C, H * W]\n            input_x = input_x.unsqueeze(1)\n            # [N, 1, H, W]\n            context_mask = self.conv_mask(x)\n            # [N, 1, H * W]\n            context_mask = context_mask.view(batch, 1, height * width)\n            # [N, 1, H * W]\n            context_mask = self.softmax(context_mask)\n            # [N, 1, H * W, 1]\n            context_mask = context_mask.unsqueeze(-1)\n            # [N, 1, C, 1]\n            context = torch.matmul(input_x, context_mask)\n            # [N, C, 1, 1]\n            context = context.view(batch, channel, 1, 1)\n        else:\n            # [N, C, 1, 1]\n            context = self.avg_pool(x)\n\n        return context\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        # [N, C, 1, 1]\n        context = self.spatial_pool(x)\n\n        out = x\n        if self.channel_mul_conv is not None:\n            # [N, C, 1, 1]\n            channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))\n            out = out * channel_mul_term\n        if self.channel_add_conv is not None:\n            # [N, C, 1, 1]\n            channel_add_term = self.channel_add_conv(context)\n            out = out + channel_add_term\n\n        return out\n"
  },
  {
    "path": "mmcv/cnn/bricks/conv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport inspect\nfrom typing import Dict, Optional\n\nfrom mmengine.registry import MODELS\nfrom torch import nn\n\nMODELS.register_module('Conv1d', module=nn.Conv1d)\nMODELS.register_module('Conv2d', module=nn.Conv2d)\nMODELS.register_module('Conv3d', module=nn.Conv3d)\nMODELS.register_module('Conv', module=nn.Conv2d)\n\n\ndef build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:\n    \"\"\"Build convolution layer.\n\n    Args:\n        cfg (None or dict): The conv layer config, which should contain:\n            - type (str): Layer type.\n            - layer args: Args needed to instantiate an conv layer.\n        args (argument list): Arguments passed to the `__init__`\n            method of the corresponding conv layer.\n        kwargs (keyword arguments): Keyword arguments passed to the `__init__`\n            method of the corresponding conv layer.\n\n    Returns:\n        nn.Module: Created conv layer.\n    \"\"\"\n    if cfg is None:\n        cfg_ = dict(type='Conv2d')\n    else:\n        if not isinstance(cfg, dict):\n            raise TypeError('cfg must be a dict')\n        if 'type' not in cfg:\n            raise KeyError('the cfg dict must contain the key \"type\"')\n        cfg_ = cfg.copy()\n\n    layer_type = cfg_.pop('type')\n    if inspect.isclass(layer_type):\n        return layer_type(*args, **kwargs, **cfg_)  # type: ignore\n    # Switch registry to the target scope. If `conv_layer` cannot be found\n    # in the registry, fallback to search `conv_layer` in the\n    # mmengine.MODELS.\n    with MODELS.switch_scope_and_registry(None) as registry:\n        conv_layer = registry.get(layer_type)\n    if conv_layer is None:\n        raise KeyError(f'Cannot find {conv_layer} in registry under scope '\n                       f'name {registry.scope}')\n    layer = conv_layer(*args, **kwargs, **cfg_)\n\n    return layer\n"
  },
  {
    "path": "mmcv/cnn/bricks/conv2d_adaptive_padding.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\nfrom typing import Tuple, Union\n\nimport torch\nfrom mmengine.registry import MODELS\nfrom torch import nn\nfrom torch.nn import functional as F\n\n\n@MODELS.register_module()\nclass Conv2dAdaptivePadding(nn.Conv2d):\n    \"\"\"Implementation of 2D convolution in tensorflow with `padding` as \"same\",\n    which applies padding to input (if needed) so that input image gets fully\n    covered by filter and stride you specified. For stride 1, this will ensure\n    that output image size is same as input. For stride of 2, output dimensions\n    will be half, for example.\n\n    Args:\n        in_channels (int): Number of channels in the input image\n        out_channels (int): Number of channels produced by the convolution\n        kernel_size (int or tuple): Size of the convolving kernel\n        stride (int or tuple, optional): Stride of the convolution. Default: 1\n        padding (int or tuple, optional): Zero-padding added to both sides of\n            the input. Default: 0\n        dilation (int or tuple, optional): Spacing between kernel elements.\n            Default: 1\n        groups (int, optional): Number of blocked connections from input\n            channels to output channels. Default: 1\n        bias (bool, optional): If ``True``, adds a learnable bias to the\n            output. 
Default: ``True``\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, int]],\n                 stride: Union[int, Tuple[int, int]] = 1,\n                 padding: Union[int, Tuple[int, int]] = 0,\n                 dilation: Union[int, Tuple[int, int]] = 1,\n                 groups: int = 1,\n                 bias: bool = True):\n        super().__init__(in_channels, out_channels, kernel_size, stride, 0,\n                         dilation, groups, bias)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        img_h, img_w = x.size()[-2:]\n        kernel_h, kernel_w = self.weight.size()[-2:]\n        stride_h, stride_w = self.stride\n        output_h = math.ceil(img_h / stride_h)\n        output_w = math.ceil(img_w / stride_w)\n        pad_h = (\n            max((output_h - 1) * self.stride[0] +\n                (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))\n        pad_w = (\n            max((output_w - 1) * self.stride[1] +\n                (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))\n        if pad_h > 0 or pad_w > 0:\n            x = F.pad(x, [\n                pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2\n            ])\n        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,\n                        self.dilation, self.groups)\n"
  },
  {
    "path": "mmcv/cnn/bricks/conv_module.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nfrom functools import partial\nfrom typing import Dict, Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.model import constant_init, kaiming_init\nfrom mmengine.registry import MODELS\nfrom mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm\n\nfrom .activation import build_activation_layer\nfrom .conv import build_conv_layer\nfrom .norm import build_norm_layer\nfrom .padding import build_padding_layer\n\n\ndef efficient_conv_bn_eval_forward(bn: _BatchNorm,\n                                   conv: nn.modules.conv._ConvNd,\n                                   x: torch.Tensor):\n    \"\"\"\n    Implementation based on https://arxiv.org/abs/2305.11624\n    \"Tune-Mode ConvBN Blocks For Efficient Transfer Learning\"\n    It leverages the associative law between convolution and affine transform,\n    i.e., normalize (weight conv feature) = (normalize weight) conv feature.\n    It works for Eval mode of ConvBN blocks during validation, and can be used\n    for training as well. 
It reduces memory and computation cost.\n\n    Args:\n        bn (_BatchNorm): a BatchNorm module.\n        conv (nn._ConvNd): a conv module\n        x (torch.Tensor): Input feature map.\n    \"\"\"\n    # These lines of code are designed to deal with various cases\n    # like bn without affine transform, and conv without bias\n    weight_on_the_fly = conv.weight\n    if conv.bias is not None:\n        bias_on_the_fly = conv.bias\n    else:\n        bias_on_the_fly = torch.zeros_like(bn.running_var)\n\n    if bn.weight is not None:\n        bn_weight = bn.weight\n    else:\n        bn_weight = torch.ones_like(bn.running_var)\n\n    if bn.bias is not None:\n        bn_bias = bn.bias\n    else:\n        bn_bias = torch.zeros_like(bn.running_var)\n\n    # shape of [C_out, 1, 1, 1] in Conv2d\n    weight_coeff = torch.rsqrt(bn.running_var +\n                               bn.eps).reshape([-1] + [1] *\n                                               (len(conv.weight.shape) - 1))\n    # shape of [C_out, 1, 1, 1] in Conv2d\n    coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff\n\n    # shape of [C_out, C_in, k, k] in Conv2d\n    weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly\n    # shape of [C_out] in Conv2d\n    bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() *\\\n        (bias_on_the_fly - bn.running_mean)\n\n    return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly)\n\n\n@MODELS.register_module()\nclass ConvModule(nn.Module):\n    \"\"\"A conv block that bundles conv/norm/activation layers.\n\n    This block simplifies the usage of convolution layers, which are commonly\n    used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).\n    It is based upon three build methods: `build_conv_layer()`,\n    `build_norm_layer()` and `build_activation_layer()`.\n\n    Besides, we add some additional features in this module.\n    1. Automatically set `bias` of the conv layer.\n    2. 
Spectral norm is supported.\n    3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only\n    supports zero and circular padding, and we add \"reflect\" padding mode.\n\n    Args:\n        in_channels (int): Number of channels in the input feature map.\n            Same as that in ``nn._ConvNd``.\n        out_channels (int): Number of channels produced by the convolution.\n            Same as that in ``nn._ConvNd``.\n        kernel_size (int | tuple[int]): Size of the convolving kernel.\n            Same as that in ``nn._ConvNd``.\n        stride (int | tuple[int]): Stride of the convolution.\n            Same as that in ``nn._ConvNd``.\n        padding (int | tuple[int]): Zero-padding added to both sides of\n            the input. Same as that in ``nn._ConvNd``.\n        dilation (int | tuple[int]): Spacing between kernel elements.\n            Same as that in ``nn._ConvNd``.\n        groups (int): Number of blocked connections from input channels to\n            output channels. Same as that in ``nn._ConvNd``.\n        bias (bool | str): If specified as `auto`, it will be decided by the\n            norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise\n            False. Default: \"auto\".\n        conv_cfg (dict): Config dict for convolution layer. Default: None,\n            which means using conv2d.\n        norm_cfg (dict): Config dict for normalization layer. Default: None.\n        act_cfg (dict): Config dict for activation layer.\n            Default: dict(type='ReLU').\n        inplace (bool): Whether to use inplace mode for activation.\n            Default: True.\n        with_spectral_norm (bool): Whether use spectral norm in conv module.\n            Default: False.\n        padding_mode (str): If the `padding_mode` has not been supported by\n            current `Conv2d` in PyTorch, we will use our own padding layer\n            instead. 
Currently, we support ['zeros', 'circular'] with official\n            implementation and ['reflect'] with our own implementation.\n            Default: 'zeros'.\n        order (tuple[str]): The order of conv/norm/activation layers. It is a\n            sequence of \"conv\", \"norm\" and \"act\". Common examples are\n            (\"conv\", \"norm\", \"act\") and (\"act\", \"conv\", \"norm\").\n            Default: ('conv', 'norm', 'act').\n        efficient_conv_bn_eval (bool): Whether use efficient conv when the\n            consecutive bn is in eval mode (either training or testing), as\n            proposed in https://arxiv.org/abs/2305.11624 . Default: `False`.\n    \"\"\"\n\n    _abbr_ = 'conv_block'\n\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, int]],\n                 stride: Union[int, Tuple[int, int]] = 1,\n                 padding: Union[int, Tuple[int, int]] = 0,\n                 dilation: Union[int, Tuple[int, int]] = 1,\n                 groups: int = 1,\n                 bias: Union[bool, str] = 'auto',\n                 conv_cfg: Optional[Dict] = None,\n                 norm_cfg: Optional[Dict] = None,\n                 act_cfg: Optional[Dict] = dict(type='ReLU'),\n                 inplace: bool = True,\n                 with_spectral_norm: bool = False,\n                 padding_mode: str = 'zeros',\n                 order: tuple = ('conv', 'norm', 'act'),\n                 efficient_conv_bn_eval: bool = False):\n        super().__init__()\n        assert conv_cfg is None or isinstance(conv_cfg, dict)\n        assert norm_cfg is None or isinstance(norm_cfg, dict)\n        assert act_cfg is None or isinstance(act_cfg, dict)\n        official_padding_mode = ['zeros', 'circular']\n        self.conv_cfg = conv_cfg\n        self.norm_cfg = norm_cfg\n        self.act_cfg = act_cfg\n        self.inplace = inplace\n        self.with_spectral_norm = 
with_spectral_norm\n        self.with_explicit_padding = padding_mode not in official_padding_mode\n        self.order = order\n        assert isinstance(self.order, tuple) and len(self.order) == 3\n        assert set(order) == {'conv', 'norm', 'act'}\n\n        self.with_norm = norm_cfg is not None\n        self.with_activation = act_cfg is not None\n        # if the conv layer is before a norm layer, bias is unnecessary.\n        if bias == 'auto':\n            bias = not self.with_norm\n        self.with_bias = bias\n\n        if self.with_explicit_padding:\n            pad_cfg = dict(type=padding_mode)\n            self.padding_layer = build_padding_layer(pad_cfg, padding)\n\n        # reset padding to 0 for conv module\n        conv_padding = 0 if self.with_explicit_padding else padding\n        # build convolution layer\n        self.conv = build_conv_layer(\n            conv_cfg,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            padding=conv_padding,\n            dilation=dilation,\n            groups=groups,\n            bias=bias)\n        # export the attributes of self.conv to a higher level for convenience\n        self.in_channels = self.conv.in_channels\n        self.out_channels = self.conv.out_channels\n        self.kernel_size = self.conv.kernel_size\n        self.stride = self.conv.stride\n        self.padding = padding\n        self.dilation = self.conv.dilation\n        self.transposed = self.conv.transposed\n        self.output_padding = self.conv.output_padding\n        self.groups = self.conv.groups\n\n        if self.with_spectral_norm:\n            self.conv = nn.utils.spectral_norm(self.conv)\n\n        # build normalization layers\n        if self.with_norm:\n            # norm layer is after conv layer\n            if order.index('norm') > order.index('conv'):\n                norm_channels = out_channels\n            else:\n                norm_channels = 
in_channels\n            self.norm_name, norm = build_norm_layer(\n                norm_cfg, norm_channels)  # type: ignore\n            self.add_module(self.norm_name, norm)\n            if self.with_bias:\n                if isinstance(norm, (_BatchNorm, _InstanceNorm)):\n                    warnings.warn(\n                        'Unnecessary conv bias before batch/instance norm')\n        else:\n            self.norm_name = None  # type: ignore\n\n        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)\n\n        # build activation layer\n        if self.with_activation:\n            act_cfg_ = act_cfg.copy()  # type: ignore\n            # nn.Tanh has no 'inplace' argument\n            if act_cfg_['type'] not in [\n                    'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU'\n            ]:\n                act_cfg_.setdefault('inplace', inplace)\n            self.activate = build_activation_layer(act_cfg_)\n\n        # Use msra init by default\n        self.init_weights()\n\n    @property\n    def norm(self):\n        if self.norm_name:\n            return getattr(self, self.norm_name)\n        else:\n            return None\n\n    def init_weights(self):\n        # 1. It is mainly for customized conv layers with their own\n        #    initialization manners by calling their own ``init_weights()``,\n        #    and we do not want ConvModule to override the initialization.\n        # 2. 
For customized conv layers without their own initialization\n        #    manners (that is, they don't have their own ``init_weights()``)\n        #    and PyTorch's conv layers, they will be initialized by\n        #    this method with default ``kaiming_init``.\n        # Note: For PyTorch's conv layers, they will be overwritten by our\n        #    initialization implementation using default ``kaiming_init``.\n        if not hasattr(self.conv, 'init_weights'):\n            if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':\n                nonlinearity = 'leaky_relu'\n                a = self.act_cfg.get('negative_slope', 0.01)\n            else:\n                nonlinearity = 'relu'\n                a = 0\n            kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)\n        if self.with_norm:\n            constant_init(self.norm, 1, bias=0)\n\n    def forward(self,\n                x: torch.Tensor,\n                activate: bool = True,\n                norm: bool = True) -> torch.Tensor:\n        layer_index = 0\n        while layer_index < len(self.order):\n            layer = self.order[layer_index]\n            if layer == 'conv':\n                if self.with_explicit_padding:\n                    x = self.padding_layer(x)\n                # if the next operation is norm and we have a norm layer in\n                # eval mode and we have enabled `efficient_conv_bn_eval` for\n                # the conv operator, then activate the optimized forward and\n                # skip the next norm operator since it has been fused\n                if layer_index + 1 < len(self.order) and \\\n                        self.order[layer_index + 1] == 'norm' and norm and \\\n                        self.with_norm and not self.norm.training and \\\n                        self.efficient_conv_bn_eval_forward is not None:\n                    self.conv.forward = partial(\n                        self.efficient_conv_bn_eval_forward, self.norm,\n        
                self.conv)\n                    layer_index += 1\n                    x = self.conv(x)\n                    del self.conv.forward\n                else:\n                    x = self.conv(x)\n            elif layer == 'norm' and norm and self.with_norm:\n                x = self.norm(x)\n            elif layer == 'act' and activate and self.with_activation:\n                x = self.activate(x)\n            layer_index += 1\n        return x\n\n    def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):\n        # efficient_conv_bn_eval works for conv + bn\n        # with `track_running_stats` option\n        if efficient_conv_bn_eval and self.norm \\\n                            and isinstance(self.norm, _BatchNorm) \\\n                            and self.norm.track_running_stats:\n            self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward  # noqa: E501\n        else:\n            self.efficient_conv_bn_eval_forward = None  # type: ignore\n\n    @staticmethod\n    def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,\n                            bn: torch.nn.modules.batchnorm._BatchNorm,\n                            efficient_conv_bn_eval=True) -> 'ConvModule':\n        \"\"\"Create a ConvModule from a conv and a bn module.\"\"\"\n        self = ConvModule.__new__(ConvModule)\n        super(ConvModule, self).__init__()\n\n        self.conv_cfg = None\n        self.norm_cfg = None\n        self.act_cfg = None\n        self.inplace = False\n        self.with_spectral_norm = False\n        self.with_explicit_padding = False\n        self.order = ('conv', 'norm', 'act')\n\n        self.with_norm = True\n        self.with_activation = False\n        self.with_bias = conv.bias is not None\n\n        # build convolution layer\n        self.conv = conv\n        # export the attributes of self.conv to a higher level for convenience\n        self.in_channels = self.conv.in_channels\n        self.out_channels = 
self.conv.out_channels\n        self.kernel_size = self.conv.kernel_size\n        self.stride = self.conv.stride\n        self.padding = self.conv.padding\n        self.dilation = self.conv.dilation\n        self.transposed = self.conv.transposed\n        self.output_padding = self.conv.output_padding\n        self.groups = self.conv.groups\n\n        # build normalization layers\n        self.norm_name, norm = 'bn', bn\n        self.add_module(self.norm_name, norm)\n\n        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)\n\n        return self\n"
  },
  {
    "path": "mmcv/cnn/bricks/conv_ws.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom collections import OrderedDict\nfrom typing import Dict, List, Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.registry import MODELS\n\n\ndef conv_ws_2d(input: torch.Tensor,\n               weight: torch.Tensor,\n               bias: Optional[torch.Tensor] = None,\n               stride: Union[int, Tuple[int, int]] = 1,\n               padding: Union[int, Tuple[int, int]] = 0,\n               dilation: Union[int, Tuple[int, int]] = 1,\n               groups: int = 1,\n               eps: float = 1e-5) -> torch.Tensor:\n    c_in = weight.size(0)\n    weight_flat = weight.view(c_in, -1)\n    mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)\n    std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)\n    weight = (weight - mean) / (std + eps)\n    return F.conv2d(input, weight, bias, stride, padding, dilation, groups)\n\n\n@MODELS.register_module('ConvWS')\nclass ConvWS2d(nn.Conv2d):\n\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, int]],\n                 stride: Union[int, Tuple[int, int]] = 1,\n                 padding: Union[int, Tuple[int, int]] = 0,\n                 dilation: Union[int, Tuple[int, int]] = 1,\n                 groups: int = 1,\n                 bias: bool = True,\n                 eps: float = 1e-5):\n        super().__init__(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            bias=bias)\n        self.eps = eps\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,\n                          self.dilation, self.groups, 
self.eps)\n\n\n@MODELS.register_module(name='ConvAWS')\nclass ConvAWS2d(nn.Conv2d):\n    \"\"\"AWS (Adaptive Weight Standardization)\n\n    This is a variant of Weight Standardization\n    (https://arxiv.org/pdf/1903.10520.pdf)\n    It is used in DetectoRS to avoid NaN\n    (https://arxiv.org/pdf/2006.02334.pdf)\n\n    Args:\n        in_channels (int): Number of channels in the input image\n        out_channels (int): Number of channels produced by the convolution\n        kernel_size (int or tuple): Size of the conv kernel\n        stride (int or tuple, optional): Stride of the convolution. Default: 1\n        padding (int or tuple, optional): Zero-padding added to both sides of\n            the input. Default: 0\n        dilation (int or tuple, optional): Spacing between kernel elements.\n            Default: 1\n        groups (int, optional): Number of blocked connections from input\n            channels to output channels. Default: 1\n        bias (bool, optional): If set True, adds a learnable bias to the\n            output. 
Default: True\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, int]],\n                 stride: Union[int, Tuple[int, int]] = 1,\n                 padding: Union[int, Tuple[int, int]] = 0,\n                 dilation: Union[int, Tuple[int, int]] = 1,\n                 groups: int = 1,\n                 bias: bool = True):\n        super().__init__(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            bias=bias)\n        self.register_buffer('weight_gamma',\n                             torch.ones(self.out_channels, 1, 1, 1))\n        self.register_buffer('weight_beta',\n                             torch.zeros(self.out_channels, 1, 1, 1))\n\n    def _get_weight(self, weight: torch.Tensor) -> torch.Tensor:\n        weight_flat = weight.view(weight.size(0), -1)\n        mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)\n        std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)\n        weight = (weight - mean) / std\n        weight = self.weight_gamma * weight + self.weight_beta\n        return weight\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        weight = self._get_weight(self.weight)\n        return F.conv2d(x, weight, self.bias, self.stride, self.padding,\n                        self.dilation, self.groups)\n\n    def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,\n                              local_metadata: Dict, strict: bool,\n                              missing_keys: List[str],\n                              unexpected_keys: List[str],\n                              error_msgs: List[str]) -> None:\n        \"\"\"Override default load function.\n\n        AWS overrides the function _load_from_state_dict to recover\n        weight_gamma 
and weight_beta if they are missing. If weight_gamma and\n        weight_beta are found in the checkpoint, this function will return\n        after super()._load_from_state_dict. Otherwise, it will compute the\n        mean and std of the pretrained weights and store them in weight_beta\n        and weight_gamma.\n        \"\"\"\n\n        self.weight_gamma.data.fill_(-1)\n        local_missing_keys: List = []\n        super()._load_from_state_dict(state_dict, prefix, local_metadata,\n                                      strict, local_missing_keys,\n                                      unexpected_keys, error_msgs)\n        if self.weight_gamma.data.mean() > 0:\n            for k in local_missing_keys:\n                missing_keys.append(k)\n            return\n        weight = self.weight.data\n        weight_flat = weight.view(weight.size(0), -1)\n        mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)\n        std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)\n        self.weight_beta.data.copy_(mean)\n        self.weight_gamma.data.copy_(std)\n        missing_gamma_beta = [\n            k for k in local_missing_keys\n            if k.endswith('weight_gamma') or k.endswith('weight_beta')\n        ]\n        for k in missing_gamma_beta:\n            local_missing_keys.remove(k)\n        for k in local_missing_keys:\n            missing_keys.append(k)\n"
  },
  {
    "path": "mmcv/cnn/bricks/depthwise_separable_conv_module.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Dict, Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\n\nfrom .conv_module import ConvModule\n\n\nclass DepthwiseSeparableConvModule(nn.Module):\n    \"\"\"Depthwise separable convolution module.\n\n    See https://arxiv.org/pdf/1704.04861.pdf for details.\n\n    This module can replace a ConvModule with the conv block replaced by two\n    conv block: depthwise conv block and pointwise conv block. The depthwise\n    conv block contains depthwise-conv/norm/activation layers. The pointwise\n    conv block contains pointwise-conv/norm/activation layers. It should be\n    noted that there will be norm/activation layer in the depthwise conv block\n    if `norm_cfg` and `act_cfg` are specified.\n\n    Args:\n        in_channels (int): Number of channels in the input feature map.\n            Same as that in ``nn._ConvNd``.\n        out_channels (int): Number of channels produced by the convolution.\n            Same as that in ``nn._ConvNd``.\n        kernel_size (int | tuple[int]): Size of the convolving kernel.\n            Same as that in ``nn._ConvNd``.\n        stride (int | tuple[int]): Stride of the convolution.\n            Same as that in ``nn._ConvNd``. Default: 1.\n        padding (int | tuple[int]): Zero-padding added to both sides of\n            the input. Same as that in ``nn._ConvNd``. Default: 0.\n        dilation (int | tuple[int]): Spacing between kernel elements.\n            Same as that in ``nn._ConvNd``. Default: 1.\n        norm_cfg (dict): Default norm config for both depthwise ConvModule and\n            pointwise ConvModule. Default: None.\n        act_cfg (dict): Default activation config for both depthwise ConvModule\n            and pointwise ConvModule. Default: dict(type='ReLU').\n        dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is\n            'default', it will be the same as `norm_cfg`. 
Default: 'default'.\n        dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is\n            'default', it will be the same as `act_cfg`. Default: 'default'.\n        pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is\n            'default', it will be the same as `norm_cfg`. Default: 'default'.\n        pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is\n            'default', it will be the same as `act_cfg`. Default: 'default'.\n        kwargs (optional): Other shared arguments for depthwise and pointwise\n            ConvModule. See ConvModule for ref.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, int]],\n                 stride: Union[int, Tuple[int, int]] = 1,\n                 padding: Union[int, Tuple[int, int]] = 0,\n                 dilation: Union[int, Tuple[int, int]] = 1,\n                 norm_cfg: Optional[Dict] = None,\n                 act_cfg: Dict = dict(type='ReLU'),\n                 dw_norm_cfg: Union[Dict, str] = 'default',\n                 dw_act_cfg: Union[Dict, str] = 'default',\n                 pw_norm_cfg: Union[Dict, str] = 'default',\n                 pw_act_cfg: Union[Dict, str] = 'default',\n                 **kwargs):\n        super().__init__()\n        assert 'groups' not in kwargs, 'groups should not be specified'\n\n        # if norm/activation config of depthwise/pointwise ConvModule is not\n        # specified, use default config.\n        dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg  # type: ignore # noqa E501\n        dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg\n        pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg  # type: ignore # noqa E501\n        pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg\n\n        # depthwise convolution\n        self.depthwise_conv = 
ConvModule(\n            in_channels,\n            in_channels,\n            kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=in_channels,\n            norm_cfg=dw_norm_cfg,  # type: ignore\n            act_cfg=dw_act_cfg,  # type: ignore\n            **kwargs)\n\n        self.pointwise_conv = ConvModule(\n            in_channels,\n            out_channels,\n            1,\n            norm_cfg=pw_norm_cfg,  # type: ignore\n            act_cfg=pw_act_cfg,  # type: ignore\n            **kwargs)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        x = self.depthwise_conv(x)\n        x = self.pointwise_conv(x)\n        return x\n"
  },
  {
    "path": "mmcv/cnn/bricks/drop.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, Dict, Optional\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\n\n\ndef drop_path(x: torch.Tensor,\n              drop_prob: float = 0.,\n              training: bool = False) -> torch.Tensor:\n    \"\"\"Drop paths (Stochastic Depth) per sample (when applied in main path of\n    residual blocks).\n\n    We follow the implementation\n    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py\n    # noqa: E501\n    \"\"\"\n    if drop_prob == 0. or not training:\n        return x\n    keep_prob = 1 - drop_prob\n    # handle tensors with different dimensions, not just 4D tensors.\n    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)\n    random_tensor = keep_prob + torch.rand(\n        shape, dtype=x.dtype, device=x.device)\n    output = x.div(keep_prob) * random_tensor.floor()\n    return output\n\n\n@MODELS.register_module()\nclass DropPath(nn.Module):\n    \"\"\"Drop paths (Stochastic Depth) per sample  (when applied in main path of\n    residual blocks).\n\n    We follow the implementation\n    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501\n\n    Args:\n        drop_prob (float): Probability of the path to be zeroed. Default: 0.1\n    \"\"\"\n\n    def __init__(self, drop_prob: float = 0.1):\n        super().__init__()\n        self.drop_prob = drop_prob\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        return drop_path(x, self.drop_prob, self.training)\n\n\n@MODELS.register_module()\nclass Dropout(nn.Dropout):\n    \"\"\"A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of\n    ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with\n    ``DropPath``\n\n    Args:\n        drop_prob (float): Probability of the elements to be\n            zeroed. 
Default: 0.5.\n        inplace (bool):  Do the operation inplace or not. Default: False.\n    \"\"\"\n\n    def __init__(self, drop_prob: float = 0.5, inplace: bool = False):\n        super().__init__(p=drop_prob, inplace=inplace)\n\n\ndef build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:\n    \"\"\"Builder for drop out layers.\"\"\"\n    return MODELS.build(cfg, default_args=default_args)\n"
  },
  {
    "path": "mmcv/cnn/bricks/generalized_attention.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.model import kaiming_init\nfrom mmengine.registry import MODELS\n\n\n@MODELS.register_module()\nclass GeneralizedAttention(nn.Module):\n    \"\"\"GeneralizedAttention module.\n\n    See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'\n    (https://arxiv.org/abs/1904.05873) for details.\n\n    Args:\n        in_channels (int): Channels of the input feature map.\n        spatial_range (int): The spatial range. -1 indicates no spatial range\n            constraint. Default: -1.\n        num_heads (int): The head number of empirical_attention module.\n            Default: 9.\n        position_embedding_dim (int): The position embedding dimension.\n            Default: -1.\n        position_magnitude (int): A multiplier acting on coord difference.\n            Default: 1.\n        kv_stride (int): The feature stride acting on key/value feature map.\n            Default: 2.\n        q_stride (int): The feature stride acting on query feature map.\n            Default: 1.\n        attention_type (str): A binary indicator string for indicating which\n            items in generalized empirical_attention module are used.\n            Default: '1111'.\n\n            - '1000' indicates 'query and key content' (appr - appr) item,\n            - '0100' indicates 'query content and relative position'\n              (appr - position) item,\n            - '0010' indicates 'key content only' (bias - appr) item,\n            - '0001' indicates 'relative position only' (bias - position) item.\n    \"\"\"\n\n    _abbr_ = 'gen_attention_block'\n\n    def __init__(self,\n                 in_channels: int,\n                 spatial_range: int = -1,\n                 num_heads: int = 9,\n                 position_embedding_dim: int = -1,\n                 position_magnitude: int = 1,\n    
             kv_stride: int = 2,\n                 q_stride: int = 1,\n                 attention_type: str = '1111'):\n\n        super().__init__()\n\n        # hard range means local range for non-local operation\n        self.position_embedding_dim = (\n            position_embedding_dim\n            if position_embedding_dim > 0 else in_channels)\n\n        self.position_magnitude = position_magnitude\n        self.num_heads = num_heads\n        self.in_channels = in_channels\n        self.spatial_range = spatial_range\n        self.kv_stride = kv_stride\n        self.q_stride = q_stride\n        self.attention_type = [bool(int(_)) for _ in attention_type]\n        self.qk_embed_dim = in_channels // num_heads\n        out_c = self.qk_embed_dim * num_heads\n\n        if self.attention_type[0] or self.attention_type[1]:\n            self.query_conv = nn.Conv2d(\n                in_channels=in_channels,\n                out_channels=out_c,\n                kernel_size=1,\n                bias=False)\n            self.query_conv.kaiming_init = True\n\n        if self.attention_type[0] or self.attention_type[2]:\n            self.key_conv = nn.Conv2d(\n                in_channels=in_channels,\n                out_channels=out_c,\n                kernel_size=1,\n                bias=False)\n            self.key_conv.kaiming_init = True\n\n        self.v_dim = in_channels // num_heads\n        self.value_conv = nn.Conv2d(\n            in_channels=in_channels,\n            out_channels=self.v_dim * num_heads,\n            kernel_size=1,\n            bias=False)\n        self.value_conv.kaiming_init = True\n\n        if self.attention_type[1] or self.attention_type[3]:\n            self.appr_geom_fc_x = nn.Linear(\n                self.position_embedding_dim // 2, out_c, bias=False)\n            self.appr_geom_fc_x.kaiming_init = True\n\n            self.appr_geom_fc_y = nn.Linear(\n                self.position_embedding_dim // 2, out_c, bias=False)\n            
self.appr_geom_fc_y.kaiming_init = True\n\n        if self.attention_type[2]:\n            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)\n            appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv\n            self.appr_bias = nn.Parameter(appr_bias_value)\n\n        if self.attention_type[3]:\n            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)\n            geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv\n            self.geom_bias = nn.Parameter(geom_bias_value)\n\n        self.proj_conv = nn.Conv2d(\n            in_channels=self.v_dim * num_heads,\n            out_channels=in_channels,\n            kernel_size=1,\n            bias=True)\n        self.proj_conv.kaiming_init = True\n        self.gamma = nn.Parameter(torch.zeros(1))\n\n        if self.spatial_range >= 0:\n            # only works when non local is after 3*3 conv\n            if in_channels == 256:\n                max_len = 84\n            elif in_channels == 512:\n                max_len = 42\n\n            max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)\n            local_constraint_map = np.ones(\n                (max_len, max_len, max_len_kv, max_len_kv), dtype=int)\n            for iy in range(max_len):\n                for ix in range(max_len):\n                    local_constraint_map[\n                        iy, ix,\n                        max((iy - self.spatial_range) //\n                            self.kv_stride, 0):min((iy + self.spatial_range +\n                                                    1) // self.kv_stride +\n                                                   1, max_len),\n                        max((ix - self.spatial_range) //\n                            self.kv_stride, 0):min((ix + self.spatial_range +\n                                                    1) // self.kv_stride +\n                                                   1, max_len)] = 0\n\n            self.local_constraint_map = nn.Parameter(\n                
torch.from_numpy(local_constraint_map).byte(),\n                requires_grad=False)\n\n        if self.q_stride > 1:\n            self.q_downsample = nn.AvgPool2d(\n                kernel_size=1, stride=self.q_stride)\n        else:\n            self.q_downsample = None\n\n        if self.kv_stride > 1:\n            self.kv_downsample = nn.AvgPool2d(\n                kernel_size=1, stride=self.kv_stride)\n        else:\n            self.kv_downsample = None\n\n        self.init_weights()\n\n    def get_position_embedding(self,\n                               h,\n                               w,\n                               h_kv,\n                               w_kv,\n                               q_stride,\n                               kv_stride,\n                               device,\n                               dtype,\n                               feat_dim,\n                               wave_length=1000):\n        # the default type of Tensor is float32, leading to type mismatch\n        # in fp16 mode. 
Cast it to support fp16 mode.\n        h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)\n        h_idxs = h_idxs.view((h, 1)) * q_stride\n\n        w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)\n        w_idxs = w_idxs.view((w, 1)) * q_stride\n\n        h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(\n            device=device, dtype=dtype)\n        h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride\n\n        w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(\n            device=device, dtype=dtype)\n        w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride\n\n        # (h, h_kv, 1)\n        h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)\n        h_diff *= self.position_magnitude\n\n        # (w, w_kv, 1)\n        w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)\n        w_diff *= self.position_magnitude\n\n        feat_range = torch.arange(0, feat_dim / 4).to(\n            device=device, dtype=dtype)\n\n        dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)\n        dim_mat = dim_mat**((4. 
/ feat_dim) * feat_range)\n        dim_mat = dim_mat.view((1, 1, -1))\n\n        embedding_x = torch.cat(\n            ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)\n\n        embedding_y = torch.cat(\n            ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)\n\n        return embedding_x, embedding_y\n\n    def forward(self, x_input: torch.Tensor) -> torch.Tensor:\n        num_heads = self.num_heads\n\n        # use empirical_attention\n        if self.q_downsample is not None:\n            x_q = self.q_downsample(x_input)\n        else:\n            x_q = x_input\n        n, _, h, w = x_q.shape\n\n        if self.kv_downsample is not None:\n            x_kv = self.kv_downsample(x_input)\n        else:\n            x_kv = x_input\n        _, _, h_kv, w_kv = x_kv.shape\n\n        if self.attention_type[0] or self.attention_type[1]:\n            proj_query = self.query_conv(x_q).view(\n                (n, num_heads, self.qk_embed_dim, h * w))\n            proj_query = proj_query.permute(0, 1, 3, 2)\n\n        if self.attention_type[0] or self.attention_type[2]:\n            proj_key = self.key_conv(x_kv).view(\n                (n, num_heads, self.qk_embed_dim, h_kv * w_kv))\n\n        if self.attention_type[1] or self.attention_type[3]:\n            position_embed_x, position_embed_y = self.get_position_embedding(\n                h, w, h_kv, w_kv, self.q_stride, self.kv_stride,\n                x_input.device, x_input.dtype, self.position_embedding_dim)\n            # (n, num_heads, w, w_kv, dim)\n            position_feat_x = self.appr_geom_fc_x(position_embed_x).\\\n                view(1, w, w_kv, num_heads, self.qk_embed_dim).\\\n                permute(0, 3, 1, 2, 4).\\\n                repeat(n, 1, 1, 1, 1)\n\n            # (n, num_heads, h, h_kv, dim)\n            position_feat_y = self.appr_geom_fc_y(position_embed_y).\\\n                view(1, h, h_kv, num_heads, self.qk_embed_dim).\\\n                permute(0, 3, 1, 2, 
4).\\\n                repeat(n, 1, 1, 1, 1)\n\n            position_feat_x /= math.sqrt(2)\n            position_feat_y /= math.sqrt(2)\n\n        # accelerate for saliency only\n        if (np.sum(self.attention_type) == 1) and self.attention_type[2]:\n            appr_bias = self.appr_bias.\\\n                view(1, num_heads, 1, self.qk_embed_dim).\\\n                repeat(n, 1, 1, 1)\n\n            energy = torch.matmul(appr_bias, proj_key).\\\n                view(n, num_heads, 1, h_kv * w_kv)\n\n            h = 1\n            w = 1\n        else:\n            # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for\n            if not self.attention_type[0]:\n                energy = torch.zeros(\n                    n,\n                    num_heads,\n                    h,\n                    w,\n                    h_kv,\n                    w_kv,\n                    dtype=x_input.dtype,\n                    device=x_input.device)\n\n            # attention_type[0]: appr - appr\n            # attention_type[1]: appr - position\n            # attention_type[2]: bias - appr\n            # attention_type[3]: bias - position\n            if self.attention_type[0] or self.attention_type[2]:\n                if self.attention_type[0] and self.attention_type[2]:\n                    appr_bias = self.appr_bias.\\\n                        view(1, num_heads, 1, self.qk_embed_dim)\n                    energy = torch.matmul(proj_query + appr_bias, proj_key).\\\n                        view(n, num_heads, h, w, h_kv, w_kv)\n\n                elif self.attention_type[0]:\n                    energy = torch.matmul(proj_query, proj_key).\\\n                        view(n, num_heads, h, w, h_kv, w_kv)\n\n                elif self.attention_type[2]:\n                    appr_bias = self.appr_bias.\\\n                        view(1, num_heads, 1, self.qk_embed_dim).\\\n                        repeat(n, 1, 1, 1)\n\n                    energy += 
torch.matmul(appr_bias, proj_key).\\\n                        view(n, num_heads, 1, 1, h_kv, w_kv)\n\n            if self.attention_type[1] or self.attention_type[3]:\n                if self.attention_type[1] and self.attention_type[3]:\n                    geom_bias = self.geom_bias.\\\n                        view(1, num_heads, 1, self.qk_embed_dim)\n\n                    proj_query_reshape = (proj_query + geom_bias).\\\n                        view(n, num_heads, h, w, self.qk_embed_dim)\n\n                    energy_x = torch.matmul(\n                        proj_query_reshape.permute(0, 1, 3, 2, 4),\n                        position_feat_x.permute(0, 1, 2, 4, 3))\n                    energy_x = energy_x.\\\n                        permute(0, 1, 3, 2, 4).unsqueeze(4)\n\n                    energy_y = torch.matmul(\n                        proj_query_reshape,\n                        position_feat_y.permute(0, 1, 2, 4, 3))\n                    energy_y = energy_y.unsqueeze(5)\n\n                    energy += energy_x + energy_y\n\n                elif self.attention_type[1]:\n                    proj_query_reshape = proj_query.\\\n                        view(n, num_heads, h, w, self.qk_embed_dim)\n                    proj_query_reshape = proj_query_reshape.\\\n                        permute(0, 1, 3, 2, 4)\n                    position_feat_x_reshape = position_feat_x.\\\n                        permute(0, 1, 2, 4, 3)\n                    position_feat_y_reshape = position_feat_y.\\\n                        permute(0, 1, 2, 4, 3)\n\n                    energy_x = torch.matmul(proj_query_reshape,\n                                            position_feat_x_reshape)\n                    energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)\n\n                    energy_y = torch.matmul(proj_query_reshape,\n                                            position_feat_y_reshape)\n                    energy_y = energy_y.unsqueeze(5)\n\n                    energy += 
energy_x + energy_y\n\n                elif self.attention_type[3]:\n                    geom_bias = self.geom_bias.\\\n                        view(1, num_heads, self.qk_embed_dim, 1).\\\n                        repeat(n, 1, 1, 1)\n\n                    position_feat_x_reshape = position_feat_x.\\\n                        view(n, num_heads, w * w_kv, self.qk_embed_dim)\n\n                    position_feat_y_reshape = position_feat_y.\\\n                        view(n, num_heads, h * h_kv, self.qk_embed_dim)\n\n                    energy_x = torch.matmul(position_feat_x_reshape, geom_bias)\n                    energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)\n\n                    energy_y = torch.matmul(position_feat_y_reshape, geom_bias)\n                    energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)\n\n                    energy += energy_x + energy_y\n\n            energy = energy.view(n, num_heads, h * w, h_kv * w_kv)\n\n        if self.spatial_range >= 0:\n            cur_local_constraint_map = \\\n                self.local_constraint_map[:h, :w, :h_kv, :w_kv].\\\n                contiguous().\\\n                view(1, 1, h*w, h_kv*w_kv)\n\n            energy = energy.masked_fill_(cur_local_constraint_map.bool(),\n                                         float('-inf'))\n\n        attention = F.softmax(energy, 3)\n\n        proj_value = self.value_conv(x_kv)\n        proj_value_reshape = proj_value.\\\n            view((n, num_heads, self.v_dim, h_kv * w_kv)).\\\n            permute(0, 1, 3, 2)\n\n        out = torch.matmul(attention, proj_value_reshape).\\\n            permute(0, 1, 3, 2).\\\n            contiguous().\\\n            view(n, self.v_dim * self.num_heads, h, w)\n\n        out = self.proj_conv(out)\n\n        # output is downsampled, upsample back to input size\n        if self.q_downsample is not None:\n            out = F.interpolate(\n                out,\n                size=x_input.shape[2:],\n                
mode='bilinear',\n                align_corners=False)\n\n        out = self.gamma * out + x_input\n        return out\n\n    def init_weights(self):\n        for m in self.modules():\n            if hasattr(m, 'kaiming_init') and m.kaiming_init:\n                kaiming_init(\n                    m,\n                    mode='fan_in',\n                    nonlinearity='leaky_relu',\n                    bias=0,\n                    distribution='uniform',\n                    a=1)\n"
  },
  {
    "path": "mmcv/cnn/bricks/hsigmoid.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\n\n\n@MODELS.register_module()\nclass HSigmoid(nn.Module):\n    \"\"\"Hard Sigmoid Module. Apply the hard sigmoid function:\n    Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)\n    Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1)\n\n    Note:\n        In MMCV v1.4.4, the default argument values were changed to align\n        with the official PyTorch implementation.\n\n    Args:\n        bias (float): Bias added to the input feature map. Default: 3.0.\n        divisor (float): Divisor applied to the biased input feature map.\n            It must be non-zero. Default: 6.0.\n        min_value (float): Lower bound value. Default: 0.0.\n        max_value (float): Upper bound value. Default: 1.0.\n\n    Returns:\n        Tensor: The output tensor.\n    \"\"\"\n\n    def __init__(self,\n                 bias: float = 3.0,\n                 divisor: float = 6.0,\n                 min_value: float = 0.0,\n                 max_value: float = 1.0):\n        super().__init__()\n        warnings.warn(\n            'In MMCV v1.4.4, we modified the default value of args to align '\n            'with PyTorch official. Previous Implementation: '\n            'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). '\n            'Current Implementation: '\n            'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).')\n        self.bias = bias\n        self.divisor = divisor\n        assert self.divisor != 0\n        self.min_value = min_value\n        self.max_value = max_value\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        x = (x + self.bias) / self.divisor\n\n        return x.clamp_(self.min_value, self.max_value)\n"
  },
  {
    "path": "mmcv/cnn/bricks/hswish.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\n\nclass HSwish(nn.Module):\n    \"\"\"Hard Swish Module.\n\n    This module applies the hard swish function:\n\n    .. math::\n        Hswish(x) = x * ReLU6(x + 3) / 6\n\n    Args:\n        inplace (bool): can optionally do the operation in-place.\n            Default: False.\n\n    Returns:\n        Tensor: The output tensor.\n    \"\"\"\n\n    def __init__(self, inplace: bool = False):\n        super().__init__()\n        self.act = nn.ReLU6(inplace)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        return x * self.act(x + 3) / 6\n\n\nif (TORCH_VERSION == 'parrots'\n        or digit_version(TORCH_VERSION) < digit_version('1.7')):\n    # Hardswish is not supported when PyTorch version < 1.6.\n    # And Hardswish in PyTorch 1.6 does not support inplace.\n    MODELS.register_module(module=HSwish)\nelse:\n    MODELS.register_module(module=nn.Hardswish, name='HSwish')\n"
  },
  {
    "path": "mmcv/cnn/bricks/non_local.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom abc import ABCMeta\nfrom typing import Dict, Optional\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.model import constant_init, normal_init\nfrom mmengine.registry import MODELS\n\nfrom .conv_module import ConvModule\n\n\nclass _NonLocalNd(nn.Module, metaclass=ABCMeta):\n    \"\"\"Basic Non-local module.\n\n    This module is proposed in\n    \"Non-local Neural Networks\"\n    Paper reference: https://arxiv.org/abs/1711.07971\n    Code reference: https://github.com/AlexHex7/Non-local_pytorch\n\n    Args:\n        in_channels (int): Channels of the input feature map.\n        reduction (int): Channel reduction ratio. Default: 2.\n        use_scale (bool): Whether to scale pairwise_weight by\n            `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.\n            Default: True.\n        conv_cfg (None | dict): The config dict for convolution layers.\n            If not specified, it will use `nn.Conv2d` for convolution layers.\n            Default: None.\n        norm_cfg (None | dict): The config dict for normalization layers.\n            Default: None. (This parameter is only applicable to conv_out.)\n        mode (str): Options are `gaussian`, `concatenation`,\n            `embedded_gaussian` and `dot_product`. 
Default: embedded_gaussian.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 reduction: int = 2,\n                 use_scale: bool = True,\n                 conv_cfg: Optional[Dict] = None,\n                 norm_cfg: Optional[Dict] = None,\n                 mode: str = 'embedded_gaussian',\n                 **kwargs):\n        super().__init__()\n        self.in_channels = in_channels\n        self.reduction = reduction\n        self.use_scale = use_scale\n        self.inter_channels = max(in_channels // reduction, 1)\n        self.mode = mode\n\n        if mode not in [\n                'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'\n        ]:\n            raise ValueError(\"Mode should be in 'gaussian', 'concatenation', \"\n                             f\"'embedded_gaussian' or 'dot_product', but got \"\n                             f'{mode} instead.')\n\n        # g, theta, phi are defaulted as `nn.ConvNd`.\n        # Here we use ConvModule for potential usage.\n        self.g = ConvModule(\n            self.in_channels,\n            self.inter_channels,\n            kernel_size=1,\n            conv_cfg=conv_cfg,\n            act_cfg=None)  # type: ignore\n        self.conv_out = ConvModule(\n            self.inter_channels,\n            self.in_channels,\n            kernel_size=1,\n            conv_cfg=conv_cfg,\n            norm_cfg=norm_cfg,\n            act_cfg=None)\n\n        if self.mode != 'gaussian':\n            self.theta = ConvModule(\n                self.in_channels,\n                self.inter_channels,\n                kernel_size=1,\n                conv_cfg=conv_cfg,\n                act_cfg=None)\n            self.phi = ConvModule(\n                self.in_channels,\n                self.inter_channels,\n                kernel_size=1,\n                conv_cfg=conv_cfg,\n                act_cfg=None)\n\n        if self.mode == 'concatenation':\n            self.concat_project = 
ConvModule(\n                self.inter_channels * 2,\n                1,\n                kernel_size=1,\n                stride=1,\n                padding=0,\n                bias=False,\n                act_cfg=dict(type='ReLU'))\n\n        self.init_weights(**kwargs)\n\n    def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None:\n        if self.mode != 'gaussian':\n            for m in [self.g, self.theta, self.phi]:\n                normal_init(m.conv, std=std)\n        else:\n            normal_init(self.g.conv, std=std)\n        if zeros_init:\n            if self.conv_out.norm_cfg is None:\n                constant_init(self.conv_out.conv, 0)\n            else:\n                constant_init(self.conv_out.norm, 0)\n        else:\n            if self.conv_out.norm_cfg is None:\n                normal_init(self.conv_out.conv, std=std)\n            else:\n                normal_init(self.conv_out.norm, std=std)\n\n    def gaussian(self, theta_x: torch.Tensor,\n                 phi_x: torch.Tensor) -> torch.Tensor:\n        # NonLocal1d pairwise_weight: [N, H, H]\n        # NonLocal2d pairwise_weight: [N, HxW, HxW]\n        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]\n        pairwise_weight = torch.matmul(theta_x, phi_x)\n        pairwise_weight = pairwise_weight.softmax(dim=-1)\n        return pairwise_weight\n\n    def embedded_gaussian(self, theta_x: torch.Tensor,\n                          phi_x: torch.Tensor) -> torch.Tensor:\n        # NonLocal1d pairwise_weight: [N, H, H]\n        # NonLocal2d pairwise_weight: [N, HxW, HxW]\n        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]\n        pairwise_weight = torch.matmul(theta_x, phi_x)\n        if self.use_scale:\n            # theta_x.shape[-1] is `self.inter_channels`\n            pairwise_weight /= theta_x.shape[-1]**0.5\n        pairwise_weight = pairwise_weight.softmax(dim=-1)\n        return pairwise_weight\n\n    def dot_product(self, theta_x: torch.Tensor,\n              
      phi_x: torch.Tensor) -> torch.Tensor:\n        # NonLocal1d pairwise_weight: [N, H, H]\n        # NonLocal2d pairwise_weight: [N, HxW, HxW]\n        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]\n        pairwise_weight = torch.matmul(theta_x, phi_x)\n        pairwise_weight /= pairwise_weight.shape[-1]\n        return pairwise_weight\n\n    def concatenation(self, theta_x: torch.Tensor,\n                      phi_x: torch.Tensor) -> torch.Tensor:\n        # NonLocal1d pairwise_weight: [N, H, H]\n        # NonLocal2d pairwise_weight: [N, HxW, HxW]\n        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]\n        h = theta_x.size(2)\n        w = phi_x.size(3)\n        theta_x = theta_x.repeat(1, 1, 1, w)\n        phi_x = phi_x.repeat(1, 1, h, 1)\n\n        concat_feature = torch.cat([theta_x, phi_x], dim=1)\n        pairwise_weight = self.concat_project(concat_feature)\n        n, _, h, w = pairwise_weight.size()\n        pairwise_weight = pairwise_weight.view(n, h, w)\n        pairwise_weight /= pairwise_weight.shape[-1]\n\n        return pairwise_weight\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        # Assume `reduction = 1`, then `inter_channels = C`\n        # or `inter_channels = C` when `mode=\"gaussian\"`\n\n        # NonLocal1d x: [N, C, H]\n        # NonLocal2d x: [N, C, H, W]\n        # NonLocal3d x: [N, C, T, H, W]\n        n = x.size(0)\n\n        # NonLocal1d g_x: [N, H, C]\n        # NonLocal2d g_x: [N, HxW, C]\n        # NonLocal3d g_x: [N, TxHxW, C]\n        g_x = self.g(x).view(n, self.inter_channels, -1)\n        g_x = g_x.permute(0, 2, 1)\n\n        # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]\n        # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]\n        # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]\n        if self.mode == 'gaussian':\n            theta_x = x.view(n, self.in_channels, -1)\n            theta_x = theta_x.permute(0, 2, 1)\n            if self.sub_sample:\n                phi_x = 
self.phi(x).view(n, self.in_channels, -1)\n            else:\n                phi_x = x.view(n, self.in_channels, -1)\n        elif self.mode == 'concatenation':\n            theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)\n            phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)\n        else:\n            theta_x = self.theta(x).view(n, self.inter_channels, -1)\n            theta_x = theta_x.permute(0, 2, 1)\n            phi_x = self.phi(x).view(n, self.inter_channels, -1)\n\n        pairwise_func = getattr(self, self.mode)\n        # NonLocal1d pairwise_weight: [N, H, H]\n        # NonLocal2d pairwise_weight: [N, HxW, HxW]\n        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]\n        pairwise_weight = pairwise_func(theta_x, phi_x)\n\n        # NonLocal1d y: [N, H, C]\n        # NonLocal2d y: [N, HxW, C]\n        # NonLocal3d y: [N, TxHxW, C]\n        y = torch.matmul(pairwise_weight, g_x)\n        # NonLocal1d y: [N, C, H]\n        # NonLocal2d y: [N, C, H, W]\n        # NonLocal3d y: [N, C, T, H, W]\n        y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,\n                                                    *x.size()[2:])\n\n        output = x + self.conv_out(y)\n\n        return output\n\n\nclass NonLocal1d(_NonLocalNd):\n    \"\"\"1D Non-local module.\n\n    Args:\n        in_channels (int): Same as `NonLocalND`.\n        sub_sample (bool): Whether to apply max pooling after pairwise\n            function (Note that the `sub_sample` is applied on spatial only).\n            Default: False.\n        conv_cfg (None | dict): Same as `NonLocalND`.\n            Default: dict(type='Conv1d').\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 sub_sample: bool = False,\n                 conv_cfg: Dict = dict(type='Conv1d'),\n                 **kwargs):\n        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)\n\n        self.sub_sample = sub_sample\n\n        if 
sub_sample:\n            max_pool_layer = nn.MaxPool1d(kernel_size=2)\n            self.g = nn.Sequential(self.g, max_pool_layer)\n            if self.mode != 'gaussian':\n                self.phi = nn.Sequential(self.phi, max_pool_layer)\n            else:\n                self.phi = max_pool_layer\n\n\n@MODELS.register_module()\nclass NonLocal2d(_NonLocalNd):\n    \"\"\"2D Non-local module.\n\n    Args:\n        in_channels (int): Same as `NonLocalND`.\n        sub_sample (bool): Whether to apply max pooling after pairwise\n            function (Note that the `sub_sample` is applied on spatial only).\n            Default: False.\n        conv_cfg (None | dict): Same as `NonLocalND`.\n            Default: dict(type='Conv2d').\n    \"\"\"\n\n    _abbr_ = 'nonlocal_block'\n\n    def __init__(self,\n                 in_channels: int,\n                 sub_sample: bool = False,\n                 conv_cfg: Dict = dict(type='Conv2d'),\n                 **kwargs):\n        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)\n\n        self.sub_sample = sub_sample\n\n        if sub_sample:\n            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))\n            self.g = nn.Sequential(self.g, max_pool_layer)\n            if self.mode != 'gaussian':\n                self.phi = nn.Sequential(self.phi, max_pool_layer)\n            else:\n                self.phi = max_pool_layer\n\n\nclass NonLocal3d(_NonLocalNd):\n    \"\"\"3D Non-local module.\n\n    Args:\n        in_channels (int): Same as `NonLocalND`.\n        sub_sample (bool): Whether to apply max pooling after pairwise\n            function (Note that the `sub_sample` is applied on spatial only).\n            Default: False.\n        conv_cfg (None | dict): Same as `NonLocalND`.\n            Default: dict(type='Conv3d').\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 sub_sample: bool = False,\n                 conv_cfg: Dict = dict(type='Conv3d'),\n               
  **kwargs):\n        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)\n        self.sub_sample = sub_sample\n\n        if sub_sample:\n            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))\n            self.g = nn.Sequential(self.g, max_pool_layer)\n            if self.mode != 'gaussian':\n                self.phi = nn.Sequential(self.phi, max_pool_layer)\n            else:\n                self.phi = max_pool_layer\n"
  },
  {
    "path": "mmcv/cnn/bricks/norm.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport inspect\nfrom typing import Dict, Tuple, Union\n\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import is_tuple_of\nfrom mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm, _BatchNorm,\n                                                     _InstanceNorm)\n\nMODELS.register_module('BN', module=nn.BatchNorm2d)\nMODELS.register_module('BN1d', module=nn.BatchNorm1d)\nMODELS.register_module('BN2d', module=nn.BatchNorm2d)\nMODELS.register_module('BN3d', module=nn.BatchNorm3d)\nMODELS.register_module('SyncBN', module=SyncBatchNorm)\nMODELS.register_module('GN', module=nn.GroupNorm)\nMODELS.register_module('LN', module=nn.LayerNorm)\nMODELS.register_module('IN', module=nn.InstanceNorm2d)\nMODELS.register_module('IN1d', module=nn.InstanceNorm1d)\nMODELS.register_module('IN2d', module=nn.InstanceNorm2d)\nMODELS.register_module('IN3d', module=nn.InstanceNorm3d)\n\n\ndef infer_abbr(class_type):\n    \"\"\"Infer abbreviation from the class name.\n\n    When we build a norm layer with `build_norm_layer()`, we want to preserve\n    the norm type in variable names, e.g., self.bn1, self.gn. This method will\n    infer the abbreviation to map class types to abbreviations.\n\n    Rule 1: If the class has the property \"_abbr_\", return the property.\n    Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or\n    InstanceNorm, the abbreviation of this layer will be \"bn\", \"gn\", \"ln\" and\n    \"in\" respectively.\n    Rule 3: If the class name contains \"batch\", \"group\", \"layer\" or \"instance\",\n    the abbreviation of this layer will be \"bn\", \"gn\", \"ln\" and \"in\"\n    respectively.\n    Rule 4: Otherwise, the abbreviation falls back to \"norm_layer\".\n\n    Args:\n        class_type (type): The norm layer type.\n\n    Returns:\n        str: The inferred abbreviation.\n    \"\"\"\n    if not inspect.isclass(class_type):\n        raise TypeError(\n            f'class_type must be a type, but got {type(class_type)}')\n    if hasattr(class_type, '_abbr_'):\n        return class_type._abbr_\n    if issubclass(class_type, _InstanceNorm):  # IN is a subclass of BN\n        return 'in'\n    elif issubclass(class_type, _BatchNorm):\n        return 'bn'\n    elif issubclass(class_type, nn.GroupNorm):\n        return 'gn'\n    elif issubclass(class_type, nn.LayerNorm):\n        return 'ln'\n    else:\n        class_name = class_type.__name__.lower()\n        if 'batch' in class_name:\n            return 'bn'\n        elif 'group' in class_name:\n            return 'gn'\n        elif 'layer' in class_name:\n            return 'ln'\n        elif 'instance' in class_name:\n            return 'in'\n        else:\n            return 'norm_layer'\n\n\ndef build_norm_layer(cfg: Dict,\n                     num_features: int,\n                     postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:\n    \"\"\"Build normalization layer.\n\n    Args:\n        cfg (dict): The norm layer config, which should contain:\n\n            - type (str): Layer type.\n            - layer args: Args needed to instantiate a norm layer.\n            - requires_grad (bool, optional): Whether the layer's parameters\n              require gradients. Set to False to freeze them. Default: True.\n        num_features (int): Number of input channels.\n        postfix (int | str): The postfix appended to the norm abbreviation\n            to create the layer name.\n\n    Returns:\n        tuple[str, nn.Module]: The first element is the layer name consisting\n        of abbreviation and postfix, e.g., bn1, gn. The second element is the\n        created norm layer.\n    \"\"\"\n    if not isinstance(cfg, dict):\n        raise TypeError('cfg must be a dict')\n    if 'type' not in cfg:\n        raise KeyError('the cfg dict must contain the key \"type\"')\n    cfg_ = cfg.copy()\n\n    layer_type = cfg_.pop('type')\n\n    if inspect.isclass(layer_type):\n        norm_layer = layer_type\n    else:\n        # Switch registry to the target scope. If `norm_layer` cannot be found\n        # in the registry, fall back to searching for `norm_layer` in\n        # mmengine.MODELS.\n        with MODELS.switch_scope_and_registry(None) as registry:\n            norm_layer = registry.get(layer_type)\n        if norm_layer is None:\n            raise KeyError(f'Cannot find {norm_layer} in registry under '\n                           f'scope name {registry.scope}')\n    abbr = infer_abbr(norm_layer)\n\n    assert isinstance(postfix, (int, str))\n    name = abbr + str(postfix)\n\n    requires_grad = cfg_.pop('requires_grad', True)\n    cfg_.setdefault('eps', 1e-5)\n    if norm_layer is not nn.GroupNorm:\n        layer = norm_layer(num_features, **cfg_)\n        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):\n            layer._specify_ddp_gpu_num(1)\n    else:\n        assert 'num_groups' in cfg_\n        layer = norm_layer(num_channels=num_features, **cfg_)\n\n    for param in layer.parameters():\n        param.requires_grad = requires_grad\n\n    return name, layer\n\n\ndef is_norm(layer: nn.Module,\n            exclude: Union[type, tuple, None] = None) -> bool:\n    \"\"\"Check if a layer is a normalization layer.\n\n    Args:\n        layer (nn.Module): The layer to be checked.\n        exclude (type | tuple[type]): Types to be excluded.\n\n    Returns:\n        bool: Whether the layer is a norm layer.\n    \"\"\"\n    if exclude is not None:\n        if not isinstance(exclude, tuple):\n            exclude = (exclude, )\n        if not is_tuple_of(exclude, type):\n            raise TypeError(\n                f'\"exclude\" must be either None or type or a tuple of types, '\n                f'but got {type(exclude)}: {exclude}')\n\n    if exclude and isinstance(layer, exclude):\n        return False\n\n    all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)\n    return isinstance(layer, all_norm_bases)\n"
  },
  {
    "path": "mmcv/cnn/bricks/padding.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport inspect\nfrom typing import Dict\n\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\n\nMODELS.register_module('zero', module=nn.ZeroPad2d)\nMODELS.register_module('reflect', module=nn.ReflectionPad2d)\nMODELS.register_module('replicate', module=nn.ReplicationPad2d)\n\n\ndef build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:\n    \"\"\"Build padding layer.\n\n    Args:\n        cfg (dict): The padding layer config, which should contain:\n\n            - type (str): Layer type.\n            - layer args: Args needed to instantiate a padding layer.\n        *args: Positional arguments forwarded to the padding layer.\n        **kwargs: Keyword arguments forwarded to the padding layer.\n\n    Returns:\n        nn.Module: Created padding layer.\n    \"\"\"\n    if not isinstance(cfg, dict):\n        raise TypeError('cfg must be a dict')\n    if 'type' not in cfg:\n        raise KeyError('the cfg dict must contain the key \"type\"')\n\n    cfg_ = cfg.copy()\n    padding_type = cfg_.pop('type')\n    if inspect.isclass(padding_type):\n        return padding_type(*args, **kwargs, **cfg_)\n    # Switch registry to the target scope. If `padding_layer` cannot be found\n    # in the registry, fall back to searching for `padding_layer` in\n    # mmengine.MODELS.\n    with MODELS.switch_scope_and_registry(None) as registry:\n        padding_layer = registry.get(padding_type)\n    if padding_layer is None:\n        raise KeyError(f'Cannot find {padding_layer} in registry under scope '\n                       f'name {registry.scope}')\n    layer = padding_layer(*args, **kwargs, **cfg_)\n\n    return layer\n"
  },
  {
    "path": "mmcv/cnn/bricks/plugin.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport inspect\nimport platform\nfrom typing import Dict, Tuple, Union\n\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\n\nif platform.system() == 'Windows':\n    import regex as re  # type: ignore\nelse:\n    import re  # type: ignore\n\n\ndef infer_abbr(class_type: type) -> str:\n    \"\"\"Infer abbreviation from the class name.\n\n    This method will infer the abbreviation to map class types to\n    abbreviations.\n\n    Rule 1: If the class has the property \"_abbr_\", return the property.\n    Rule 2: Otherwise, the abbreviation falls back to snake case of class\n    name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.\n\n    Args:\n        class_type (type): The plugin layer type.\n\n    Returns:\n        str: The inferred abbreviation.\n    \"\"\"\n\n    def camel2snack(word):\n        \"\"\"Convert camel case word into snake case.\n\n        Modified from `inflection lib\n        <https://inflection.readthedocs.io/en/latest/#inflection.underscore>`_.\n\n        Example::\n\n            >>> camel2snack(\"FancyBlock\")\n            'fancy_block'\n        \"\"\"\n\n        word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\\1_\\2', word)\n        word = re.sub(r'([a-z\\d])([A-Z])', r'\\1_\\2', word)\n        word = word.replace('-', '_')\n        return word.lower()\n\n    if not inspect.isclass(class_type):\n        raise TypeError(\n            f'class_type must be a type, but got {type(class_type)}')\n    if hasattr(class_type, '_abbr_'):\n        return class_type._abbr_  # type: ignore\n    else:\n        return camel2snack(class_type.__name__)\n\n\ndef build_plugin_layer(cfg: Dict,\n                       postfix: Union[int, str] = '',\n                       **kwargs) -> Tuple[str, nn.Module]:\n    \"\"\"Build plugin layer.\n\n    Args:\n        cfg (dict): cfg should contain:\n\n            - type (str): identify plugin layer type.\n            - layer args: args needed to instantiate a plugin layer.\n        postfix (int, str): appended to the plugin abbreviation to\n            create the layer name. Default: ''.\n\n    Returns:\n        tuple[str, nn.Module]: The first one is the concatenation of\n        abbreviation and postfix. The second is the created plugin layer.\n    \"\"\"\n    if not isinstance(cfg, dict):\n        raise TypeError('cfg must be a dict')\n    if 'type' not in cfg:\n        raise KeyError('the cfg dict must contain the key \"type\"')\n    cfg_ = cfg.copy()\n\n    layer_type = cfg_.pop('type')\n    if inspect.isclass(layer_type):\n        plugin_layer = layer_type\n    else:\n        # Switch registry to the target scope. If `plugin_layer` cannot be\n        # found in the registry, fall back to searching for `plugin_layer` in\n        # mmengine.MODELS.\n        with MODELS.switch_scope_and_registry(None) as registry:\n            plugin_layer = registry.get(layer_type)\n        if plugin_layer is None:\n            raise KeyError(\n                f'Cannot find {plugin_layer} in registry under scope '\n                f'name {registry.scope}')\n    abbr = infer_abbr(plugin_layer)\n\n    assert isinstance(postfix, (int, str))\n    name = abbr + str(postfix)\n\n    layer = plugin_layer(**kwargs, **cfg_)\n\n    return name, layer\n"
  },
  {
    "path": "mmcv/cnn/bricks/scale.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\n\n\nclass Scale(nn.Module):\n    \"\"\"A learnable scale parameter.\n\n    This layer scales the input by a learnable factor. It multiplies a\n    learnable scale parameter of shape (1,) with input of any shape.\n\n    Args:\n        scale (float): Initial value of scale factor. Default: 1.0\n    \"\"\"\n\n    def __init__(self, scale: float = 1.0):\n        super().__init__()\n        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        return x * self.scale\n\n\nclass LayerScale(nn.Module):\n    \"\"\"LayerScale layer.\n\n    Args:\n        dim (int): Dimension of input features.\n        inplace (bool): Whether performs operation in-place.\n            Default: `False`.\n        data_format (str): The input data format, could be 'channels_last'\n            or 'channels_first', representing (B, C, H, W) and\n            (B, N, C) format data respectively. Default: 'channels_last'.\n        scale (float): Initial value of scale factor. 
Default: 1e-5\n    \"\"\"\n\n    def __init__(self,\n                 dim: int,\n                 inplace: bool = False,\n                 data_format: str = 'channels_last',\n                 scale: float = 1e-5):\n        super().__init__()\n        assert data_format in ('channels_last', 'channels_first'), \\\n            \"'data_format' could only be channels_last or channels_first.\"\n        self.inplace = inplace\n        self.data_format = data_format\n        self.weight = nn.Parameter(torch.ones(dim) * scale)\n\n    def forward(self, x) -> torch.Tensor:\n        if self.data_format == 'channels_first':\n            shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2))))\n        else:\n            shape = tuple((*(1 for _ in range(x.dim() - 1)), -1))\n        if self.inplace:\n            return x.mul_(self.weight.view(*shape))\n        else:\n            return x * self.weight.view(*shape)\n"
  },
  {
    "path": "mmcv/cnn/bricks/swish.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\n\n\n@MODELS.register_module()\nclass Swish(nn.Module):\n    \"\"\"Swish Module.\n\n    This module applies the swish function:\n\n    .. math::\n        Swish(x) = x * Sigmoid(x)\n\n    Returns:\n        Tensor: The output tensor.\n    \"\"\"\n\n    def __init__(self):\n        super().__init__()\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        return x * torch.sigmoid(x)\n"
  },
  {
    "path": "mmcv/cnn/bricks/transformer.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport math\nimport warnings\nfrom typing import Sequence\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.config import ConfigDict\nfrom mmengine.model import BaseModule, ModuleList, Sequential\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import deprecated_api_warning, to_2tuple\n\nfrom mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,\n                      build_norm_layer)\nfrom .drop import build_dropout\nfrom .scale import LayerScale\n\n# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file\ntry:\n    from mmcv.ops.multi_scale_deform_attn import \\\n        MultiScaleDeformableAttention  # noqa F401\n    warnings.warn(\n        ImportWarning(\n            '``MultiScaleDeformableAttention`` has been moved to '\n            '``mmcv.ops.multi_scale_deform_attn``, please change original path '  # noqa E501\n            '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '  # noqa E501\n            'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '  # noqa E501\n        ))\n\nexcept ImportError:\n    warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '\n                  '``mmcv.ops.multi_scale_deform_attn``, '\n                  'You should install ``mmcv`` rather than ``mmcv-lite`` '\n                  'if you need this module. 
')\n\n\ndef build_positional_encoding(cfg, default_args=None):\n    \"\"\"Builder for Position Encoding.\"\"\"\n    return MODELS.build(cfg, default_args=default_args)\n\n\ndef build_attention(cfg, default_args=None):\n    \"\"\"Builder for attention.\"\"\"\n    return MODELS.build(cfg, default_args=default_args)\n\n\ndef build_feedforward_network(cfg, default_args=None):\n    \"\"\"Builder for feed-forward network (FFN).\"\"\"\n    return MODELS.build(cfg, default_args=default_args)\n\n\ndef build_transformer_layer(cfg, default_args=None):\n    \"\"\"Builder for transformer layer.\"\"\"\n    return MODELS.build(cfg, default_args=default_args)\n\n\ndef build_transformer_layer_sequence(cfg, default_args=None):\n    \"\"\"Builder for transformer encoder and transformer decoder.\"\"\"\n    return MODELS.build(cfg, default_args=default_args)\n\n\nclass AdaptivePadding(nn.Module):\n    \"\"\"Applies padding adaptively to the input.\n\n    This module can make input get fully covered by filter\n    you specified. It support two modes \"same\" and \"corner\". The\n    \"same\" mode is same with \"SAME\" padding mode in TensorFlow, pad\n    zero around input. The \"corner\"  mode would pad zero\n    to bottom right.\n\n    Args:\n        kernel_size (int | tuple): Size of the kernel. Default: 1.\n        stride (int | tuple): Stride of the filter. Default: 1.\n        dilation (int | tuple): Spacing between kernel elements.\n            Default: 1.\n        padding (str): Support \"same\" and \"corner\", \"corner\" mode\n            would pad zero to bottom right, and \"same\" mode would\n            pad zero around input. 
Default: \"corner\".\n\n    Example:\n        >>> kernel_size = 16\n        >>> stride = 16\n        >>> dilation = 1\n        >>> input = torch.rand(1, 1, 15, 17)\n        >>> adap_pad = AdaptivePadding(\n        >>>     kernel_size=kernel_size,\n        >>>     stride=stride,\n        >>>     dilation=dilation,\n        >>>     padding=\"corner\")\n        >>> out = adap_pad(input)\n        >>> assert (out.shape[2], out.shape[3]) == (16, 32)\n        >>> input = torch.rand(1, 1, 16, 17)\n        >>> out = adap_pad(input)\n        >>> assert (out.shape[2], out.shape[3]) == (16, 32)\n    \"\"\"\n\n    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):\n        super().__init__()\n        assert padding in ('same', 'corner')\n\n        kernel_size = to_2tuple(kernel_size)\n        stride = to_2tuple(stride)\n        dilation = to_2tuple(dilation)\n\n        self.padding = padding\n        self.kernel_size = kernel_size\n        self.stride = stride\n        self.dilation = dilation\n\n    def get_pad_shape(self, input_shape):\n        \"\"\"Calculate the padding size of input.\n\n        Args:\n            input_shape (:obj:`torch.Size`): arrange as (H, W).\n\n        Returns:\n            Tuple[int]: The padding size along the\n            original H and W directions\n        \"\"\"\n        input_h, input_w = input_shape\n        kernel_h, kernel_w = self.kernel_size\n        stride_h, stride_w = self.stride\n        output_h = math.ceil(input_h / stride_h)\n        output_w = math.ceil(input_w / stride_w)\n        pad_h = max((output_h - 1) * stride_h +\n                    (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)\n        pad_w = max((output_w - 1) * stride_w +\n                    (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)\n        return pad_h, pad_w\n\n    def forward(self, x):\n        \"\"\"Add padding to `x`\n\n        Args:\n            x (Tensor): Input tensor has shape (B, C, H, W).\n\n        Returns:\n       
     Tensor: The tensor with adaptive padding\n        \"\"\"\n        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])\n        if pad_h > 0 or pad_w > 0:\n            if self.padding == 'corner':\n                x = F.pad(x, [0, pad_w, 0, pad_h])\n            elif self.padding == 'same':\n                x = F.pad(x, [\n                    pad_w // 2, pad_w - pad_w // 2, pad_h // 2,\n                    pad_h - pad_h // 2\n                ])\n        return x\n\n\nclass PatchEmbed(BaseModule):\n    \"\"\"Image to Patch Embedding.\n\n    We use a conv layer to implement PatchEmbed.\n\n    Args:\n        in_channels (int): The num of input channels. Default: 3\n        embed_dims (int): The dimensions of embedding. Default: 768\n        conv_type (str): The type of convolution\n            to generate patch embedding. Default: \"Conv2d\".\n        kernel_size (int): The kernel_size of embedding conv. Default: 16.\n        stride (int): The slide stride of embedding conv.\n            Default: 16.\n        padding (int | tuple | string): The padding length of\n            embedding conv. When it is a string, it means the mode\n            of adaptive padding, support \"same\" and \"corner\" now.\n            Default: \"corner\".\n        dilation (int): The dilation rate of embedding conv. Default: 1.\n        bias (bool): Bias of embed conv. Default: True.\n        norm_cfg (dict, optional): Config dict for normalization layer.\n            Default: None.\n        input_size (int | tuple | None): The size of input, which will be\n            used to calculate the out size. Only works when `dynamic_size`\n            is False. 
Default: None.\n        init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.\n            Default: None.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels=3,\n                 embed_dims=768,\n                 conv_type='Conv2d',\n                 kernel_size=16,\n                 stride=16,\n                 padding='corner',\n                 dilation=1,\n                 bias=True,\n                 norm_cfg=None,\n                 input_size=None,\n                 init_cfg=None):\n        super().__init__(init_cfg=init_cfg)\n\n        self.embed_dims = embed_dims\n        if stride is None:\n            stride = kernel_size\n\n        kernel_size = to_2tuple(kernel_size)\n        stride = to_2tuple(stride)\n        dilation = to_2tuple(dilation)\n\n        if isinstance(padding, str):\n            self.adaptive_padding = AdaptivePadding(\n                kernel_size=kernel_size,\n                stride=stride,\n                dilation=dilation,\n                padding=padding)\n            # disable the padding of conv\n            padding = 0\n        else:\n            self.adaptive_padding = None\n        padding = to_2tuple(padding)\n\n        self.projection = build_conv_layer(\n            dict(type=conv_type),\n            in_channels=in_channels,\n            out_channels=embed_dims,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        if norm_cfg is not None:\n            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]\n        else:\n            self.norm = None\n\n        if input_size:\n            input_size = to_2tuple(input_size)\n            # `init_out_size` would be used outside to\n            # calculate the num_patches\n            # e.g. 
when `use_abs_pos_embed` outside\n            self.init_input_size = input_size\n            if self.adaptive_padding:\n                pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)\n                input_h, input_w = input_size\n                input_h = input_h + pad_h\n                input_w = input_w + pad_w\n                input_size = (input_h, input_w)\n\n            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html\n            h_out = (input_size[0] + 2 * padding[0] - dilation[0] *\n                     (kernel_size[0] - 1) - 1) // stride[0] + 1\n            w_out = (input_size[1] + 2 * padding[1] - dilation[1] *\n                     (kernel_size[1] - 1) - 1) // stride[1] + 1\n            self.init_out_size = (h_out, w_out)\n        else:\n            self.init_input_size = None\n            self.init_out_size = None\n\n    def forward(self, x):\n        \"\"\"\n        Args:\n            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.\n\n        Returns:\n            tuple: Contains merged results and its spatial shape.\n\n            - x (Tensor): Has shape (B, out_h * out_w, embed_dims)\n            - out_size (tuple[int]): Spatial shape of x, arrange as\n              (out_h, out_w).\n        \"\"\"\n\n        if self.adaptive_padding:\n            x = self.adaptive_padding(x)\n\n        x = self.projection(x)\n        out_size = (x.shape[2], x.shape[3])\n        x = x.flatten(2).transpose(1, 2)\n        if self.norm is not None:\n            x = self.norm(x)\n        return x, out_size\n\n\nclass PatchMerging(BaseModule):\n    \"\"\"Merge patch feature map.\n\n    This layer groups feature map by kernel_size, and applies norm and linear\n    layers to the grouped feature map ((used in Swin Transformer)).\n    Our implementation uses `nn.Unfold` to\n    merge patches, which is about 25% faster than the original\n    implementation. 
However, we need to modify pretrained\n    models for compatibility.\n\n    Args:\n        in_channels (int): The num of input channels.\n            to gets fully covered by filter and stride you specified.\n        out_channels (int): The num of output channels.\n        kernel_size (int | tuple, optional): the kernel size in the unfold\n            layer. Defaults to 2.\n        stride (int | tuple, optional): the stride of the sliding blocks in the\n            unfold layer. Default: None. (Would be set as `kernel_size`)\n        padding (int | tuple | string ): The padding length of\n            embedding conv. When it is a string, it means the mode\n            of adaptive padding, support \"same\" and \"corner\" now.\n            Default: \"corner\".\n        dilation (int | tuple, optional): dilation parameter in the unfold\n            layer. Default: 1.\n        bias (bool, optional): Whether to add bias in linear layer or not.\n            Defaults: False.\n        norm_cfg (dict, optional): Config dict for normalization layer.\n            Default: dict(type='LN').\n        init_cfg (dict, optional): The extra config for initialization.\n            Default: None.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size=2,\n                 stride=None,\n                 padding='corner',\n                 dilation=1,\n                 bias=False,\n                 norm_cfg=dict(type='LN'),\n                 init_cfg=None):\n        super().__init__(init_cfg=init_cfg)\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        if stride:\n            stride = stride\n        else:\n            stride = kernel_size\n\n        kernel_size = to_2tuple(kernel_size)\n        stride = to_2tuple(stride)\n        dilation = to_2tuple(dilation)\n\n        if isinstance(padding, str):\n            self.adaptive_padding = AdaptivePadding(\n                
kernel_size=kernel_size,\n                stride=stride,\n                dilation=dilation,\n                padding=padding)\n            # disable the padding of unfold\n            padding = 0\n        else:\n            self.adaptive_padding = None\n\n        padding = to_2tuple(padding)\n        self.sampler = nn.Unfold(\n            kernel_size=kernel_size,\n            dilation=dilation,\n            padding=padding,\n            stride=stride)\n\n        sample_dim = kernel_size[0] * kernel_size[1] * in_channels\n\n        if norm_cfg is not None:\n            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]\n        else:\n            self.norm = None\n\n        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)\n\n    def forward(self, x, input_size):\n        \"\"\"\n        Args:\n            x (Tensor): Has shape (B, H*W, C_in).\n            input_size (tuple[int]): The spatial shape of x, arrange as (H, W).\n                Default: None.\n\n        Returns:\n            tuple: Contains merged results and its spatial shape.\n\n            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)\n            - out_size (tuple[int]): Spatial shape of x, arrange as\n              (Merged_H, Merged_W).\n        \"\"\"\n        B, L, C = x.shape\n        assert isinstance(input_size, Sequence), f'Expect ' \\\n                                                 f'input_size is ' \\\n                                                 f'`Sequence` ' \\\n                                                 f'but get {input_size}'\n\n        H, W = input_size\n        assert L == H * W, 'input feature has wrong size'\n\n        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W\n\n        if self.adaptive_padding:\n            x = self.adaptive_padding(x)\n            H, W = x.shape[-2:]\n\n        # Use nn.Unfold to merge patch. 
About 25% faster than original method,\n        # but need to modify pretrained model for compatibility\n        # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)\n        x = self.sampler(x)\n\n        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *\n                 (self.sampler.kernel_size[0] - 1) -\n                 1) // self.sampler.stride[0] + 1\n        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *\n                 (self.sampler.kernel_size[1] - 1) -\n                 1) // self.sampler.stride[1] + 1\n\n        output_size = (out_h, out_w)\n        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C\n        x = self.norm(x) if self.norm else x\n        x = self.reduction(x)\n        return x, output_size\n\n\n@MODELS.register_module()\nclass MultiheadAttention(BaseModule):\n    \"\"\"A wrapper for ``torch.nn.MultiheadAttention``.\n\n    This module implements MultiheadAttention with identity connection,\n    and positional encoding  is also passed as input.\n\n    Args:\n        embed_dims (int): The embedding dimension.\n        num_heads (int): Parallel attention heads.\n        attn_drop (float): A Dropout layer on attn_output_weights.\n            Default: 0.0.\n        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.\n            Default: 0.0.\n        dropout_layer (obj:`ConfigDict`): The dropout_layer used\n            when adding the shortcut.\n        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.\n            Default: None.\n        batch_first (bool): When it is True,  Key, Query and Value are shape of\n            (batch, n, embed_dim), otherwise (n, batch, embed_dim).\n             Default to False.\n    \"\"\"\n\n    def __init__(self,\n                 embed_dims,\n                 num_heads,\n                 attn_drop=0.,\n                 proj_drop=0.,\n                 dropout_layer=dict(type='Dropout', drop_prob=0.),\n                 
init_cfg=None,\n                 batch_first=False,\n                 **kwargs):\n        super().__init__(init_cfg)\n        if 'dropout' in kwargs:\n            warnings.warn(\n                'The arguments `dropout` in MultiheadAttention '\n                'has been deprecated, now you can separately '\n                'set `attn_drop`(float), proj_drop(float), '\n                'and `dropout_layer`(dict) ', DeprecationWarning)\n            attn_drop = kwargs['dropout']\n            dropout_layer['drop_prob'] = kwargs.pop('dropout')\n\n        self.embed_dims = embed_dims\n        self.num_heads = num_heads\n        self.batch_first = batch_first\n\n        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,\n                                          **kwargs)\n\n        self.proj_drop = nn.Dropout(proj_drop)\n        self.dropout_layer = build_dropout(\n            dropout_layer) if dropout_layer else nn.Identity()\n\n    @deprecated_api_warning({'residual': 'identity'},\n                            cls_name='MultiheadAttention')\n    def forward(self,\n                query,\n                key=None,\n                value=None,\n                identity=None,\n                query_pos=None,\n                key_pos=None,\n                attn_mask=None,\n                key_padding_mask=None,\n                **kwargs):\n        \"\"\"Forward function for `MultiheadAttention`.\n\n        **kwargs allow passing a more general data flow when combining\n        with other operations in `transformerlayer`.\n\n        Args:\n            query (Tensor): The input query with shape [num_queries, bs,\n                embed_dims] if self.batch_first is False, else\n                [bs, num_queries embed_dims].\n            key (Tensor): The key tensor with shape [num_keys, bs,\n                embed_dims] if self.batch_first is False, else\n                [bs, num_keys, embed_dims] .\n                If None, the ``query`` will be used. 
Defaults to None.\n            value (Tensor): The value tensor with same shape as `key`.\n                Same in `nn.MultiheadAttention.forward`. Defaults to None.\n                If None, the `key` will be used.\n            identity (Tensor): This tensor, with the same shape as `query`,\n                will be used for the identity link.\n                If None, `query` will be used. Defaults to None.\n            query_pos (Tensor): The positional encoding for query, with\n                the same shape as `query`. If not None, it will\n                be added to `query` before forward function. Defaults to None.\n            key_pos (Tensor): The positional encoding for `key`, with the\n                same shape as `key`. Defaults to None. If not None, it will\n                be added to `key` before forward function. If None, and\n                `query_pos` has the same shape as `key`, then `query_pos`\n                will be used for `key_pos`. Defaults to None.\n            attn_mask (Tensor): ByteTensor mask with shape [num_queries,\n                num_keys]. 
Same in `nn.MultiheadAttention.forward`.\n                Defaults to None.\n            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].\n                Defaults to None.\n\n        Returns:\n            Tensor: forwarded results with shape\n            [num_queries, bs, embed_dims]\n            if self.batch_first is False, else\n            [bs, num_queries embed_dims].\n        \"\"\"\n\n        if key is None:\n            key = query\n        if value is None:\n            value = key\n        if identity is None:\n            identity = query\n        if key_pos is None:\n            if query_pos is not None:\n                # use query_pos if key_pos is not available\n                if query_pos.shape == key.shape:\n                    key_pos = query_pos\n                else:\n                    warnings.warn(f'position encoding of key is'\n                                  f'missing in {self.__class__.__name__}.')\n        if query_pos is not None:\n            query = query + query_pos\n        if key_pos is not None:\n            key = key + key_pos\n\n        # Because the dataflow('key', 'query', 'value') of\n        # ``torch.nn.MultiheadAttention`` is (num_query, batch,\n        # embed_dims), We should adjust the shape of dataflow from\n        # batch_first (batch, num_query, embed_dims) to num_query_first\n        # (num_query ,batch, embed_dims), and recover ``attn_output``\n        # from num_query_first to batch_first.\n        if self.batch_first:\n            query = query.transpose(0, 1)\n            key = key.transpose(0, 1)\n            value = value.transpose(0, 1)\n\n        out = self.attn(\n            query=query,\n            key=key,\n            value=value,\n            attn_mask=attn_mask,\n            key_padding_mask=key_padding_mask)[0]\n\n        if self.batch_first:\n            out = out.transpose(0, 1)\n\n        return identity + 
self.dropout_layer(self.proj_drop(out))\n\n\n@MODELS.register_module()\nclass FFN(BaseModule):\n    \"\"\"Implements feed-forward networks (FFNs) with identity connection.\n\n    Args:\n        embed_dims (int): The feature dimension. Same as\n            `MultiheadAttention`. Defaults: 256.\n        feedforward_channels (int): The hidden dimension of FFNs.\n            Defaults: 1024.\n        num_fcs (int, optional): The number of fully-connected layers in\n            FFNs. Default: 2.\n        act_cfg (dict, optional): The activation config for FFNs.\n            Default: dict(type='ReLU', inplace=True).\n        ffn_drop (float, optional): Probability of an element to be\n            zeroed in FFN. Default: 0.0.\n        add_identity (bool, optional): Whether to add the\n            identity connection. Default: `True`.\n        dropout_layer (obj:`ConfigDict`): The dropout_layer used\n            when adding the shortcut.\n        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.\n            Default: None.\n        layer_scale_init_value (float): Initial value of scale factor in\n            LayerScale. Only takes effect when greater than 0. Default: 0.\n    \"\"\"\n\n    @deprecated_api_warning(\n        {\n            'dropout': 'ffn_drop',\n            'add_residual': 'add_identity'\n        },\n        cls_name='FFN')\n    def __init__(self,\n                 embed_dims=256,\n                 feedforward_channels=1024,\n                 num_fcs=2,\n                 act_cfg=dict(type='ReLU', inplace=True),\n                 ffn_drop=0.,\n                 dropout_layer=None,\n                 add_identity=True,\n                 init_cfg=None,\n                 layer_scale_init_value=0.):\n        super().__init__(init_cfg)\n        assert num_fcs >= 2, 'num_fcs should be no less ' \\\n            f'than 2. 
got {num_fcs}.'\n        self.embed_dims = embed_dims\n        self.feedforward_channels = feedforward_channels\n        self.num_fcs = num_fcs\n\n        layers = []\n        in_channels = embed_dims\n        for _ in range(num_fcs - 1):\n            layers.append(\n                Sequential(\n                    Linear(in_channels, feedforward_channels),\n                    build_activation_layer(act_cfg), nn.Dropout(ffn_drop)))\n            in_channels = feedforward_channels\n        layers.append(Linear(feedforward_channels, embed_dims))\n        layers.append(nn.Dropout(ffn_drop))\n        self.layers = Sequential(*layers)\n        self.dropout_layer = build_dropout(\n            dropout_layer) if dropout_layer else torch.nn.Identity()\n        self.add_identity = add_identity\n\n        if layer_scale_init_value > 0:\n            self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value)\n        else:\n            self.gamma2 = nn.Identity()\n\n    @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')\n    def forward(self, x, identity=None):\n        \"\"\"Forward function for `FFN`.\n\n        The function would add x to the output tensor if residue is None.\n        \"\"\"\n        out = self.layers(x)\n        out = self.gamma2(out)\n        if not self.add_identity:\n            return self.dropout_layer(out)\n        if identity is None:\n            identity = x\n        return identity + self.dropout_layer(out)\n\n\n@MODELS.register_module()\nclass BaseTransformerLayer(BaseModule):\n    \"\"\"Base `TransformerLayer` for vision transformer.\n\n    It can be built from `mmcv.ConfigDict` and support more flexible\n    customization, for example, using any number of `FFN or LN ` and\n    use different kinds of `attention` by specifying a list of `ConfigDict`\n    named `attn_cfgs`. 
It is worth mentioning that it supports `prenorm`\n    when you specify `norm` as the first element of `operation_order`.\n    More details about the `prenorm`: `On Layer Normalization in the\n    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .\n\n    Args:\n        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):\n            Configs for `self_attention` or `cross_attention` modules,\n            The order of the configs in the list should be consistent with\n            corresponding attentions in operation_order.\n            If it is a dict, all of the attention modules in operation_order\n            will be built with this config. Default: None.\n        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):\n            Configs for FFN. The order of the configs in the list should be\n            consistent with corresponding ffn in operation_order.\n            If it is a dict, all of the ffn modules in operation_order\n            will be built with this config.\n        operation_order (tuple[str]): The execution order of operation\n            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').\n            Support `prenorm` when you specify first element as `norm`.\n            Default: None.\n        norm_cfg (dict): Config dict for normalization layer.\n            Default: dict(type='LN').\n        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.\n            Default: None.\n        batch_first (bool): Key, Query and Value are shape\n            of (batch, n, embed_dim)\n            or (n, batch, embed_dim). 
Default to False.\n    \"\"\"\n\n    def __init__(self,\n                 attn_cfgs=None,\n                 ffn_cfgs=dict(\n                     type='FFN',\n                     embed_dims=256,\n                     feedforward_channels=1024,\n                     num_fcs=2,\n                     ffn_drop=0.,\n                     act_cfg=dict(type='ReLU', inplace=True),\n                 ),\n                 operation_order=None,\n                 norm_cfg=dict(type='LN'),\n                 init_cfg=None,\n                 batch_first=False,\n                 **kwargs):\n\n        deprecated_args = dict(\n            feedforward_channels='feedforward_channels',\n            ffn_dropout='ffn_drop',\n            ffn_num_fcs='num_fcs')\n        for ori_name, new_name in deprecated_args.items():\n            if ori_name in kwargs:\n                warnings.warn(\n                    f'The arguments `{ori_name}` in BaseTransformerLayer '\n                    f'has been deprecated, now you should set `{new_name}` '\n                    f'and other FFN related arguments '\n                    f'to a dict named `ffn_cfgs`. 
', DeprecationWarning)\n                ffn_cfgs[new_name] = kwargs[ori_name]\n\n        super().__init__(init_cfg)\n\n        self.batch_first = batch_first\n\n        assert set(operation_order) & {\n            'self_attn', 'norm', 'ffn', 'cross_attn'} == \\\n            set(operation_order), f'The operation_order of' \\\n            f' {self.__class__.__name__} should ' \\\n            f'contains all four operation type ' \\\n            f\"{['self_attn', 'norm', 'ffn', 'cross_attn']}\"\n\n        num_attn = operation_order.count('self_attn') + operation_order.count(\n            'cross_attn')\n        if isinstance(attn_cfgs, dict):\n            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]\n        else:\n            assert num_attn == len(attn_cfgs), f'The length ' \\\n                f'of attn_cfg {num_attn} is ' \\\n                f'not consistent with the number of attention' \\\n                f'in operation_order {operation_order}.'\n\n        self.num_attn = num_attn\n        self.operation_order = operation_order\n        self.norm_cfg = norm_cfg\n        self.pre_norm = operation_order[0] == 'norm'\n        self.attentions = ModuleList()\n\n        index = 0\n        for operation_name in operation_order:\n            if operation_name in ['self_attn', 'cross_attn']:\n                if 'batch_first' in attn_cfgs[index]:\n                    assert self.batch_first == attn_cfgs[index]['batch_first']\n                else:\n                    attn_cfgs[index]['batch_first'] = self.batch_first\n                attention = build_attention(attn_cfgs[index])\n                # Some custom attentions used as `self_attn`\n                # or `cross_attn` can have different behavior.\n                attention.operation_name = operation_name\n                self.attentions.append(attention)\n                index += 1\n\n        self.embed_dims = self.attentions[0].embed_dims\n\n        self.ffns = ModuleList()\n        num_ffns = 
operation_order.count('ffn')\n        if isinstance(ffn_cfgs, dict):\n            ffn_cfgs = ConfigDict(ffn_cfgs)\n        if isinstance(ffn_cfgs, dict):\n            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]\n        assert len(ffn_cfgs) == num_ffns\n        for ffn_index in range(num_ffns):\n            if 'embed_dims' not in ffn_cfgs[ffn_index]:\n                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims\n            else:\n                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims\n            self.ffns.append(\n                build_feedforward_network(ffn_cfgs[ffn_index],\n                                          dict(type='FFN')))\n\n        self.norms = ModuleList()\n        num_norms = operation_order.count('norm')\n        for _ in range(num_norms):\n            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])\n\n    def forward(self,\n                query,\n                key=None,\n                value=None,\n                query_pos=None,\n                key_pos=None,\n                attn_masks=None,\n                query_key_padding_mask=None,\n                key_padding_mask=None,\n                **kwargs):\n        \"\"\"Forward function for `TransformerDecoderLayer`.\n\n        **kwargs contains some specific arguments of attentions.\n\n        Args:\n            query (Tensor): The input query with shape\n                [num_queries, bs, embed_dims] if\n                self.batch_first is False, else\n                [bs, num_queries embed_dims].\n            key (Tensor): The key tensor with shape [num_keys, bs,\n                embed_dims] if self.batch_first is False, else\n                [bs, num_keys, embed_dims] .\n            value (Tensor): The value tensor with same shape as `key`.\n            query_pos (Tensor): The positional encoding for `query`.\n                Default: None.\n            key_pos (Tensor): The positional encoding for `key`.\n                
Default: None.\n            attn_masks (List[Tensor] | None): 2D Tensor used in\n                calculation of corresponding attention. The length of\n                it should equal to the number of `attention` in\n                `operation_order`. Default: None.\n            query_key_padding_mask (Tensor): ByteTensor for `query`, with\n                shape [bs, num_queries]. Only used in `self_attn` layer.\n                Defaults to None.\n            key_padding_mask (Tensor): ByteTensor for `query`, with\n                shape [bs, num_keys]. Default: None.\n\n        Returns:\n            Tensor: forwarded results with shape [num_queries, bs, embed_dims].\n        \"\"\"\n\n        norm_index = 0\n        attn_index = 0\n        ffn_index = 0\n        identity = query\n        if attn_masks is None:\n            attn_masks = [None for _ in range(self.num_attn)]\n        elif isinstance(attn_masks, torch.Tensor):\n            attn_masks = [\n                copy.deepcopy(attn_masks) for _ in range(self.num_attn)\n            ]\n            warnings.warn(f'Use same attn_mask in all attentions in '\n                          f'{self.__class__.__name__} ')\n        else:\n            assert len(attn_masks) == self.num_attn, f'The length of ' \\\n                        f'attn_masks {len(attn_masks)} must be equal ' \\\n                        f'to the number of attention in ' \\\n                        f'operation_order {self.num_attn}'\n\n        for layer in self.operation_order:\n            if layer == 'self_attn':\n                temp_key = temp_value = query\n                query = self.attentions[attn_index](\n                    query,\n                    temp_key,\n                    temp_value,\n                    identity if self.pre_norm else None,\n                    query_pos=query_pos,\n                    key_pos=query_pos,\n                    attn_mask=attn_masks[attn_index],\n                    
key_padding_mask=query_key_padding_mask,\n                    **kwargs)\n                attn_index += 1\n                identity = query\n\n            elif layer == 'norm':\n                query = self.norms[norm_index](query)\n                norm_index += 1\n\n            elif layer == 'cross_attn':\n                query = self.attentions[attn_index](\n                    query,\n                    key,\n                    value,\n                    identity if self.pre_norm else None,\n                    query_pos=query_pos,\n                    key_pos=key_pos,\n                    attn_mask=attn_masks[attn_index],\n                    key_padding_mask=key_padding_mask,\n                    **kwargs)\n                attn_index += 1\n                identity = query\n\n            elif layer == 'ffn':\n                query = self.ffns[ffn_index](\n                    query, identity if self.pre_norm else None)\n                ffn_index += 1\n\n        return query\n\n\n@MODELS.register_module()\nclass TransformerLayerSequence(BaseModule):\n    \"\"\"Base class for TransformerEncoder and TransformerDecoder in vision\n    transformer.\n\n    As base-class of Encoder and Decoder in vision transformer.\n    Support customization such as specifying different kind\n    of `transformer_layer` in `transformer_coder`.\n\n    Args:\n        transformerlayer (list[obj:`mmcv.ConfigDict`] |\n            obj:`mmcv.ConfigDict`): Config of transformerlayer\n            in TransformerCoder. If it is obj:`mmcv.ConfigDict`,\n             it would be repeated `num_layer` times to a\n             list[`mmcv.ConfigDict`]. Default: None.\n        num_layers (int): The number of `TransformerLayer`. 
Default: None.\n        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.\n            Default: None.\n    \"\"\"\n\n    def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):\n        super().__init__(init_cfg)\n        if isinstance(transformerlayers, dict):\n            transformerlayers = [\n                copy.deepcopy(transformerlayers) for _ in range(num_layers)\n            ]\n        else:\n            assert isinstance(transformerlayers, list) and \\\n                   len(transformerlayers) == num_layers\n        self.num_layers = num_layers\n        self.layers = ModuleList()\n        for i in range(num_layers):\n            self.layers.append(build_transformer_layer(transformerlayers[i]))\n        self.embed_dims = self.layers[0].embed_dims\n        self.pre_norm = self.layers[0].pre_norm\n\n    def forward(self,\n                query,\n                key,\n                value,\n                query_pos=None,\n                key_pos=None,\n                attn_masks=None,\n                query_key_padding_mask=None,\n                key_padding_mask=None,\n                **kwargs):\n        \"\"\"Forward function for `TransformerCoder`.\n\n        Args:\n            query (Tensor): Input query with shape\n                `(num_queries, bs, embed_dims)`.\n            key (Tensor): The key tensor with shape\n                `(num_keys, bs, embed_dims)`.\n            value (Tensor): The value tensor with shape\n                `(num_keys, bs, embed_dims)`.\n            query_pos (Tensor): The positional encoding for `query`.\n                Default: None.\n            key_pos (Tensor): The positional encoding for `key`.\n                Default: None.\n            attn_masks (List[Tensor], optional): Each element is 2D Tensor\n                which is used in calculation of corresponding attention in\n                operation_order. 
Default: None.\n            query_key_padding_mask (Tensor): ByteTensor for `query`, with\n                shape [bs, num_queries]. Only used in self-attention.\n                Default: None.\n            key_padding_mask (Tensor): ByteTensor for `key`, with\n                shape [bs, num_keys]. Default: None.\n\n        Returns:\n            Tensor: results with shape [num_queries, bs, embed_dims].\n        \"\"\"\n        for layer in self.layers:\n            query = layer(\n                query,\n                key,\n                value,\n                query_pos=query_pos,\n                key_pos=key_pos,\n                attn_masks=attn_masks,\n                query_key_padding_mask=query_key_padding_mask,\n                key_padding_mask=key_padding_mask,\n                **kwargs)\n        return query\n"
  },
  {
    "path": "mmcv/cnn/bricks/upsample.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport inspect\nfrom typing import Dict\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.model import xavier_init\nfrom mmengine.registry import MODELS\n\nMODELS.register_module('nearest', module=nn.Upsample)\nMODELS.register_module('bilinear', module=nn.Upsample)\n\n\n@MODELS.register_module(name='pixel_shuffle')\nclass PixelShufflePack(nn.Module):\n    \"\"\"Pixel Shuffle upsample layer.\n\n    This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to\n    achieve a simple upsampling with pixel shuffle.\n\n    Args:\n        in_channels (int): Number of input channels.\n        out_channels (int): Number of output channels.\n        scale_factor (int): Upsample ratio.\n        upsample_kernel (int): Kernel size of the conv layer to expand the\n            channels.\n    \"\"\"\n\n    def __init__(self, in_channels: int, out_channels: int, scale_factor: int,\n                 upsample_kernel: int):\n        super().__init__()\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.scale_factor = scale_factor\n        self.upsample_kernel = upsample_kernel\n        self.upsample_conv = nn.Conv2d(\n            self.in_channels,\n            self.out_channels * scale_factor * scale_factor,\n            self.upsample_kernel,\n            padding=(self.upsample_kernel - 1) // 2)\n        self.init_weights()\n\n    def init_weights(self):\n        xavier_init(self.upsample_conv, distribution='uniform')\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        x = self.upsample_conv(x)\n        x = F.pixel_shuffle(x, self.scale_factor)\n        return x\n\n\ndef build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module:\n    \"\"\"Build upsample layer.\n\n    Args:\n        cfg (dict): The upsample layer config, which should contain:\n\n            - type (str): Layer type.\n            - scale_factor 
(int): Upsample ratio, which is not applicable to\n              deconv.\n            - layer args: Args needed to instantiate an upsample layer.\n        args (argument list): Arguments passed to the ``__init__``\n            method of the corresponding upsample layer.\n        kwargs (keyword arguments): Keyword arguments passed to the\n            ``__init__`` method of the corresponding upsample layer.\n\n    Returns:\n        nn.Module: Created upsample layer.\n    \"\"\"\n    if not isinstance(cfg, dict):\n        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')\n    if 'type' not in cfg:\n        raise KeyError(\n            f'the cfg dict must contain the key \"type\", but got {cfg}')\n    cfg_ = cfg.copy()\n\n    layer_type = cfg_.pop('type')\n\n    if inspect.isclass(layer_type):\n        upsample = layer_type\n    # Switch registry to the target scope. If `upsample` cannot be found\n    # in the registry, fallback to search `upsample` in the\n    # mmengine.MODELS.\n    else:\n        with MODELS.switch_scope_and_registry(None) as registry:\n            upsample = registry.get(layer_type)\n        if upsample is None:\n            raise KeyError(f'Cannot find {upsample} in registry under scope '\n                           f'name {registry.scope}')\n        if upsample is nn.Upsample:\n            cfg_['mode'] = layer_type\n    layer = upsample(*args, **kwargs, **cfg_)\n    return layer\n"
  },
  {
    "path": "mmcv/cnn/bricks/wrappers.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nr\"\"\"Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py  # noqa: E501\n\nWrap some nn modules to support empty tensor input. Currently, these wrappers\nare mainly used in mask heads like fcn_mask_head and maskiou_heads since mask\nheads are trained on only positive RoIs.\n\"\"\"\nimport math\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\nfrom torch.nn.modules.utils import _pair, _triple\n\nif torch.__version__ == 'parrots':\n    TORCH_VERSION = torch.__version__\nelse:\n    # torch.__version__ could be 1.3.1+cu92, we only need the first two\n    # for comparison\n    TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])\n\n\ndef obsolete_torch_version(torch_version, version_threshold) -> bool:\n    return torch_version == 'parrots' or torch_version <= version_threshold\n\n\nclass NewEmptyTensorOp(torch.autograd.Function):\n\n    @staticmethod\n    def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor:\n        ctx.shape = x.shape\n        return x.new_empty(new_shape)\n\n    @staticmethod\n    def backward(ctx, grad: torch.Tensor) -> tuple:\n        shape = ctx.shape\n        return NewEmptyTensorOp.apply(grad, shape), None\n\n\n@MODELS.register_module('Conv', force=True)\nclass Conv2d(nn.Conv2d):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:\n            out_shape = [x.shape[0], self.out_channels]\n            for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,\n                                     self.padding, self.stride, self.dilation):\n                o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1\n                out_shape.append(o)\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            if self.training:\n                # produce dummy gradient to avoid DDP warning.\n   
             dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0\n                return empty + dummy\n            else:\n                return empty\n\n        return super().forward(x)\n\n\n@MODELS.register_module('Conv3d', force=True)\nclass Conv3d(nn.Conv3d):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:\n            out_shape = [x.shape[0], self.out_channels]\n            for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,\n                                     self.padding, self.stride, self.dilation):\n                o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1\n                out_shape.append(o)\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            if self.training:\n                # produce dummy gradient to avoid DDP warning.\n                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0\n                return empty + dummy\n            else:\n                return empty\n\n        return super().forward(x)\n\n\n@MODELS.register_module()\n@MODELS.register_module('deconv')\nclass ConvTranspose2d(nn.ConvTranspose2d):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:\n            out_shape = [x.shape[0], self.out_channels]\n            for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,\n                                         self.padding, self.stride,\n                                         self.dilation, self.output_padding):\n                out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            if self.training:\n                # produce dummy gradient to avoid DDP warning.\n                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0\n                return empty + dummy\n            else:\n                return empty\n\n  
      return super().forward(x)\n\n\n@MODELS.register_module()\n@MODELS.register_module('deconv3d')\nclass ConvTranspose3d(nn.ConvTranspose3d):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0:\n            out_shape = [x.shape[0], self.out_channels]\n            for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,\n                                         self.padding, self.stride,\n                                         self.dilation, self.output_padding):\n                out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            if self.training:\n                # produce dummy gradient to avoid DDP warning.\n                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0\n                return empty + dummy\n            else:\n                return empty\n\n        return super().forward(x)\n\n\nclass MaxPool2d(nn.MaxPool2d):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        # PyTorch 1.9 does not support empty tensor inference yet\n        if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0:\n            out_shape = list(x.shape[:2])\n            for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),\n                                     _pair(self.padding), _pair(self.stride),\n                                     _pair(self.dilation)):\n                o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1\n                o = math.ceil(o) if self.ceil_mode else math.floor(o)\n                out_shape.append(o)\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            return empty\n\n        return super().forward(x)\n\n\nclass MaxPool3d(nn.MaxPool3d):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        # PyTorch 1.9 does not support empty tensor inference yet\n        if obsolete_torch_version(TORCH_VERSION, (1, 
9)) and x.numel() == 0:\n            out_shape = list(x.shape[:2])\n            for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),\n                                     _triple(self.padding),\n                                     _triple(self.stride),\n                                     _triple(self.dilation)):\n                o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1\n                o = math.ceil(o) if self.ceil_mode else math.floor(o)\n                out_shape.append(o)\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            return empty\n\n        return super().forward(x)\n\n\nclass Linear(torch.nn.Linear):\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        # empty tensor forward of Linear layer is supported in Pytorch 1.6\n        if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0:\n            out_shape = [x.shape[0], self.out_features]\n            empty = NewEmptyTensorOp.apply(x, out_shape)\n            if self.training:\n                # produce dummy gradient to avoid DDP warning.\n                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0\n                return empty + dummy\n            else:\n                return empty\n\n        return super().forward(x)\n"
  },
  {
    "path": "mmcv/cnn/resnet.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport logging\nfrom typing import Optional, Sequence, Tuple, Union\n\nimport torch.nn as nn\nimport torch.utils.checkpoint as cp\nfrom mmengine.model import constant_init, kaiming_init\nfrom mmengine.runner import load_checkpoint\nfrom torch import Tensor\n\n\ndef conv3x3(in_planes: int,\n            out_planes: int,\n            stride: int = 1,\n            dilation: int = 1):\n    \"\"\"3x3 convolution with padding.\"\"\"\n    return nn.Conv2d(\n        in_planes,\n        out_planes,\n        kernel_size=3,\n        stride=stride,\n        padding=dilation,\n        dilation=dilation,\n        bias=False)\n\n\nclass BasicBlock(nn.Module):\n    expansion = 1\n\n    def __init__(self,\n                 inplanes: int,\n                 planes: int,\n                 stride: int = 1,\n                 dilation: int = 1,\n                 downsample: Optional[nn.Module] = None,\n                 style: str = 'pytorch',\n                 with_cp: bool = False):\n        super().__init__()\n        assert style in ['pytorch', 'caffe']\n        self.conv1 = conv3x3(inplanes, planes, stride, dilation)\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.relu = nn.ReLU(inplace=True)\n        self.conv2 = conv3x3(planes, planes)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.downsample = downsample\n        self.stride = stride\n        self.dilation = dilation\n        assert not with_cp\n\n    def forward(self, x: Tensor) -> Tensor:\n        residual = x\n\n        out = self.conv1(x)\n        out = self.bn1(out)\n        out = self.relu(out)\n\n        out = self.conv2(out)\n        out = self.bn2(out)\n\n        if self.downsample is not None:\n            residual = self.downsample(x)\n\n        out += residual\n        out = self.relu(out)\n\n        return out\n\n\nclass Bottleneck(nn.Module):\n    expansion = 4\n\n    def __init__(self,\n                 inplanes: int,\n                
 planes: int,\n                 stride: int = 1,\n                 dilation: int = 1,\n                 downsample: Optional[nn.Module] = None,\n                 style: str = 'pytorch',\n                 with_cp: bool = False):\n        \"\"\"Bottleneck block.\n\n        If style is \"pytorch\", the stride-two layer is the 3x3 conv layer, if\n        it is \"caffe\", the stride-two layer is the first 1x1 conv layer.\n        \"\"\"\n        super().__init__()\n        assert style in ['pytorch', 'caffe']\n        if style == 'pytorch':\n            conv1_stride = 1\n            conv2_stride = stride\n        else:\n            conv1_stride = stride\n            conv2_stride = 1\n        self.conv1 = nn.Conv2d(\n            inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)\n        self.conv2 = nn.Conv2d(\n            planes,\n            planes,\n            kernel_size=3,\n            stride=conv2_stride,\n            padding=dilation,\n            dilation=dilation,\n            bias=False)\n\n        self.bn1 = nn.BatchNorm2d(planes)\n        self.bn2 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(\n            planes, planes * self.expansion, kernel_size=1, bias=False)\n        self.bn3 = nn.BatchNorm2d(planes * self.expansion)\n        self.relu = nn.ReLU(inplace=True)\n        self.downsample = downsample\n        self.stride = stride\n        self.dilation = dilation\n        self.with_cp = with_cp\n\n    def forward(self, x: Tensor) -> Tensor:\n\n        def _inner_forward(x):\n            residual = x\n\n            out = self.conv1(x)\n            out = self.bn1(out)\n            out = self.relu(out)\n\n            out = self.conv2(out)\n            out = self.bn2(out)\n            out = self.relu(out)\n\n            out = self.conv3(out)\n            out = self.bn3(out)\n\n            if self.downsample is not None:\n                residual = self.downsample(x)\n\n            out += residual\n\n            return out\n\n        
if self.with_cp and x.requires_grad:\n            out = cp.checkpoint(_inner_forward, x)\n        else:\n            out = _inner_forward(x)\n\n        out = self.relu(out)\n\n        return out\n\n\ndef make_res_layer(block: nn.Module,\n                   inplanes: int,\n                   planes: int,\n                   blocks: int,\n                   stride: int = 1,\n                   dilation: int = 1,\n                   style: str = 'pytorch',\n                   with_cp: bool = False) -> nn.Module:\n    downsample = None\n    if stride != 1 or inplanes != planes * block.expansion:\n        downsample = nn.Sequential(\n            nn.Conv2d(\n                inplanes,\n                planes * block.expansion,\n                kernel_size=1,\n                stride=stride,\n                bias=False),\n            nn.BatchNorm2d(planes * block.expansion),\n        )\n\n    layers = []\n    layers.append(\n        block(\n            inplanes,\n            planes,\n            stride,\n            dilation,\n            downsample,\n            style=style,\n            with_cp=with_cp))\n    inplanes = planes * block.expansion\n    for _ in range(1, blocks):\n        layers.append(\n            block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))\n\n    return nn.Sequential(*layers)\n\n\nclass ResNet(nn.Module):\n    \"\"\"ResNet backbone.\n\n    Args:\n        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.\n        num_stages (int): Resnet stages, normally 4.\n        strides (Sequence[int]): Strides of the first block of each stage.\n        dilations (Sequence[int]): Dilation of each stage.\n        out_indices (Sequence[int]): Output from which stages.\n        style (str): `pytorch` or `caffe`. If set to \"pytorch\", the stride-two\n            layer is the 3x3 conv layer, otherwise the stride-two layer is\n            the first 1x1 conv layer.\n        frozen_stages (int): Stages to be frozen (all param fixed). 
-1 means\n            not freezing any parameters.\n        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze\n            running stats (mean and var).\n        bn_frozen (bool): Whether to freeze weight and bias of BN layers.\n        with_cp (bool): Use checkpoint or not. Using checkpoint will save some\n            memory while slowing down the training speed.\n    \"\"\"\n\n    arch_settings = {\n        18: (BasicBlock, (2, 2, 2, 2)),\n        34: (BasicBlock, (3, 4, 6, 3)),\n        50: (Bottleneck, (3, 4, 6, 3)),\n        101: (Bottleneck, (3, 4, 23, 3)),\n        152: (Bottleneck, (3, 8, 36, 3))\n    }\n\n    def __init__(self,\n                 depth: int,\n                 num_stages: int = 4,\n                 strides: Sequence[int] = (1, 2, 2, 2),\n                 dilations: Sequence[int] = (1, 1, 1, 1),\n                 out_indices: Sequence[int] = (0, 1, 2, 3),\n                 style: str = 'pytorch',\n                 frozen_stages: int = -1,\n                 bn_eval: bool = True,\n                 bn_frozen: bool = False,\n                 with_cp: bool = False):\n        super().__init__()\n        if depth not in self.arch_settings:\n            raise KeyError(f'invalid depth {depth} for resnet')\n        assert num_stages >= 1 and num_stages <= 4\n        block, stage_blocks = self.arch_settings[depth]\n        stage_blocks = stage_blocks[:num_stages]  # type: ignore\n        assert len(strides) == len(dilations) == num_stages\n        assert max(out_indices) < num_stages\n\n        self.out_indices = out_indices\n        self.style = style\n        self.frozen_stages = frozen_stages\n        self.bn_eval = bn_eval\n        self.bn_frozen = bn_frozen\n        self.with_cp = with_cp\n\n        self.inplanes: int = 64\n        self.conv1 = nn.Conv2d(\n            3, 64, kernel_size=7, stride=2, padding=3, bias=False)\n        self.bn1 = nn.BatchNorm2d(64)\n        self.relu = nn.ReLU(inplace=True)\n        self.maxpool = 
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\n\n        self.res_layers = []\n        for i, num_blocks in enumerate(stage_blocks):\n            stride = strides[i]\n            dilation = dilations[i]\n            planes = 64 * 2**i\n            res_layer = make_res_layer(\n                block,\n                self.inplanes,\n                planes,\n                num_blocks,\n                stride=stride,\n                dilation=dilation,\n                style=self.style,\n                with_cp=with_cp)\n            self.inplanes = planes * block.expansion  # type: ignore\n            layer_name = f'layer{i + 1}'\n            self.add_module(layer_name, res_layer)\n            self.res_layers.append(layer_name)\n\n        self.feat_dim = block.expansion * 64 * 2**(  # type: ignore\n            len(stage_blocks) - 1)\n\n    def init_weights(self, pretrained: Optional[str] = None) -> None:\n        if isinstance(pretrained, str):\n            logger = logging.getLogger()\n            load_checkpoint(self, pretrained, strict=False, logger=logger)\n        elif pretrained is None:\n            for m in self.modules():\n                if isinstance(m, nn.Conv2d):\n                    kaiming_init(m)\n                elif isinstance(m, nn.BatchNorm2d):\n                    constant_init(m, 1)\n        else:\n            raise TypeError('pretrained must be a str or None')\n\n    def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:\n        x = self.conv1(x)\n        x = self.bn1(x)\n        x = self.relu(x)\n        x = self.maxpool(x)\n        outs = []\n        for i, layer_name in enumerate(self.res_layers):\n            res_layer = getattr(self, layer_name)\n            x = res_layer(x)\n            if i in self.out_indices:\n                outs.append(x)\n        if len(outs) == 1:\n            return outs[0]\n        else:\n            return tuple(outs)\n\n    def train(self, mode: bool = True) -> None:\n        super().train(mode)\n     
   if self.bn_eval:\n            for m in self.modules():\n                if isinstance(m, nn.BatchNorm2d):\n                    m.eval()\n                    if self.bn_frozen:\n                        for params in m.parameters():\n                            params.requires_grad = False\n        if mode and self.frozen_stages >= 0:\n            for param in self.conv1.parameters():\n                param.requires_grad = False\n            for param in self.bn1.parameters():\n                param.requires_grad = False\n            self.bn1.eval()\n            self.bn1.weight.requires_grad = False\n            self.bn1.bias.requires_grad = False\n            for i in range(1, self.frozen_stages + 1):\n                mod = getattr(self, f'layer{i}')\n                mod.eval()\n                for param in mod.parameters():\n                    param.requires_grad = False\n"
  },
  {
    "path": "mmcv/cnn/rfsearch/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .operator import BaseConvRFSearchOp, Conv2dRFSearchOp\nfrom .search import RFSearchHook\n\n__all__ = ['BaseConvRFSearchOp', 'Conv2dRFSearchOp', 'RFSearchHook']\n"
  },
  {
    "path": "mmcv/cnn/rfsearch/operator.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom mmengine.logging import print_log\nfrom mmengine.model import BaseModule\nfrom torch import Tensor\n\nfrom .utils import expand_rates, get_single_padding\n\n\nclass BaseConvRFSearchOp(BaseModule):\n    \"\"\"Based class of ConvRFSearchOp.\n\n    Args:\n        op_layer (nn.Module): pytorch module, e,g, Conv2d\n        global_config (dict): config dict.\n    \"\"\"\n\n    def __init__(self, op_layer: nn.Module, global_config: dict):\n        super().__init__()\n        self.op_layer = op_layer\n        self.global_config = global_config\n\n    def normlize(self, weights: nn.Parameter) -> nn.Parameter:\n        \"\"\"Normalize weights.\n\n        Args:\n            weights (nn.Parameter): Weights to be normalized.\n\n        Returns:\n            nn.Parameters: Normalized weights.\n        \"\"\"\n        abs_weights = torch.abs(weights)\n        normalized_weights = abs_weights / torch.sum(abs_weights)\n        return normalized_weights\n\n\nclass Conv2dRFSearchOp(BaseConvRFSearchOp):\n    \"\"\"Enable Conv2d with receptive field searching ability.\n\n    Args:\n        op_layer (nn.Module): pytorch module, e,g, Conv2d\n        global_config (dict): config dict. 
Defaults to None.\n            By default this must include:\n\n            - \"init_alphas\": The value for initializing weights of each branch.\n            - \"num_branches\": The controller of the size of\n              search space (the number of branches).\n            - \"exp_rate\": The controller of the sparsity of search space.\n            - \"mmin\": The minimum dilation rate.\n            - \"mmax\": The maximum dilation rate.\n\n            Extra keys may exist, but are used by RFSearchHook, e.g., \"step\",\n            \"max_step\", \"search_interval\", and \"skip_layer\".\n        verbose (bool): Determines whether to print rf-next\n            related logging messages.\n            Defaults to True.\n    \"\"\"\n\n    def __init__(self,\n                 op_layer: nn.Module,\n                 global_config: dict,\n                 verbose: bool = True):\n        super().__init__(op_layer, global_config)\n        assert global_config is not None, 'global_config is None'\n        self.num_branches = global_config['num_branches']\n        assert self.num_branches in [2, 3]\n        self.verbose = verbose\n        init_dilation = op_layer.dilation\n        self.dilation_rates = expand_rates(init_dilation, global_config)\n        if self.op_layer.kernel_size[\n                0] == 1 or self.op_layer.kernel_size[0] % 2 == 0:\n            self.dilation_rates = [(op_layer.dilation[0], r[1])\n                                   for r in self.dilation_rates]\n        if self.op_layer.kernel_size[\n                1] == 1 or self.op_layer.kernel_size[1] % 2 == 0:\n            self.dilation_rates = [(r[0], op_layer.dilation[1])\n                                   for r in self.dilation_rates]\n\n        self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches))\n        if self.verbose:\n            print_log(f'Expand as {self.dilation_rates}', 'current')\n        nn.init.constant_(self.branch_weights, global_config['init_alphas'])\n\n    def 
forward(self, input: Tensor) -> Tensor:\n        norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)])\n        if len(self.dilation_rates) == 1:\n            outputs = [\n                nn.functional.conv2d(\n                    input,\n                    weight=self.op_layer.weight,\n                    bias=self.op_layer.bias,\n                    stride=self.op_layer.stride,\n                    padding=self.get_padding(self.dilation_rates[0]),\n                    dilation=self.dilation_rates[0],\n                    groups=self.op_layer.groups,\n                )\n            ]\n        else:\n            outputs = [\n                nn.functional.conv2d(\n                    input,\n                    weight=self.op_layer.weight,\n                    bias=self.op_layer.bias,\n                    stride=self.op_layer.stride,\n                    padding=self.get_padding(r),\n                    dilation=r,\n                    groups=self.op_layer.groups,\n                ) * norm_w[i] for i, r in enumerate(self.dilation_rates)\n            ]\n        output = outputs[0]\n        for i in range(1, len(self.dilation_rates)):\n            output += outputs[i]\n        return output\n\n    def estimate_rates(self) -> None:\n        \"\"\"Estimate new dilation rate based on trained branch_weights.\"\"\"\n        norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)])\n        if self.verbose:\n            print_log(\n                'Estimate dilation {} with weight {}.'.format(\n                    self.dilation_rates,\n                    norm_w.detach().cpu().numpy().tolist()), 'current')\n\n        sum0, sum1, w_sum = 0, 0, 0\n        for i in range(len(self.dilation_rates)):\n            sum0 += norm_w[i].item() * self.dilation_rates[i][0]\n            sum1 += norm_w[i].item() * self.dilation_rates[i][1]\n            w_sum += norm_w[i].item()\n        estimated = [\n            np.clip(\n                int(round(sum0 / 
w_sum)), self.global_config['mmin'],\n                self.global_config['mmax']).item(),\n            np.clip(\n                int(round(sum1 / w_sum)), self.global_config['mmin'],\n                self.global_config['mmax']).item()\n        ]\n        self.op_layer.dilation = tuple(estimated)\n        self.op_layer.padding = self.get_padding(self.op_layer.dilation)\n        self.dilation_rates = [tuple(estimated)]\n        if self.verbose:\n            print_log(f'Estimate as {tuple(estimated)}', 'current')\n\n    def expand_rates(self) -> None:\n        \"\"\"Expand dilation rate.\"\"\"\n        dilation = self.op_layer.dilation\n        dilation_rates = expand_rates(dilation, self.global_config)\n        if self.op_layer.kernel_size[\n                0] == 1 or self.op_layer.kernel_size[0] % 2 == 0:\n            dilation_rates = [(dilation[0], r[1]) for r in dilation_rates]\n        if self.op_layer.kernel_size[\n                1] == 1 or self.op_layer.kernel_size[1] % 2 == 0:\n            dilation_rates = [(r[0], dilation[1]) for r in dilation_rates]\n\n        self.dilation_rates = copy.deepcopy(dilation_rates)\n        if self.verbose:\n            print_log(f'Expand as {self.dilation_rates}', 'current')\n        nn.init.constant_(self.branch_weights,\n                          self.global_config['init_alphas'])\n\n    def get_padding(self, dilation) -> tuple:\n        padding = (get_single_padding(self.op_layer.kernel_size[0],\n                                      self.op_layer.stride[0], dilation[0]),\n                   get_single_padding(self.op_layer.kernel_size[1],\n                                      self.op_layer.stride[1], dilation[1]))\n        return padding\n"
  },
  {
    "path": "mmcv/cnn/rfsearch/search.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nfrom typing import Dict, Optional\n\nimport mmengine\nimport torch  # noqa\nimport torch.nn as nn\nfrom mmengine.hooks import Hook\nfrom mmengine.logging import print_log\nfrom mmengine.registry import HOOKS\n\nfrom .operator import BaseConvRFSearchOp, Conv2dRFSearchOp  # noqa\nfrom .utils import get_single_padding, write_to_json\n\n\n@HOOKS.register_module()\nclass RFSearchHook(Hook):\n    \"\"\"Receptive field search via dilation rates.\n\n    Please refer to `RF-Next: Efficient Receptive Field\n    Search for Convolutional Neural Networks\n    <https://arxiv.org/abs/2206.06637>`_ for more details.\n\n\n    Args:\n        mode (str, optional): It can be set to the following types:\n            'search', 'fixed_single_branch', or 'fixed_multi_branch'.\n            Defaults to 'search'.\n        config (Dict, optional): config dict of search.\n            By default this config contains \"search\",\n            and config[\"search\"] must include:\n\n            - \"step\": recording the current searching step.\n            - \"max_step\": The maximum number of searching steps\n              to update the structures.\n            - \"search_interval\": The interval (epoch/iteration)\n              between two updates.\n            - \"exp_rate\": The controller of the sparsity of search space.\n            - \"init_alphas\": The value for initializing weights of each branch.\n            - \"mmin\": The minimum dilation rate.\n            - \"mmax\": The maximum dilation rate.\n            - \"num_branches\": The controller of the size of\n              search space (the number of branches).\n            - \"skip_layer\": The modules in skip_layer will be ignored\n              during the receptive field search.\n        rfstructure_file (str, optional): Path to load searched receptive\n            fields of the model. 
Defaults to None.\n        by_epoch (bool, optional): Determine to perform step by epoch or\n            by iteration. If set to True, it will step by epoch. Otherwise, by\n            iteration. Defaults to True.\n        verbose (bool): Determines whether to print rf-next related logging\n            messages. Defaults to True.\n    \"\"\"\n\n    def __init__(self,\n                 mode: str = 'search',\n                 config: Dict = {},\n                 rfstructure_file: Optional[str] = None,\n                 by_epoch: bool = True,\n                 verbose: bool = True):\n        assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch']\n        assert config is not None\n        self.config = config\n        self.config['structure'] = {}\n        self.verbose = verbose\n        if rfstructure_file is not None:\n            rfstructure = mmengine.load(rfstructure_file)['structure']\n            self.config['structure'] = rfstructure\n        self.mode = mode\n        self.num_branches = self.config['search']['num_branches']\n        self.by_epoch = by_epoch\n\n    def init_model(self, model: nn.Module):\n        \"\"\"Init model with search ability.\n\n        Args:\n            model (nn.Module): pytorch model\n\n        Raises:\n            NotImplementedError: only support three modes:\n                search/fixed_single_branch/fixed_multi_branch\n        \"\"\"\n        if self.verbose:\n            print_log('RFSearch init begin.', 'current')\n        if self.mode == 'search':\n            if self.config['structure']:\n                self.set_model(model, search_op='Conv2d')\n            self.wrap_model(model, search_op='Conv2d')\n        elif self.mode == 'fixed_single_branch':\n            self.set_model(model, search_op='Conv2d')\n        elif self.mode == 'fixed_multi_branch':\n            self.set_model(model, search_op='Conv2d')\n            self.wrap_model(model, search_op='Conv2d')\n        else:\n            raise 
NotImplementedError\n        if self.verbose:\n            print_log('RFSearch init end.', 'current')\n\n    def after_train_epoch(self, runner):\n        \"\"\"Performs a dilation searching step after one training epoch.\"\"\"\n        if self.by_epoch and self.mode == 'search':\n            self.step(runner.model, runner.work_dir)\n\n    def after_train_iter(self, runner, batch_idx, data_batch, outputs):\n        \"\"\"Performs a dilation searching step after one training iteration.\"\"\"\n        if not self.by_epoch and self.mode == 'search':\n            self.step(runner.model, runner.work_dir)\n\n    def step(self, model: nn.Module, work_dir: str) -> None:\n        \"\"\"Performs a dilation searching step.\n\n        Args:\n            model (nn.Module): pytorch model\n            work_dir (str): Directory to save the searching results.\n        \"\"\"\n        self.config['search']['step'] += 1\n        if (self.config['search']['step']\n            ) % self.config['search']['search_interval'] == 0 and (self.config[\n                'search']['step']) < self.config['search']['max_step']:\n            self.estimate_and_expand(model)\n            for name, module in model.named_modules():\n                if isinstance(module, BaseConvRFSearchOp):\n                    self.config['structure'][name] = module.op_layer.dilation\n\n            write_to_json(\n                self.config,\n                os.path.join(\n                    work_dir,\n                    'local_search_config_step%d.json' %\n                    self.config['search']['step'],\n                ),\n            )\n\n    def estimate_and_expand(self, model: nn.Module) -> None:\n        \"\"\"Estimate and search for RFConvOp.\n\n        Args:\n            model (nn.Module): pytorch model\n        \"\"\"\n        for module in model.modules():\n            if isinstance(module, BaseConvRFSearchOp):\n                module.estimate_rates()\n                module.expand_rates()\n\n    def 
wrap_model(self,\n                   model: nn.Module,\n                   search_op: str = 'Conv2d',\n                   prefix: str = '') -> None:\n        \"\"\"Wrap model to support searchable conv op.\n\n        Args:\n            model (nn.Module): pytorch model\n            search_op (str): The module that uses RF search.\n                Defaults to 'Conv2d'.\n            init_rates (int, optional): Set to other initial dilation rates.\n                Defaults to None.\n            prefix (str): Prefix for function recursion. Defaults to ''.\n        \"\"\"\n        op = 'torch.nn.' + search_op\n        for name, module in model.named_children():\n            if prefix == '':\n                fullname = 'module.' + name\n            else:\n                fullname = prefix + '.' + name\n            if self.config['search']['skip_layer'] is not None:\n                if any(layer in fullname\n                       for layer in self.config['search']['skip_layer']):\n                    continue\n            if isinstance(module, eval(op)):\n                if 1 < module.kernel_size[0] and \\\n                    0 != module.kernel_size[0] % 2 or \\\n                    1 < module.kernel_size[1] and \\\n                        0 != module.kernel_size[1] % 2:\n                    moduleWrap = eval(search_op + 'RFSearchOp')(\n                        module, self.config['search'], self.verbose)\n                    moduleWrap = moduleWrap.to(module.weight.device)\n                    if self.verbose:\n                        print_log(\n                            'Wrap model %s to %s.' 
%\n                            (str(module), str(moduleWrap)), 'current')\n                    setattr(model, name, moduleWrap)\n            elif not isinstance(module, BaseConvRFSearchOp):\n                self.wrap_model(module, search_op, fullname)\n\n    def set_model(self,\n                  model: nn.Module,\n                  search_op: str = 'Conv2d',\n                  init_rates: Optional[int] = None,\n                  prefix: str = '') -> None:\n        \"\"\"Set model based on config.\n\n        Args:\n            model (nn.Module): pytorch model\n            config (Dict): config file\n            search_op (str): The module that uses RF search.\n                Defaults to 'Conv2d'.\n            init_rates (int, optional):  Set to other initial dilation rates.\n                Defaults to None.\n            prefix (str): Prefix for function recursion. Defaults to ''.\n        \"\"\"\n        op = 'torch.nn.' + search_op\n        for name, module in model.named_children():\n            if prefix == '':\n                fullname = 'module.' + name\n            else:\n                fullname = prefix + '.' 
+ name\n            if self.config['search']['skip_layer'] is not None:\n                if any(layer in fullname\n                       for layer in self.config['search']['skip_layer']):\n                    continue\n            if isinstance(module, eval(op)):\n                if 1 < module.kernel_size[0] and \\\n                    0 != module.kernel_size[0] % 2 or \\\n                    1 < module.kernel_size[1] and \\\n                        0 != module.kernel_size[1] % 2:\n                    if isinstance(self.config['structure'][fullname], int):\n                        self.config['structure'][fullname] = [\n                            self.config['structure'][fullname],\n                            self.config['structure'][fullname]\n                        ]\n                    module.dilation = (\n                        self.config['structure'][fullname][0],\n                        self.config['structure'][fullname][1],\n                    )\n                    module.padding = (\n                        get_single_padding(\n                            module.kernel_size[0], module.stride[0],\n                            self.config['structure'][fullname][0]),\n                        get_single_padding(\n                            module.kernel_size[1], module.stride[1],\n                            self.config['structure'][fullname][1]))\n                    setattr(model, name, module)\n                    if self.verbose:\n                        print_log(\n                            'Set module %s dilation as: [%d %d]' %\n                            (fullname, module.dilation[0], module.dilation[1]),\n                            'current')\n            elif not isinstance(module, BaseConvRFSearchOp):\n                self.set_model(module, search_op, init_rates, fullname)\n"
  },
  {
    "path": "mmcv/cnn/rfsearch/utils.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport mmengine\nimport numpy as np\n\n\ndef write_to_json(config: dict, filename: str):\n    \"\"\"Save config to json file.\n\n    Args:\n        config (dict): Config to be saved.\n        filename (str): Path to save config.\n    \"\"\"\n\n    with open(filename, 'w', encoding='utf-8') as f:\n        mmengine.dump(config, f, file_format='json')\n\n\ndef expand_rates(dilation: tuple, config: dict) -> list:\n    \"\"\"Expand dilation rate according to config.\n\n    Args:\n        dilation (tuple): Base dilation rates of the two kernel axes.\n        config (dict): config dict\n\n    Returns:\n        list: list of expanded dilation rates\n    \"\"\"\n    exp_rate = config['exp_rate']\n\n    large_rates = []\n    small_rates = []\n    for _ in range(config['num_branches'] // 2):\n        large_rates.append(\n            tuple([\n                np.clip(\n                    int(round((1 + exp_rate) * dilation[0])), config['mmin'],\n                    config['mmax']).item(),\n                np.clip(\n                    int(round((1 + exp_rate) * dilation[1])), config['mmin'],\n                    config['mmax']).item()\n            ]))\n        small_rates.append(\n            tuple([\n                np.clip(\n                    int(round((1 - exp_rate) * dilation[0])), config['mmin'],\n                    config['mmax']).item(),\n                np.clip(\n                    int(round((1 - exp_rate) * dilation[1])), config['mmin'],\n                    config['mmax']).item()\n            ]))\n\n    small_rates.reverse()\n\n    if config['num_branches'] % 2 == 0:\n        rate_list = small_rates + large_rates\n    else:\n        rate_list = small_rates + [dilation] + large_rates\n\n    unique_rate_list = list(set(rate_list))\n    unique_rate_list.sort(key=rate_list.index)\n    return unique_rate_list\n\n\ndef get_single_padding(kernel_size: int,\n                       stride: int = 1,\n                       dilation: int = 
1) -> int:\n    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2\n    return padding\n"
  },
  {
    "path": "mmcv/cnn/utils/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .flops_counter import get_model_complexity_info\nfrom .fuse_conv_bn import fuse_conv_bn\n\n__all__ = ['get_model_complexity_info', 'fuse_conv_bn']\n"
  },
  {
    "path": "mmcv/cnn/utils/flops_counter.py",
    "content": "# Modified from flops-counter.pytorch by Vladislav Sovrasov\n# original repo: https://github.com/sovrasov/flops-counter.pytorch\n\n# MIT License\n\n# Copyright (c) 2018 Vladislav Sovrasov\n\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n\n# The above copyright notice and this permission notice shall be included in\n# all copies or substantial portions of the Software.\n\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport sys\nimport warnings\nfrom functools import partial\nfrom typing import Any, Callable, Dict, Optional, TextIO, Tuple\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear,\n                             MaxPool2d, MaxPool3d)\n\n\ndef get_model_complexity_info(model: nn.Module,\n                              input_shape: tuple,\n                              print_per_layer_stat: bool = True,\n                              as_strings: bool = True,\n                              input_constructor: Optional[Callable] = None,\n                              flush: bool = False,\n                              ost: TextIO = sys.stdout) -> tuple:\n    \"\"\"Get complexity information of a model.\n\n    This method can calculate FLOPs and parameter counts of a model with\n    corresponding input shape. 
It can also print complexity information for\n    each layer in a model.\n\n    Supported layers are listed as below:\n        - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``.\n        - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``,\n          ``nn.LeakyReLU``, ``nn.ReLU6``.\n        - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``,\n          ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``,\n          ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``,\n          ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``,\n          ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``.\n        - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``,\n          ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``,\n          ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``.\n        - Linear: ``nn.Linear``.\n        - Deconvolution: ``nn.ConvTranspose2d``.\n        - Upsample: ``nn.Upsample``.\n\n    Args:\n        model (nn.Module): The model for complexity calculation.\n        input_shape (tuple): Input shape used for calculation.\n        print_per_layer_stat (bool): Whether to print complexity information\n            for each layer in a model. Default: True.\n        as_strings (bool): Output FLOPs and params counts in a string form.\n            Default: True.\n        input_constructor (None | callable): If specified, it takes a callable\n            method that generates input. otherwise, it will generate a random\n            tensor with input shape to calculate FLOPs. Default: None.\n        flush (bool): same as that in :func:`print`. Default: False.\n        ost (stream): same as ``file`` param in :func:`print`.\n            Default: sys.stdout.\n\n    Returns:\n        tuple[float | str]: If ``as_strings`` is set to True, it will return\n        FLOPs and parameter counts in a string format. 
otherwise, it will\n        return those in a float number format.\n    \"\"\"\n    assert type(input_shape) is tuple\n    assert len(input_shape) >= 1\n    assert isinstance(model, nn.Module)\n    flops_model = add_flops_counting_methods(model)\n    flops_model.eval()\n    flops_model.start_flops_count()\n    if input_constructor:\n        input = input_constructor(input_shape)\n        _ = flops_model(**input)\n    else:\n        try:\n            batch = torch.ones(()).new_empty(\n                (1, *input_shape),\n                dtype=next(flops_model.parameters()).dtype,\n                device=next(flops_model.parameters()).device)\n        except StopIteration:\n            # Avoid StopIteration for models which have no parameters,\n            # like `nn.Relu()`, `nn.AvgPool2d`, etc.\n            batch = torch.ones(()).new_empty((1, *input_shape))\n\n        _ = flops_model(batch)\n\n    flops_count, params_count = flops_model.compute_average_flops_cost()\n    if print_per_layer_stat:\n        print_model_with_flops(\n            flops_model, flops_count, params_count, ost=ost, flush=flush)\n    flops_model.stop_flops_count()\n\n    if as_strings:\n        return flops_to_string(flops_count), params_to_string(params_count)\n\n    return flops_count, params_count\n\n\ndef flops_to_string(flops: float,\n                    units: Optional[str] = 'GFLOPs',\n                    precision: int = 2) -> str:\n    \"\"\"Convert FLOPs number into a string.\n\n    Note that Here we take a multiply-add counts as one FLOP.\n\n    Args:\n        flops (float): FLOPs number to be converted.\n        units (str | None): Converted FLOPs units. Options are None, 'GFLOPs',\n            'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically\n            choose the most suitable unit for FLOPs. Default: 'GFLOPs'.\n        precision (int): Digit number after the decimal point. 
Default: 2.\n\n    Returns:\n        str: The converted FLOPs number with units.\n\n    Examples:\n        >>> flops_to_string(1e9)\n        '1.0 GFLOPs'\n        >>> flops_to_string(2e5, 'MFLOPs')\n        '0.2 MFLOPs'\n        >>> flops_to_string(3e-9, None)\n        '3e-09 FLOPs'\n    \"\"\"\n    if units is None:\n        if flops // 10**9 > 0:\n            return str(round(flops / 10.**9, precision)) + ' GFLOPs'\n        elif flops // 10**6 > 0:\n            return str(round(flops / 10.**6, precision)) + ' MFLOPs'\n        elif flops // 10**3 > 0:\n            return str(round(flops / 10.**3, precision)) + ' KFLOPs'\n        else:\n            return str(flops) + ' FLOPs'\n    else:\n        if units == 'GFLOPs':\n            return str(round(flops / 10.**9, precision)) + ' ' + units\n        elif units == 'MFLOPs':\n            return str(round(flops / 10.**6, precision)) + ' ' + units\n        elif units == 'KFLOPs':\n            return str(round(flops / 10.**3, precision)) + ' ' + units\n        else:\n            return str(flops) + ' FLOPs'\n\n\ndef params_to_string(num_params: float,\n                     units: Optional[str] = None,\n                     precision: int = 2) -> str:\n    \"\"\"Convert parameter number into a string.\n\n    Args:\n        num_params (float): Parameter number to be converted.\n        units (str | None): Converted parameter units. Options are None, 'M',\n            'K' and ''. If set to None, it will automatically choose the most\n            suitable unit for parameter number. Default: None.\n        precision (int): Digit number after the decimal point. 
Default: 2.\n\n    Returns:\n        str: The converted parameter number with units.\n\n    Examples:\n        >>> params_to_string(1e9)\n        '1000.0 M'\n        >>> params_to_string(2e5)\n        '200.0 k'\n        >>> params_to_string(3e-9)\n        '3e-09'\n    \"\"\"\n    if units is None:\n        if num_params // 10**6 > 0:\n            return str(round(num_params / 10**6, precision)) + ' M'\n        elif num_params // 10**3:\n            return str(round(num_params / 10**3, precision)) + ' k'\n        else:\n            return str(num_params)\n    else:\n        if units == 'M':\n            return str(round(num_params / 10.**6, precision)) + ' ' + units\n        elif units == 'K':\n            return str(round(num_params / 10.**3, precision)) + ' ' + units\n        else:\n            return str(num_params)\n\n\ndef print_model_with_flops(model: nn.Module,\n                           total_flops: float,\n                           total_params: float,\n                           units: Optional[str] = 'GFLOPs',\n                           precision: int = 3,\n                           ost: TextIO = sys.stdout,\n                           flush: bool = False) -> None:\n    \"\"\"Print a model with FLOPs for each layer.\n\n    Args:\n        model (nn.Module): The model to be printed.\n        total_flops (float): Total FLOPs of the model.\n        total_params (float): Total parameter counts of the model.\n        units (str | None): Converted FLOPs units. Default: 'GFLOPs'.\n        precision (int): Digit number after the decimal point. Default: 3.\n        ost (stream): same as `file` param in :func:`print`.\n            Default: sys.stdout.\n        flush (bool): same as that in :func:`print`. 
Default: False.\n\n    Example:\n        >>> class ExampleModel(nn.Module):\n\n        >>> def __init__(self):\n        >>>     super().__init__()\n        >>>     self.conv1 = nn.Conv2d(3, 8, 3)\n        >>>     self.conv2 = nn.Conv2d(8, 256, 3)\n        >>>     self.conv3 = nn.Conv2d(256, 8, 3)\n        >>>     self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))\n        >>>     self.flatten = nn.Flatten()\n        >>>     self.fc = nn.Linear(8, 1)\n\n        >>> def forward(self, x):\n        >>>     x = self.conv1(x)\n        >>>     x = self.conv2(x)\n        >>>     x = self.conv3(x)\n        >>>     x = self.avg_pool(x)\n        >>>     x = self.flatten(x)\n        >>>     x = self.fc(x)\n        >>>     return x\n\n        >>> model = ExampleModel()\n        >>> x = (3, 16, 16)\n        to print the complexity information state for each layer, you can use\n        >>> get_model_complexity_info(model, x)\n        or directly use\n        >>> print_model_with_flops(model, 4579784.0, 37361)\n        ExampleModel(\n          0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,\n          (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1))  # noqa: E501\n          (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))\n          (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))\n          (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))\n          (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )\n          (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)\n        )\n    \"\"\"\n\n    def accumulate_params(self):\n        if is_supported_instance(self):\n            return self.__params__\n        else:\n            sum = 0\n            for m in self.children():\n             
   sum += m.accumulate_params()\n            return sum\n\n    def accumulate_flops(self):\n        if is_supported_instance(self):\n            return self.__flops__ / model.__batch_counter__\n        else:\n            sum = 0\n            for m in self.children():\n                sum += m.accumulate_flops()\n            return sum\n\n    def flops_repr(self):\n        accumulated_num_params = self.accumulate_params()\n        accumulated_flops_cost = self.accumulate_flops()\n        return ', '.join([\n            params_to_string(\n                accumulated_num_params, units='M', precision=precision),\n            f'{accumulated_num_params / total_params:.3%} Params',\n            flops_to_string(\n                accumulated_flops_cost, units=units, precision=precision),\n            f'{accumulated_flops_cost / total_flops:.3%} FLOPs',\n            self.original_extra_repr()\n        ])\n\n    def add_extra_repr(m):\n        m.accumulate_flops = accumulate_flops.__get__(m)\n        m.accumulate_params = accumulate_params.__get__(m)\n        flops_extra_repr = flops_repr.__get__(m)\n        if m.extra_repr != flops_extra_repr:\n            m.original_extra_repr = m.extra_repr\n            m.extra_repr = flops_extra_repr\n            assert m.extra_repr != m.original_extra_repr\n\n    def del_extra_repr(m):\n        if hasattr(m, 'original_extra_repr'):\n            m.extra_repr = m.original_extra_repr\n            del m.original_extra_repr\n        if hasattr(m, 'accumulate_flops'):\n            del m.accumulate_flops\n\n    model.apply(add_extra_repr)\n    print(model, file=ost, flush=flush)\n    model.apply(del_extra_repr)\n\n\ndef get_model_parameters_number(model: nn.Module) -> float:\n    \"\"\"Calculate parameter number of a model.\n\n    Args:\n        model (nn.module): The model for parameter number calculation.\n\n    Returns:\n        float: Parameter number of the model.\n    \"\"\"\n    num_params = sum(p.numel() for p in model.parameters() if 
p.requires_grad)\n    return num_params\n\n\ndef add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module:\n    # adding additional methods to the existing module object,\n    # this is done this way so that each function has access to self object\n    net_main_module.start_flops_count = start_flops_count.__get__(  # type: ignore # noqa E501\n        net_main_module)\n    net_main_module.stop_flops_count = stop_flops_count.__get__(  # type: ignore # noqa E501\n        net_main_module)\n    net_main_module.reset_flops_count = reset_flops_count.__get__(  # type: ignore # noqa E501\n        net_main_module)\n    net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__(  # type: ignore # noqa E501\n        net_main_module)\n\n    net_main_module.reset_flops_count()\n\n    return net_main_module\n\n\ndef compute_average_flops_cost(self) -> Tuple[float, float]:\n    \"\"\"Compute average FLOPs cost.\n\n    A method to compute average FLOPs cost, which will be available after\n    `add_flops_counting_methods()` is called on a desired net object.\n\n    Returns:\n        float: Current mean flops consumption per image.\n    \"\"\"\n    batches_count = self.__batch_counter__\n    flops_sum = 0\n    for module in self.modules():\n        if is_supported_instance(module):\n            flops_sum += module.__flops__\n    params_sum = get_model_parameters_number(self)\n    return flops_sum / batches_count, params_sum\n\n\ndef start_flops_count(self) -> None:\n    \"\"\"Activate the computation of mean flops consumption per image.\n\n    A method to activate the computation of mean flops consumption per image.\n    which will be available after ``add_flops_counting_methods()`` is called on\n    a desired net object. 
It should be called before running the network.\n    \"\"\"\n    add_batch_counter_hook_function(self)\n\n    def add_flops_counter_hook_function(module: nn.Module) -> None:\n        if is_supported_instance(module):\n            if hasattr(module, '__flops_handle__'):\n                return\n\n            else:\n                handle = module.register_forward_hook(\n                    get_modules_mapping()[type(module)])\n\n            module.__flops_handle__ = handle\n\n    self.apply(partial(add_flops_counter_hook_function))\n\n\ndef stop_flops_count(self) -> None:\n    \"\"\"Stop computing the mean flops consumption per image.\n\n    A method to stop computing the mean flops consumption per image, which will\n    be available after ``add_flops_counting_methods()`` is called on a desired\n    net object. It can be called to pause the computation whenever.\n    \"\"\"\n    remove_batch_counter_hook_function(self)\n    self.apply(remove_flops_counter_hook_function)\n\n\ndef reset_flops_count(self) -> None:\n    \"\"\"Reset statistics computed so far.\n\n    A method to Reset computed statistics, which will be available after\n    `add_flops_counting_methods()` is called on a desired net object.\n    \"\"\"\n    add_batch_counter_variables_or_reset(self)\n    self.apply(add_flops_counter_variable_or_reset)\n\n\n# ---- Internal functions\ndef empty_flops_counter_hook(module: nn.Module, input: tuple,\n                             output: Any) -> None:\n    module.__flops__ += 0\n\n\ndef upsample_flops_counter_hook(module: nn.Module, input: tuple,\n                                output: torch.Tensor) -> None:\n    output_size = output[0]\n    batch_size = output_size.shape[0]\n    output_elements_count = batch_size\n    for val in output_size.shape[1:]:\n        output_elements_count *= val\n    module.__flops__ += int(output_elements_count)\n\n\ndef relu_flops_counter_hook(module: nn.Module, input: tuple,\n                            output: torch.Tensor) -> 
None:\n    active_elements_count = output.numel()\n    module.__flops__ += int(active_elements_count)\n\n\ndef linear_flops_counter_hook(module: nn.Module, input: tuple,\n                              output: torch.Tensor) -> None:\n    output_last_dim = output.shape[\n        -1]  # pytorch checks dimensions, so here we don't care much\n    module.__flops__ += int(np.prod(input[0].shape) * output_last_dim)\n\n\ndef pool_flops_counter_hook(module: nn.Module, input: tuple,\n                            output: torch.Tensor) -> None:\n    module.__flops__ += int(np.prod(input[0].shape))\n\n\ndef norm_flops_counter_hook(module: nn.Module, input: tuple,\n                            output: torch.Tensor) -> None:\n    batch_flops = np.prod(input[0].shape)\n    if (getattr(module, 'affine', False)\n            or getattr(module, 'elementwise_affine', False)):\n        batch_flops *= 2\n    module.__flops__ += int(batch_flops)\n\n\ndef deconv_flops_counter_hook(conv_module: nn.Module, input: tuple,\n                              output: torch.Tensor) -> None:\n    # Can have multiple inputs, getting the first one\n    batch_size = input[0].shape[0]\n    input_height, input_width = input[0].shape[2:]\n\n    kernel_height, kernel_width = conv_module.kernel_size\n    in_channels = conv_module.in_channels\n    out_channels = conv_module.out_channels\n    groups = conv_module.groups\n\n    filters_per_channel = out_channels // groups\n    conv_per_position_flops = (\n        kernel_height * kernel_width * in_channels * filters_per_channel)\n\n    active_elements_count = batch_size * input_height * input_width\n    overall_conv_flops = conv_per_position_flops * active_elements_count\n    bias_flops = 0\n    if conv_module.bias is not None:\n        output_height, output_width = output.shape[2:]\n        bias_flops = out_channels * batch_size * output_height * output_width\n    overall_flops = overall_conv_flops + bias_flops\n\n    conv_module.__flops__ += 
int(overall_flops)\n\n\ndef conv_flops_counter_hook(conv_module: nn.Module, input: tuple,\n                            output: torch.Tensor) -> None:\n    # Can have multiple inputs, getting the first one\n    batch_size = input[0].shape[0]\n    output_dims = list(output.shape[2:])\n\n    kernel_dims = list(conv_module.kernel_size)\n    in_channels = conv_module.in_channels\n    out_channels = conv_module.out_channels\n    groups = conv_module.groups\n\n    filters_per_channel = out_channels // groups\n    conv_per_position_flops = int(\n        np.prod(kernel_dims)) * in_channels * filters_per_channel\n\n    active_elements_count = batch_size * int(np.prod(output_dims))\n\n    overall_conv_flops = conv_per_position_flops * active_elements_count\n\n    bias_flops = 0\n\n    if conv_module.bias is not None:\n\n        bias_flops = out_channels * active_elements_count\n\n    overall_flops = overall_conv_flops + bias_flops\n\n    conv_module.__flops__ += int(overall_flops)\n\n\ndef batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None:\n    batch_size = 1\n    if len(input) > 0:\n        # Can have multiple inputs, getting the first one\n        batch_size = len(input[0])\n    else:\n        warnings.warn('No positional inputs found for a module, '\n                      'assuming batch size is 1.')\n    module.__batch_counter__ += batch_size\n\n\ndef add_batch_counter_variables_or_reset(module: nn.Module) -> None:\n\n    module.__batch_counter__ = 0\n\n\ndef add_batch_counter_hook_function(module: nn.Module) -> None:\n    if hasattr(module, '__batch_counter_handle__'):\n        return\n\n    handle = module.register_forward_hook(batch_counter_hook)\n    module.__batch_counter_handle__ = handle\n\n\ndef remove_batch_counter_hook_function(module: nn.Module) -> None:\n    if hasattr(module, '__batch_counter_handle__'):\n        module.__batch_counter_handle__.remove()\n        del module.__batch_counter_handle__\n\n\ndef 
add_flops_counter_variable_or_reset(module: nn.Module) -> None:\n    if is_supported_instance(module):\n        if hasattr(module, '__flops__') or hasattr(module, '__params__'):\n            warnings.warn('variables __flops__ or __params__ are already '\n                          'defined for the module' + type(module).__name__ +\n                          ' ptflops can affect your code!')\n        module.__flops__ = 0\n        module.__params__ = get_model_parameters_number(module)\n\n\ndef is_supported_instance(module: nn.Module) -> bool:\n    if type(module) in get_modules_mapping():\n        return True\n    return False\n\n\ndef remove_flops_counter_hook_function(module: nn.Module) -> None:\n    if is_supported_instance(module):\n        if hasattr(module, '__flops_handle__'):\n            module.__flops_handle__.remove()\n            del module.__flops_handle__\n\n\ndef get_modules_mapping() -> Dict:\n    return {\n        # convolutions\n        nn.Conv1d: conv_flops_counter_hook,\n        nn.Conv2d: conv_flops_counter_hook,\n        Conv2d: conv_flops_counter_hook,\n        nn.Conv3d: conv_flops_counter_hook,\n        Conv3d: conv_flops_counter_hook,\n        # activations\n        nn.ReLU: relu_flops_counter_hook,\n        nn.PReLU: relu_flops_counter_hook,\n        nn.ELU: relu_flops_counter_hook,\n        nn.LeakyReLU: relu_flops_counter_hook,\n        nn.ReLU6: relu_flops_counter_hook,\n        # poolings\n        nn.MaxPool1d: pool_flops_counter_hook,\n        nn.AvgPool1d: pool_flops_counter_hook,\n        nn.AvgPool2d: pool_flops_counter_hook,\n        nn.MaxPool2d: pool_flops_counter_hook,\n        MaxPool2d: pool_flops_counter_hook,\n        nn.MaxPool3d: pool_flops_counter_hook,\n        MaxPool3d: pool_flops_counter_hook,\n        nn.AvgPool3d: pool_flops_counter_hook,\n        nn.AdaptiveMaxPool1d: pool_flops_counter_hook,\n        nn.AdaptiveAvgPool1d: pool_flops_counter_hook,\n        nn.AdaptiveMaxPool2d: pool_flops_counter_hook,\n        
nn.AdaptiveAvgPool2d: pool_flops_counter_hook,\n        nn.AdaptiveMaxPool3d: pool_flops_counter_hook,\n        nn.AdaptiveAvgPool3d: pool_flops_counter_hook,\n        # normalizations\n        nn.BatchNorm1d: norm_flops_counter_hook,\n        nn.BatchNorm2d: norm_flops_counter_hook,\n        nn.BatchNorm3d: norm_flops_counter_hook,\n        nn.GroupNorm: norm_flops_counter_hook,\n        nn.InstanceNorm1d: norm_flops_counter_hook,\n        nn.InstanceNorm2d: norm_flops_counter_hook,\n        nn.InstanceNorm3d: norm_flops_counter_hook,\n        nn.LayerNorm: norm_flops_counter_hook,\n        # FC\n        nn.Linear: linear_flops_counter_hook,\n        Linear: linear_flops_counter_hook,\n        # Upscale\n        nn.Upsample: upsample_flops_counter_hook,\n        # Deconvolution\n        nn.ConvTranspose2d: deconv_flops_counter_hook,\n        ConvTranspose2d: deconv_flops_counter_hook,\n    }\n"
  },
  {
    "path": "mmcv/cnn/utils/fuse_conv_bn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\n\n\ndef _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module:\n    \"\"\"Fuse conv and bn into one module.\n\n    Args:\n        conv (nn.Module): Conv to be fused.\n        bn (nn.Module): BN to be fused.\n\n    Returns:\n        nn.Module: Fused module.\n    \"\"\"\n    conv_w = conv.weight\n    conv_b = conv.bias if conv.bias is not None else torch.zeros_like(\n        bn.running_mean)\n\n    factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)\n    conv.weight = nn.Parameter(conv_w *\n                               factor.reshape([conv.out_channels, 1, 1, 1]))\n    conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)\n    return conv\n\n\ndef fuse_conv_bn(module: nn.Module) -> nn.Module:\n    \"\"\"Recursively fuse conv and bn in a module.\n\n    During inference, the functionary of batch norm layers is turned off\n    but only the mean and var alone channels are used, which exposes the\n    chance to fuse it with the preceding conv layers to save computations and\n    simplify network structures.\n\n    Args:\n        module (nn.Module): Module to be fused.\n\n    Returns:\n        nn.Module: Fused module.\n    \"\"\"\n    last_conv = None\n    last_conv_name = None\n\n    for name, child in module.named_children():\n        if isinstance(child,\n                      (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)):\n            if last_conv is None:  # only fuse BN that is after Conv\n                continue\n            fused_conv = _fuse_conv_bn(last_conv, child)\n            module._modules[last_conv_name] = fused_conv\n            # To reduce changes, set BN as Identity instead of deleting it.\n            module._modules[name] = nn.Identity()\n            last_conv = None\n        elif isinstance(child, nn.Conv2d):\n            last_conv = child\n            last_conv_name = name\n        else:\n            
fuse_conv_bn(child)\n    return module\n"
  },
  {
    "path": "mmcv/cnn/vgg.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport logging\nfrom typing import List, Optional, Sequence, Tuple, Union\n\nimport torch.nn as nn\nfrom mmengine.model import constant_init, kaiming_init, normal_init\nfrom mmengine.runner import load_checkpoint\nfrom torch import Tensor\n\n\ndef conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module:\n    \"\"\"3x3 convolution with padding.\"\"\"\n    return nn.Conv2d(\n        in_planes,\n        out_planes,\n        kernel_size=3,\n        padding=dilation,\n        dilation=dilation)\n\n\ndef make_vgg_layer(inplanes: int,\n                   planes: int,\n                   num_blocks: int,\n                   dilation: int = 1,\n                   with_bn: bool = False,\n                   ceil_mode: bool = False) -> List[nn.Module]:\n    layers = []\n    for _ in range(num_blocks):\n        layers.append(conv3x3(inplanes, planes, dilation))\n        if with_bn:\n            layers.append(nn.BatchNorm2d(planes))\n        layers.append(nn.ReLU(inplace=True))\n        inplanes = planes\n    layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))\n\n    return layers\n\n\nclass VGG(nn.Module):\n    \"\"\"VGG backbone.\n\n    Args:\n        depth (int): Depth of vgg, from {11, 13, 16, 19}.\n        with_bn (bool): Use BatchNorm or not.\n        num_classes (int): number of classes for classification.\n        num_stages (int): VGG stages, normally 5.\n        dilations (Sequence[int]): Dilation of each stage.\n        out_indices (Sequence[int]): Output from which stages.\n        frozen_stages (int): Stages to be frozen (all param fixed). 
-1 means\n            not freezing any parameters.\n        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze\n            running stats (mean and var).\n        bn_frozen (bool): Whether to freeze weight and bias of BN layers.\n    \"\"\"\n\n    arch_settings = {\n        11: (1, 1, 2, 2, 2),\n        13: (2, 2, 2, 2, 2),\n        16: (2, 2, 3, 3, 3),\n        19: (2, 2, 4, 4, 4)\n    }\n\n    def __init__(self,\n                 depth: int,\n                 with_bn: bool = False,\n                 num_classes: int = -1,\n                 num_stages: int = 5,\n                 dilations: Sequence[int] = (1, 1, 1, 1, 1),\n                 out_indices: Sequence[int] = (0, 1, 2, 3, 4),\n                 frozen_stages: int = -1,\n                 bn_eval: bool = True,\n                 bn_frozen: bool = False,\n                 ceil_mode: bool = False,\n                 with_last_pool: bool = True):\n        super().__init__()\n        if depth not in self.arch_settings:\n            raise KeyError(f'invalid depth {depth} for vgg')\n        assert num_stages >= 1 and num_stages <= 5\n        stage_blocks = self.arch_settings[depth]\n        self.stage_blocks = stage_blocks[:num_stages]\n        assert len(dilations) == num_stages\n        assert max(out_indices) <= num_stages\n\n        self.num_classes = num_classes\n        self.out_indices = out_indices\n        self.frozen_stages = frozen_stages\n        self.bn_eval = bn_eval\n        self.bn_frozen = bn_frozen\n\n        self.inplanes = 3\n        start_idx = 0\n        vgg_layers = []\n        self.range_sub_modules = []\n        for i, num_blocks in enumerate(self.stage_blocks):\n            num_modules = num_blocks * (2 + with_bn) + 1\n            end_idx = start_idx + num_modules\n            dilation = dilations[i]\n            planes = 64 * 2**i if i < 4 else 512\n            vgg_layer = make_vgg_layer(\n                self.inplanes,\n                planes,\n                
num_blocks,\n                dilation=dilation,\n                with_bn=with_bn,\n                ceil_mode=ceil_mode)\n            vgg_layers.extend(vgg_layer)\n            self.inplanes = planes\n            self.range_sub_modules.append([start_idx, end_idx])\n            start_idx = end_idx\n        if not with_last_pool:\n            vgg_layers.pop(-1)\n            self.range_sub_modules[-1][1] -= 1\n        self.module_name = 'features'\n        self.add_module(self.module_name, nn.Sequential(*vgg_layers))\n\n        if self.num_classes > 0:\n            self.classifier = nn.Sequential(\n                nn.Linear(512 * 7 * 7, 4096),\n                nn.ReLU(True),\n                nn.Dropout(),\n                nn.Linear(4096, 4096),\n                nn.ReLU(True),\n                nn.Dropout(),\n                nn.Linear(4096, num_classes),\n            )\n\n    def init_weights(self, pretrained: Optional[str] = None) -> None:\n        if isinstance(pretrained, str):\n            logger = logging.getLogger()\n            load_checkpoint(self, pretrained, strict=False, logger=logger)\n        elif pretrained is None:\n            for m in self.modules():\n                if isinstance(m, nn.Conv2d):\n                    kaiming_init(m)\n                elif isinstance(m, nn.BatchNorm2d):\n                    constant_init(m, 1)\n                elif isinstance(m, nn.Linear):\n                    normal_init(m, std=0.01)\n        else:\n            raise TypeError('pretrained must be a str or None')\n\n    def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]:\n        outs = []\n        vgg_layers = getattr(self, self.module_name)\n        for i in range(len(self.stage_blocks)):\n            for j in range(*self.range_sub_modules[i]):\n                vgg_layer = vgg_layers[j]\n                x = vgg_layer(x)\n            if i in self.out_indices:\n                outs.append(x)\n        if self.num_classes > 0:\n            x = x.view(x.size(0), 
-1)\n            x = self.classifier(x)\n            outs.append(x)\n        if len(outs) == 1:\n            return outs[0]\n        else:\n            return tuple(outs)\n\n    def train(self, mode: bool = True) -> None:\n        super().train(mode)\n        if self.bn_eval:\n            for m in self.modules():\n                if isinstance(m, nn.BatchNorm2d):\n                    m.eval()\n                    if self.bn_frozen:\n                        for params in m.parameters():\n                            params.requires_grad = False\n        vgg_layers = getattr(self, self.module_name)\n        if mode and self.frozen_stages >= 0:\n            for i in range(self.frozen_stages):\n                for j in range(*self.range_sub_modules[i]):\n                    mod = vgg_layers[j]\n                    mod.eval()\n                    for param in mod.parameters():\n                        param.requires_grad = False\n"
  },
  {
    "path": "mmcv/image/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr,\n                         gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert,\n                         rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb)\nfrom .geometric import (cutout, imcrop, imflip, imflip_, impad,\n                        impad_to_multiple, imrescale, imresize, imresize_like,\n                        imresize_to_multiple, imrotate, imshear, imtranslate,\n                        rescale_size)\nfrom .io import imfrombytes, imread, imwrite, supported_backends, use_backend\nfrom .misc import tensor2imgs\nfrom .photometric import (adjust_brightness, adjust_color, adjust_contrast,\n                          adjust_hue, adjust_lighting, adjust_sharpness,\n                          auto_contrast, clahe, imdenormalize, imequalize,\n                          iminvert, imnormalize, imnormalize_, lut_transform,\n                          posterize, solarize)\n\n__all__ = [\n    'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb',\n    'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale',\n    'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size',\n    'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate',\n    'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend',\n    'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize',\n    'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr',\n    'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize',\n    'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe',\n    'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting',\n    'adjust_hue'\n]\n"
  },
  {
    "path": "mmcv/image/colorspace.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Callable, Union\n\nimport cv2\nimport numpy as np\n\n\ndef imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray:\n    \"\"\"Convert an image from the src colorspace to dst colorspace.\n\n    Args:\n        img (ndarray): The input image.\n        src (str): The source colorspace, e.g., 'rgb', 'hsv'.\n        dst (str): The destination colorspace, e.g., 'rgb', 'hsv'.\n\n    Returns:\n        ndarray: The converted image.\n    \"\"\"\n    code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')\n    out_img = cv2.cvtColor(img, code)\n    return out_img\n\n\ndef bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:\n    \"\"\"Convert a BGR image to grayscale image.\n\n    Args:\n        img (ndarray): The input image.\n        keepdim (bool): If False (by default), then return the grayscale image\n            with 2 dims, otherwise 3 dims.\n\n    Returns:\n        ndarray: The converted grayscale image.\n    \"\"\"\n    out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n    if keepdim:\n        out_img = out_img[..., None]\n    return out_img\n\n\ndef rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:\n    \"\"\"Convert a RGB image to grayscale image.\n\n    Args:\n        img (ndarray): The input image.\n        keepdim (bool): If False (by default), then return the grayscale image\n            with 2 dims, otherwise 3 dims.\n\n    Returns:\n        ndarray: The converted grayscale image.\n    \"\"\"\n    out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)\n    if keepdim:\n        out_img = out_img[..., None]\n    return out_img\n\n\ndef gray2bgr(img: np.ndarray) -> np.ndarray:\n    \"\"\"Convert a grayscale image to BGR image.\n\n    Args:\n        img (ndarray): The input image.\n\n    Returns:\n        ndarray: The converted BGR image.\n    \"\"\"\n    img = img[..., None] if img.ndim == 2 else img\n    out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)\n    
return out_img\n\n\ndef gray2rgb(img: np.ndarray) -> np.ndarray:\n    \"\"\"Convert a grayscale image to RGB image.\n\n    Args:\n        img (ndarray): The input image.\n\n    Returns:\n        ndarray: The converted RGB image.\n    \"\"\"\n    img = img[..., None] if img.ndim == 2 else img\n    out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)\n    return out_img\n\n\ndef _convert_input_type_range(img: np.ndarray) -> np.ndarray:\n    \"\"\"Convert the type and range of the input image.\n\n    It converts the input image to np.float32 type and range of [0, 1].\n    It is mainly used for pre-processing the input image in colorspace\n    conversion functions such as rgb2ycbcr and ycbcr2rgb.\n\n    Args:\n        img (ndarray): The input image. It accepts:\n            1. np.uint8 type with range [0, 255];\n            2. np.float32 type with range [0, 1].\n\n    Returns:\n        (ndarray): The converted image with type of np.float32 and range of\n            [0, 1].\n    \"\"\"\n    img_type = img.dtype\n    img = img.astype(np.float32)\n    if img_type == np.float32:\n        pass\n    elif img_type == np.uint8:\n        img /= 255.\n    else:\n        raise TypeError('The img type should be np.float32 or np.uint8, '\n                        f'but got {img_type}')\n    return img\n\n\ndef _convert_output_type_range(\n        img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray:\n    \"\"\"Convert the type and range of the image according to dst_type.\n\n    It converts the image to desired type and range. If `dst_type` is np.uint8,\n    images will be converted to np.uint8 type with range [0, 255]. 
If\n    `dst_type` is np.float32, it converts the image to np.float32 type with\n    range [0, 1].\n    It is mainly used for post-processing images in colorspace conversion\n    functions such as rgb2ycbcr and ycbcr2rgb.\n\n    Args:\n        img (ndarray): The image to be converted with np.float32 type and\n            range [0, 255].\n        dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it\n            converts the image to np.uint8 type with range [0, 255]. If\n            dst_type is np.float32, it converts the image to np.float32 type\n            with range [0, 1].\n\n    Returns:\n        (ndarray): The converted image with desired type and range.\n    \"\"\"\n    if dst_type not in (np.uint8, np.float32):\n        raise TypeError('The dst_type should be np.float32 or np.uint8, '\n                        f'but got {dst_type}')\n    if dst_type == np.uint8:\n        img = img.round()\n    else:\n        img /= 255.\n    return img.astype(dst_type)\n\n\ndef rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:\n    \"\"\"Convert a RGB image to YCbCr image.\n\n    This function produces the same results as Matlab's `rgb2ycbcr` function.\n    It implements the ITU-R BT.601 conversion for standard-definition\n    television. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.\n\n    It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.\n    In OpenCV, it implements a JPEG conversion. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.\n\n    Args:\n        img (ndarray): The input image. It accepts:\n            1. np.uint8 type with range [0, 255];\n            2. np.float32 type with range [0, 1].\n        y_only (bool): Whether to only return Y channel. Default: False.\n\n    Returns:\n        ndarray: The converted YCbCr image. 
The output image has the same type\n        and range as input image.\n    \"\"\"\n    img_type = img.dtype\n    img = _convert_input_type_range(img)\n    if y_only:\n        out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0\n    else:\n        out_img = np.matmul(\n            img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],\n                  [24.966, 112.0, -18.214]]) + [16, 128, 128]\n    out_img = _convert_output_type_range(out_img, img_type)\n    return out_img\n\n\ndef bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:\n    \"\"\"Convert a BGR image to YCbCr image.\n\n    The bgr version of rgb2ycbcr.\n    It implements the ITU-R BT.601 conversion for standard-definition\n    television. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.\n\n    It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.\n    In OpenCV, it implements a JPEG conversion. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.\n\n    Args:\n        img (ndarray): The input image. It accepts:\n            1. np.uint8 type with range [0, 255];\n            2. np.float32 type with range [0, 1].\n        y_only (bool): Whether to only return Y channel. Default: False.\n\n    Returns:\n        ndarray: The converted YCbCr image. 
The output image has the same type\n        and range as input image.\n    \"\"\"\n    img_type = img.dtype\n    img = _convert_input_type_range(img)\n    if y_only:\n        out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0\n    else:\n        out_img = np.matmul(\n            img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],\n                  [65.481, -37.797, 112.0]]) + [16, 128, 128]\n    out_img = _convert_output_type_range(out_img, img_type)\n    return out_img\n\n\ndef ycbcr2rgb(img: np.ndarray) -> np.ndarray:\n    \"\"\"Convert a YCbCr image to RGB image.\n\n    This function produces the same results as Matlab's ycbcr2rgb function.\n    It implements the ITU-R BT.601 conversion for standard-definition\n    television. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.\n\n    It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`.\n    In OpenCV, it implements a JPEG conversion. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.\n\n    Args:\n        img (ndarray): The input image. It accepts:\n            1. np.uint8 type with range [0, 255];\n            2. np.float32 type with range [0, 1].\n\n    Returns:\n        ndarray: The converted RGB image. 
The output image has the same type\n        and range as input image.\n    \"\"\"\n    img_type = img.dtype\n    img = _convert_input_type_range(img) * 255\n    out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],\n                              [0, -0.00153632, 0.00791071],\n                              [0.00625893, -0.00318811, 0]]) * 255.0 + [\n                                  -222.921, 135.576, -276.836\n                              ]\n    out_img = _convert_output_type_range(out_img, img_type)\n    return out_img\n\n\ndef ycbcr2bgr(img: np.ndarray) -> np.ndarray:\n    \"\"\"Convert a YCbCr image to BGR image.\n\n    The bgr version of ycbcr2rgb.\n    It implements the ITU-R BT.601 conversion for standard-definition\n    television. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.\n\n    It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.\n    In OpenCV, it implements a JPEG conversion. See more details in\n    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.\n\n    Args:\n        img (ndarray): The input image. It accepts:\n            1. np.uint8 type with range [0, 255];\n            2. np.float32 type with range [0, 1].\n\n    Returns:\n        ndarray: The converted BGR image. 
The output image has the same type\n        and range as input image.\n    \"\"\"\n    img_type = img.dtype\n    img = _convert_input_type_range(img) * 255\n    out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],\n                              [0.00791071, -0.00153632, 0],\n                              [0, -0.00318811, 0.00625893]]) * 255.0 + [\n                                  -276.836, 135.576, -222.921\n                              ]\n    out_img = _convert_output_type_range(out_img, img_type)\n    return out_img\n\n\ndef convert_color_factory(src: str, dst: str) -> Callable:\n\n    code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')\n\n    def convert_color(img: np.ndarray) -> np.ndarray:\n        out_img = cv2.cvtColor(img, code)\n        return out_img\n\n    convert_color.__doc__ = f\"\"\"Convert a {src.upper()} image to {dst.upper()}\n        image.\n\n    Args:\n        img (ndarray or str): The input image.\n\n    Returns:\n        ndarray: The converted {dst.upper()} image.\n    \"\"\"\n\n    return convert_color\n\n\nbgr2rgb = convert_color_factory('bgr', 'rgb')\n\nrgb2bgr = convert_color_factory('rgb', 'bgr')\n\nbgr2hsv = convert_color_factory('bgr', 'hsv')\n\nhsv2bgr = convert_color_factory('hsv', 'bgr')\n\nbgr2hls = convert_color_factory('bgr', 'hls')\n\nhls2bgr = convert_color_factory('hls', 'bgr')\n"
  },
  {
    "path": "mmcv/image/geometric.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numbers\nfrom typing import List, Optional, Tuple, Union, no_type_check\n\nimport cv2\nimport numpy as np\nfrom mmengine.utils import to_2tuple\n\nfrom .io import imread_backend\n\ntry:\n    from PIL import Image\nexcept ImportError:\n    Image = None\n\n\ndef _scale_size(\n    size: Tuple[int, int],\n    scale: Union[float, int, Tuple[float, float], Tuple[int, int]],\n) -> Tuple[int, int]:\n    \"\"\"Rescale a size by a ratio.\n\n    Args:\n        size (tuple[int]): (w, h).\n        scale (float | int | tuple(float) | tuple(int)): Scaling factor.\n\n    Returns:\n        tuple[int]: scaled size.\n    \"\"\"\n    if isinstance(scale, (float, int)):\n        scale = (scale, scale)\n    w, h = size\n    return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)\n\n\ncv2_interp_codes = {\n    'nearest': cv2.INTER_NEAREST,\n    'bilinear': cv2.INTER_LINEAR,\n    'bicubic': cv2.INTER_CUBIC,\n    'area': cv2.INTER_AREA,\n    'lanczos': cv2.INTER_LANCZOS4\n}\n\ncv2_border_modes = {\n    'constant': cv2.BORDER_CONSTANT,\n    'replicate': cv2.BORDER_REPLICATE,\n    'reflect': cv2.BORDER_REFLECT,\n    'wrap': cv2.BORDER_WRAP,\n    'reflect_101': cv2.BORDER_REFLECT_101,\n    'transparent': cv2.BORDER_TRANSPARENT,\n    'isolated': cv2.BORDER_ISOLATED\n}\n\n# Pillow >=v9.1.0 use a slightly different naming scheme for filters.\n# Set pillow_interp_codes according to the naming scheme used.\nif Image is not None:\n    if hasattr(Image, 'Resampling'):\n        pillow_interp_codes = {\n            'nearest': Image.Resampling.NEAREST,\n            'bilinear': Image.Resampling.BILINEAR,\n            'bicubic': Image.Resampling.BICUBIC,\n            'box': Image.Resampling.BOX,\n            'lanczos': Image.Resampling.LANCZOS,\n            'hamming': Image.Resampling.HAMMING\n        }\n    else:\n        pillow_interp_codes = {\n            'nearest': Image.NEAREST,\n            'bilinear': 
Image.BILINEAR,\n            'bicubic': Image.BICUBIC,\n            'box': Image.BOX,\n            'lanczos': Image.LANCZOS,\n            'hamming': Image.HAMMING\n        }\n\n\ndef imresize(\n    img: np.ndarray,\n    size: Tuple[int, int],\n    return_scale: bool = False,\n    interpolation: str = 'bilinear',\n    out: Optional[np.ndarray] = None,\n    backend: Optional[str] = None\n) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:\n    \"\"\"Resize image to a given size.\n\n    Args:\n        img (ndarray): The input image.\n        size (tuple[int]): Target size (w, h).\n        return_scale (bool): Whether to return `w_scale` and `h_scale`.\n        interpolation (str): Interpolation method, accepted values are\n            \"nearest\", \"bilinear\", \"bicubic\", \"area\", \"lanczos\" for 'cv2'\n            backend, \"nearest\", \"bilinear\" for 'pillow' backend.\n        out (ndarray): The output destination.\n        backend (str | None): The image resize backend type. Options are `cv2`,\n            `pillow`, `None`. If backend is None, the global imread_backend\n            specified by ``mmcv.use_backend()`` will be used. 
Default: None.\n\n    Returns:\n        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or\n        `resized_img`.\n    \"\"\"\n    h, w = img.shape[:2]\n    if backend is None:\n        backend = imread_backend\n    if backend not in ['cv2', 'pillow']:\n        raise ValueError(f'backend: {backend} is not supported for resize.'\n                         f\"Supported backends are 'cv2', 'pillow'\")\n\n    if backend == 'pillow':\n        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n        pil_image = Image.fromarray(img)\n        pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])\n        resized_img = np.array(pil_image)\n    else:\n        resized_img = cv2.resize(\n            img, size, dst=out, interpolation=cv2_interp_codes[interpolation])\n    if not return_scale:\n        return resized_img\n    else:\n        w_scale = size[0] / w\n        h_scale = size[1] / h\n        return resized_img, w_scale, h_scale\n\n\n@no_type_check\ndef imresize_to_multiple(\n    img: np.ndarray,\n    divisor: Union[int, Tuple[int, int]],\n    size: Union[int, Tuple[int, int], None] = None,\n    scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int],\n                        None] = None,\n    keep_ratio: bool = False,\n    return_scale: bool = False,\n    interpolation: str = 'bilinear',\n    out: Optional[np.ndarray] = None,\n    backend: Optional[str] = None\n) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:\n    \"\"\"Resize image according to a given size or scale factor and then rounds\n    up the the resized or rescaled image size to the nearest value that can be\n    divided by the divisor.\n\n    Args:\n        img (ndarray): The input image.\n        divisor (int | tuple): Resized image size will be a multiple of\n            divisor. If divisor is a tuple, divisor should be\n            (w_divisor, h_divisor).\n        size (None | int | tuple[int]): Target size (w, h). 
Default: None.\n        scale_factor (None | float | int | tuple[float] | tuple[int]):\n            Multiplier for spatial size. Should match input size if it is a\n            tuple and the 2D style is (w_scale_factor, h_scale_factor).\n            Default: None.\n        keep_ratio (bool): Whether to keep the aspect ratio when resizing the\n            image. Default: False.\n        return_scale (bool): Whether to return `w_scale` and `h_scale`.\n        interpolation (str): Interpolation method, accepted values are\n            \"nearest\", \"bilinear\", \"bicubic\", \"area\", \"lanczos\" for 'cv2'\n            backend, \"nearest\", \"bilinear\" for 'pillow' backend.\n        out (ndarray): The output destination.\n        backend (str | None): The image resize backend type. Options are `cv2`,\n            `pillow`, `None`. If backend is None, the global imread_backend\n            specified by ``mmcv.use_backend()`` will be used. Default: None.\n\n    Returns:\n        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or\n        `resized_img`.\n    \"\"\"\n    h, w = img.shape[:2]\n    if size is not None and scale_factor is not None:\n        raise ValueError('only one of size or scale_factor should be defined')\n    elif size is None and scale_factor is None:\n        raise ValueError('one of size or scale_factor should be defined')\n    elif size is not None:\n        size = to_2tuple(size)\n        if keep_ratio:\n            size = rescale_size((w, h), size, return_scale=False)\n    else:\n        size = _scale_size((w, h), scale_factor)\n\n    divisor = to_2tuple(divisor)\n    size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor))\n    resized_img, w_scale, h_scale = imresize(\n        img,\n        size,\n        return_scale=True,\n        interpolation=interpolation,\n        out=out,\n        backend=backend)\n    if return_scale:\n        return resized_img, w_scale, h_scale\n    else:\n        return resized_img\n\n\ndef 
imresize_like(\n    img: np.ndarray,\n    dst_img: np.ndarray,\n    return_scale: bool = False,\n    interpolation: str = 'bilinear',\n    backend: Optional[str] = None\n) -> Union[Tuple[np.ndarray, float, float], np.ndarray]:\n    \"\"\"Resize image to the same size of a given image.\n\n    Args:\n        img (ndarray): The input image.\n        dst_img (ndarray): The target image.\n        return_scale (bool): Whether to return `w_scale` and `h_scale`.\n        interpolation (str): Same as :func:`resize`.\n        backend (str | None): Same as :func:`resize`.\n\n    Returns:\n        tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or\n        `resized_img`.\n    \"\"\"\n    h, w = dst_img.shape[:2]\n    return imresize(img, (w, h), return_scale, interpolation, backend=backend)\n\n\ndef rescale_size(old_size: tuple,\n                 scale: Union[float, int, Tuple[int, int]],\n                 return_scale: bool = False) -> tuple:\n    \"\"\"Calculate the new size to be rescaled to.\n\n    Args:\n        old_size (tuple[int]): The old size (w, h) of image.\n        scale (float | int | tuple[int]): The scaling factor or maximum size.\n            If it is a float number or an integer, then the image will be\n            rescaled by this factor, else if it is a tuple of 2 integers, then\n            the image will be rescaled as large as possible within the scale.\n        return_scale (bool): Whether to return the scaling factor besides the\n            rescaled image size.\n\n    Returns:\n        tuple[int]: The new rescaled image size.\n    \"\"\"\n    w, h = old_size\n    if isinstance(scale, (float, int)):\n        if scale <= 0:\n            raise ValueError(f'Invalid scale {scale}, must be positive.')\n        scale_factor = scale\n    elif isinstance(scale, tuple):\n        max_long_edge = max(scale)\n        max_short_edge = min(scale)\n        scale_factor = min(max_long_edge / max(h, w),\n                           max_short_edge / min(h, w))\n  
  else:\n        raise TypeError(\n            f'Scale must be a number or tuple of int, but got {type(scale)}')\n\n    new_size = _scale_size((w, h), scale_factor)\n\n    if return_scale:\n        return new_size, scale_factor\n    else:\n        return new_size\n\n\ndef imrescale(\n    img: np.ndarray,\n    scale: Union[float, int, Tuple[int, int]],\n    return_scale: bool = False,\n    interpolation: str = 'bilinear',\n    backend: Optional[str] = None\n) -> Union[np.ndarray, Tuple[np.ndarray, float]]:\n    \"\"\"Resize image while keeping the aspect ratio.\n\n    Args:\n        img (ndarray): The input image.\n        scale (float | int | tuple[int]): The scaling factor or maximum size.\n            If it is a float number or an integer, then the image will be\n            rescaled by this factor, else if it is a tuple of 2 integers, then\n            the image will be rescaled as large as possible within the scale.\n        return_scale (bool): Whether to return the scaling factor besides the\n            rescaled image.\n        interpolation (str): Same as :func:`resize`.\n        backend (str | None): Same as :func:`resize`.\n\n    Returns:\n        ndarray: The rescaled image.\n    \"\"\"\n    h, w = img.shape[:2]\n    new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)\n    rescaled_img = imresize(\n        img, new_size, interpolation=interpolation, backend=backend)\n    if return_scale:\n        return rescaled_img, scale_factor\n    else:\n        return rescaled_img\n\n\ndef imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray:\n    \"\"\"Flip an image horizontally or vertically.\n\n    Args:\n        img (ndarray): Image to be flipped.\n        direction (str): The flip direction, either \"horizontal\" or\n            \"vertical\" or \"diagonal\".\n\n    Returns:\n        ndarray: The flipped image.\n    \"\"\"\n    assert direction in ['horizontal', 'vertical', 'diagonal']\n    if direction == 'horizontal':\n      
  return np.flip(img, axis=1)\n    elif direction == 'vertical':\n        return np.flip(img, axis=0)\n    else:\n        return np.flip(img, axis=(0, 1))\n\n\ndef imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray:\n    \"\"\"Inplace flip an image horizontally or vertically.\n\n    Args:\n        img (ndarray): Image to be flipped.\n        direction (str): The flip direction, either \"horizontal\" or\n            \"vertical\" or \"diagonal\".\n\n    Returns:\n        ndarray: The flipped image (inplace).\n    \"\"\"\n    assert direction in ['horizontal', 'vertical', 'diagonal']\n    if direction == 'horizontal':\n        return cv2.flip(img, 1, img)\n    elif direction == 'vertical':\n        return cv2.flip(img, 0, img)\n    else:\n        return cv2.flip(img, -1, img)\n\n\ndef imrotate(img: np.ndarray,\n             angle: float,\n             center: Optional[Tuple[float, float]] = None,\n             scale: float = 1.0,\n             border_value: int = 0,\n             interpolation: str = 'bilinear',\n             auto_bound: bool = False,\n             border_mode: str = 'constant') -> np.ndarray:\n    \"\"\"Rotate an image.\n\n    Args:\n        img (np.ndarray): Image to be rotated.\n        angle (float): Rotation angle in degrees, positive values mean\n            clockwise rotation.\n        center (tuple[float], optional): Center point (w, h) of the rotation in\n            the source image. If not specified, the center of the image will be\n            used.\n        scale (float): Isotropic scale factor.\n        border_value (int): Border value used in case of a constant border.\n            Defaults to 0.\n        interpolation (str): Same as :func:`resize`.\n        auto_bound (bool): Whether to adjust the image size to cover the whole\n            rotated image.\n        border_mode (str): Pixel extrapolation method. 
Defaults to 'constant'.\n\n    Returns:\n        np.ndarray: The rotated image.\n    \"\"\"\n    if center is not None and auto_bound:\n        raise ValueError('`auto_bound` conflicts with `center`')\n    h, w = img.shape[:2]\n    if center is None:\n        center = ((w - 1) * 0.5, (h - 1) * 0.5)\n    assert isinstance(center, tuple)\n\n    matrix = cv2.getRotationMatrix2D(center, -angle, scale)\n    if auto_bound:\n        cos = np.abs(matrix[0, 0])\n        sin = np.abs(matrix[0, 1])\n        new_w = h * sin + w * cos\n        new_h = h * cos + w * sin\n        matrix[0, 2] += (new_w - w) * 0.5\n        matrix[1, 2] += (new_h - h) * 0.5\n        w = int(np.round(new_w))\n        h = int(np.round(new_h))\n    rotated = cv2.warpAffine(\n        img,\n        matrix, (w, h),\n        flags=cv2_interp_codes[interpolation],\n        borderMode=cv2_border_modes[border_mode],\n        borderValue=border_value)\n    return rotated\n\n\ndef bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray:\n    \"\"\"Clip bboxes to fit the image shape.\n\n    Args:\n        bboxes (ndarray): Shape (..., 4*k)\n        img_shape (tuple[int]): (height, width) of the image.\n\n    Returns:\n        ndarray: Clipped bboxes.\n    \"\"\"\n    assert bboxes.shape[-1] % 4 == 0\n    cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)\n    cmin[0::2] = img_shape[1] - 1\n    cmin[1::2] = img_shape[0] - 1\n    clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)\n    return clipped_bboxes\n\n\ndef bbox_scaling(bboxes: np.ndarray,\n                 scale: float,\n                 clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray:\n    \"\"\"Scaling bboxes w.r.t the box center.\n\n    Args:\n        bboxes (ndarray): Shape(..., 4).\n        scale (float): Scaling factor.\n        clip_shape (tuple[int], optional): If specified, bboxes that exceed the\n            boundary will be clipped according to the given shape (h, w).\n\n    Returns:\n        ndarray: 
Scaled bboxes.\n    \"\"\"\n    if float(scale) == 1.0:\n        scaled_bboxes = bboxes.copy()\n    else:\n        w = bboxes[..., 2] - bboxes[..., 0] + 1\n        h = bboxes[..., 3] - bboxes[..., 1] + 1\n        dw = (w * (scale - 1)) * 0.5\n        dh = (h * (scale - 1)) * 0.5\n        scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)\n    if clip_shape is not None:\n        return bbox_clip(scaled_bboxes, clip_shape)\n    else:\n        return scaled_bboxes\n\n\ndef imcrop(\n    img: np.ndarray,\n    bboxes: np.ndarray,\n    scale: float = 1.0,\n    pad_fill: Union[float, list, None] = None\n) -> Union[np.ndarray, List[np.ndarray]]:\n    \"\"\"Crop image patches.\n\n    3 steps: scale the bboxes -> clip bboxes -> crop and pad.\n\n    Args:\n        img (ndarray): Image to be cropped.\n        bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.\n        scale (float, optional): Scale ratio of bboxes, the default value\n            1.0 means no scaling.\n        pad_fill (Number | list[Number]): Value to be filled for padding.\n            Default: None, which means no padding.\n\n    Returns:\n        list[ndarray] | ndarray: The cropped image patches.\n    \"\"\"\n    chn = 1 if img.ndim == 2 else img.shape[2]\n    if pad_fill is not None:\n        if isinstance(pad_fill, (int, float)):\n            pad_fill = [pad_fill for _ in range(chn)]\n        assert len(pad_fill) == chn\n\n    _bboxes = bboxes[None, ...] 
if bboxes.ndim == 1 else bboxes\n    scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)\n    clipped_bbox = bbox_clip(scaled_bboxes, img.shape)\n\n    patches = []\n    for i in range(clipped_bbox.shape[0]):\n        x1, y1, x2, y2 = tuple(clipped_bbox[i, :])\n        if pad_fill is None:\n            patch = img[y1:y2 + 1, x1:x2 + 1, ...]\n        else:\n            _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])\n            patch_h = _y2 - _y1 + 1\n            patch_w = _x2 - _x1 + 1\n            if chn == 1:\n                patch_shape = (patch_h, patch_w)\n            else:\n                patch_shape = (patch_h, patch_w, chn)  # type: ignore\n            patch = np.array(\n                pad_fill, dtype=img.dtype) * np.ones(\n                    patch_shape, dtype=img.dtype)\n            x_start = 0 if _x1 >= 0 else -_x1\n            y_start = 0 if _y1 >= 0 else -_y1\n            w = x2 - x1 + 1\n            h = y2 - y1 + 1\n            patch[y_start:y_start + h, x_start:x_start + w,\n                  ...] = img[y1:y1 + h, x1:x1 + w, ...]\n        patches.append(patch)\n\n    if bboxes.ndim == 1:\n        return patches[0]\n    else:\n        return patches\n\n\ndef impad(img: np.ndarray,\n          *,\n          shape: Optional[Tuple[int, int]] = None,\n          padding: Union[int, tuple, None] = None,\n          pad_val: Union[float, List] = 0,\n          padding_mode: str = 'constant') -> np.ndarray:\n    \"\"\"Pad the given image to a certain shape or pad on all sides with\n    specified padding mode and padding value.\n\n    Args:\n        img (ndarray): Image to be padded.\n        shape (tuple[int]): Expected padding shape (h, w). Default: None.\n        padding (int or tuple[int]): Padding on each border. If a single int is\n            provided this is used to pad all borders. If tuple of length 2 is\n            provided this is the padding on left/right and top/bottom\n            respectively. 
If a tuple of length 4 is provided this is the\n            padding for the left, top, right and bottom borders respectively.\n            Default: None. Note that `shape` and `padding` can not be both\n            set.\n        pad_val (Number | Sequence[Number]): Values to be filled in padding\n            areas when padding_mode is 'constant'. Default: 0.\n        padding_mode (str): Type of padding. Should be: constant, edge,\n            reflect or symmetric. Default: constant.\n\n            - constant: pads with a constant value, this value is specified\n              with pad_val.\n            - edge: pads with the last value at the edge of the image.\n            - reflect: pads with reflection of image without repeating the last\n              value on the edge. For example, padding [1, 2, 3, 4] with 2\n              elements on both sides in reflect mode will result in\n              [3, 2, 1, 2, 3, 4, 3, 2].\n            - symmetric: pads with reflection of image repeating the last value\n              on the edge. For example, padding [1, 2, 3, 4] with 2 elements on\n              both sides in symmetric mode will result in\n              [2, 1, 1, 2, 3, 4, 4, 3]\n\n    Returns:\n        ndarray: The padded image.\n    \"\"\"\n\n    assert (shape is not None) ^ (padding is not None)\n    if shape is not None:\n        width = max(shape[1] - img.shape[1], 0)\n        height = max(shape[0] - img.shape[0], 0)\n        padding = (0, 0, width, height)\n\n    # check pad_val\n    if isinstance(pad_val, tuple):\n        assert len(pad_val) == img.shape[-1]\n    elif not isinstance(pad_val, numbers.Number):\n        raise TypeError('pad_val must be a int or a tuple. 
'\n                        f'But received {type(pad_val)}')\n\n    # check padding\n    if isinstance(padding, tuple) and len(padding) in [2, 4]:\n        if len(padding) == 2:\n            padding = (padding[0], padding[1], padding[0], padding[1])\n    elif isinstance(padding, numbers.Number):\n        padding = (padding, padding, padding, padding)\n    else:\n        raise ValueError('Padding must be a int or a 2, or 4 element tuple.'\n                         f'But received {padding}')\n\n    # check padding mode\n    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']\n\n    border_type = {\n        'constant': cv2.BORDER_CONSTANT,\n        'edge': cv2.BORDER_REPLICATE,\n        'reflect': cv2.BORDER_REFLECT_101,\n        'symmetric': cv2.BORDER_REFLECT\n    }\n    img = cv2.copyMakeBorder(\n        img,\n        padding[1],\n        padding[3],\n        padding[0],\n        padding[2],\n        border_type[padding_mode],\n        value=pad_val)\n\n    return img\n\n\ndef impad_to_multiple(img: np.ndarray,\n                      divisor: int,\n                      pad_val: Union[float, List] = 0) -> np.ndarray:\n    \"\"\"Pad an image to ensure each edge to be multiple to some number.\n\n    Args:\n        img (ndarray): Image to be padded.\n        divisor (int): Padded image edges will be multiple to divisor.\n        pad_val (Number | Sequence[Number]): Same as :func:`impad`.\n\n    Returns:\n        ndarray: The padded image.\n    \"\"\"\n    pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor\n    pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor\n    return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)\n\n\ndef cutout(img: np.ndarray,\n           shape: Union[int, Tuple[int, int]],\n           pad_val: Union[int, float, tuple] = 0) -> np.ndarray:\n    \"\"\"Randomly cut out a rectangle from the original img.\n\n    Args:\n        img (ndarray): Image to be cutout.\n        shape (int | tuple[int]): Expected cutout shape (h, 
w). If given as a\n            int, the value will be used for both h and w.\n        pad_val (int | float | tuple[int | float]): Values to be filled in the\n            cut area. Defaults to 0.\n\n    Returns:\n        ndarray: The cutout image.\n    \"\"\"\n\n    channels = 1 if img.ndim == 2 else img.shape[2]\n    if isinstance(shape, int):\n        cut_h, cut_w = shape, shape\n    else:\n        assert isinstance(shape, tuple) and len(shape) == 2, \\\n            f'shape must be a int or a tuple with length 2, but got type ' \\\n            f'{type(shape)} instead.'\n        cut_h, cut_w = shape\n    if isinstance(pad_val, (int, float)):\n        pad_val = tuple([pad_val] * channels)\n    elif isinstance(pad_val, tuple):\n        assert len(pad_val) == channels, \\\n            'Expected the num of elements in tuple equals the channels' \\\n            'of input image. Found {} vs {}'.format(\n                len(pad_val), channels)\n    else:\n        raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')\n\n    img_h, img_w = img.shape[:2]\n    y0 = np.random.uniform(img_h)\n    x0 = np.random.uniform(img_w)\n\n    y1 = int(max(0, y0 - cut_h / 2.))\n    x1 = int(max(0, x0 - cut_w / 2.))\n    y2 = min(img_h, y1 + cut_h)\n    x2 = min(img_w, x1 + cut_w)\n\n    if img.ndim == 2:\n        patch_shape = (y2 - y1, x2 - x1)\n    else:\n        patch_shape = (y2 - y1, x2 - x1, channels)  # type: ignore\n\n    img_cutout = img.copy()\n    patch = np.array(\n        pad_val, dtype=img.dtype) * np.ones(\n            patch_shape, dtype=img.dtype)\n    img_cutout[y1:y2, x1:x2, ...] 
= patch\n\n    return img_cutout\n\n\ndef _get_shear_matrix(magnitude: Union[int, float],\n                      direction: str = 'horizontal') -> np.ndarray:\n    \"\"\"Generate the shear matrix for transformation.\n\n    Args:\n        magnitude (int | float): The magnitude used for shear.\n        direction (str): The flip direction, either \"horizontal\"\n            or \"vertical\".\n\n    Returns:\n        ndarray: The shear matrix with dtype float32.\n    \"\"\"\n    if direction == 'horizontal':\n        shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])\n    elif direction == 'vertical':\n        shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])\n    return shear_matrix\n\n\ndef imshear(img: np.ndarray,\n            magnitude: Union[int, float],\n            direction: str = 'horizontal',\n            border_value: Union[int, Tuple[int, int]] = 0,\n            interpolation: str = 'bilinear') -> np.ndarray:\n    \"\"\"Shear an image.\n\n    Args:\n        img (ndarray): Image to be sheared with format (h, w)\n            or (h, w, c).\n        magnitude (int | float): The magnitude used for shear.\n        direction (str): The flip direction, either \"horizontal\"\n            or \"vertical\".\n        border_value (int | tuple[int]): Value used in case of a\n            constant border.\n        interpolation (str): Same as :func:`resize`.\n\n    Returns:\n        ndarray: The sheared image.\n    \"\"\"\n    assert direction in ['horizontal',\n                         'vertical'], f'Invalid direction: {direction}'\n    height, width = img.shape[:2]\n    if img.ndim == 2:\n        channels = 1\n    elif img.ndim == 3:\n        channels = img.shape[-1]\n    if isinstance(border_value, int):\n        border_value = tuple([border_value] * channels)  # type: ignore\n    elif isinstance(border_value, tuple):\n        assert len(border_value) == channels, \\\n            'Expected the num of elements in tuple equals the channels' \\\n            
'of input image. Found {} vs {}'.format(\n                len(border_value), channels)\n    else:\n        raise ValueError(\n            f'Invalid type {type(border_value)} for `border_value`')\n    shear_matrix = _get_shear_matrix(magnitude, direction)\n    sheared = cv2.warpAffine(\n        img,\n        shear_matrix,\n        (width, height),\n        # Note case when the number elements in `border_value`\n        # greater than 3 (e.g. shearing masks whose channels large\n        # than 3) will raise TypeError in `cv2.warpAffine`.\n        # Here simply slice the first 3 values in `border_value`.\n        borderValue=border_value[:3],  # type: ignore\n        flags=cv2_interp_codes[interpolation])\n    return sheared\n\n\ndef _get_translate_matrix(offset: Union[int, float],\n                          direction: str = 'horizontal') -> np.ndarray:\n    \"\"\"Generate the translate matrix.\n\n    Args:\n        offset (int | float): The offset used for translate.\n        direction (str): The translate direction, either\n            \"horizontal\" or \"vertical\".\n\n    Returns:\n        ndarray: The translate matrix with dtype float32.\n    \"\"\"\n    if direction == 'horizontal':\n        translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])\n    elif direction == 'vertical':\n        translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])\n    return translate_matrix\n\n\ndef imtranslate(img: np.ndarray,\n                offset: Union[int, float],\n                direction: str = 'horizontal',\n                border_value: Union[int, tuple] = 0,\n                interpolation: str = 'bilinear') -> np.ndarray:\n    \"\"\"Translate an image.\n\n    Args:\n        img (ndarray): Image to be translated with format\n            (h, w) or (h, w, c).\n        offset (int | float): The offset used for translate.\n        direction (str): The translate direction, either \"horizontal\"\n            or \"vertical\".\n        border_value (int | tuple[int]): 
Value used in case of a\n            constant border.\n        interpolation (str): Same as :func:`resize`.\n\n    Returns:\n        ndarray: The translated image.\n    \"\"\"\n    assert direction in ['horizontal',\n                         'vertical'], f'Invalid direction: {direction}'\n    height, width = img.shape[:2]\n    if img.ndim == 2:\n        channels = 1\n    elif img.ndim == 3:\n        channels = img.shape[-1]\n    if isinstance(border_value, int):\n        border_value = tuple([border_value] * channels)\n    elif isinstance(border_value, tuple):\n        assert len(border_value) == channels, \\\n            'Expected the num of elements in tuple equals the channels' \\\n            'of input image. Found {} vs {}'.format(\n                len(border_value), channels)\n    else:\n        raise ValueError(\n            f'Invalid type {type(border_value)} for `border_value`.')\n    translate_matrix = _get_translate_matrix(offset, direction)\n    translated = cv2.warpAffine(\n        img,\n        translate_matrix,\n        (width, height),\n        # Note case when the number elements in `border_value`\n        # greater than 3 (e.g. translating masks whose channels\n        # large than 3) will raise TypeError in `cv2.warpAffine`.\n        # Here simply slice the first 3 values in `border_value`.\n        borderValue=border_value[:3],\n        flags=cv2_interp_codes[interpolation])\n    return translated\n"
  },
  {
    "path": "mmcv/image/io.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport io\nimport os.path as osp\nimport warnings\nfrom pathlib import Path\nfrom typing import Optional, Union\n\nimport cv2\nimport mmengine.fileio as fileio\nimport numpy as np\nfrom cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,\n                 IMREAD_UNCHANGED)\nfrom mmengine.utils import is_filepath, is_str\n\ntry:\n    from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG\nexcept ImportError:\n    TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None\n\ntry:\n    from PIL import Image, ImageOps\nexcept ImportError:\n    Image = None\n\ntry:\n    import tifffile\nexcept ImportError:\n    tifffile = None\n\njpeg = None\nsupported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']\n\nimread_flags = {\n    'color': IMREAD_COLOR,\n    'grayscale': IMREAD_GRAYSCALE,\n    'unchanged': IMREAD_UNCHANGED,\n    'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,\n    'grayscale_ignore_orientation':\n    IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE\n}\n\nimread_backend = 'cv2'\n\n\ndef use_backend(backend: str) -> None:\n    \"\"\"Select a backend for image decoding.\n\n    Args:\n        backend (str): The image decoding backend type. Options are `cv2`,\n        `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)\n        and `tifffile`. 
`turbojpeg` is faster but it only supports `.jpeg`\n        file format.\n    \"\"\"\n    assert backend in supported_backends\n    global imread_backend\n    imread_backend = backend\n    if imread_backend == 'turbojpeg':\n        if TurboJPEG is None:\n            raise ImportError('`PyTurboJPEG` is not installed')\n        global jpeg\n        if jpeg is None:\n            jpeg = TurboJPEG()\n    elif imread_backend == 'pillow':\n        if Image is None:\n            raise ImportError('`Pillow` is not installed')\n    elif imread_backend == 'tifffile':\n        if tifffile is None:\n            raise ImportError('`tifffile` is not installed')\n\n\ndef _jpegflag(flag: str = 'color', channel_order: str = 'bgr'):\n    channel_order = channel_order.lower()\n    if channel_order not in ['rgb', 'bgr']:\n        raise ValueError('channel order must be either \"rgb\" or \"bgr\"')\n\n    if flag == 'color':\n        if channel_order == 'bgr':\n            return TJPF_BGR\n        elif channel_order == 'rgb':\n            return TJCS_RGB\n    elif flag == 'grayscale':\n        return TJPF_GRAY\n    else:\n        raise ValueError('flag must be \"color\" or \"grayscale\"')\n\n\ndef _pillow2array(img,\n                  flag: str = 'color',\n                  channel_order: str = 'bgr') -> np.ndarray:\n    \"\"\"Convert a pillow image to numpy array.\n\n    Args:\n        img (:obj:`PIL.Image.Image`): The image loaded using PIL\n        flag (str): Flags specifying the color type of a loaded image,\n            candidates are 'color', 'grayscale' and 'unchanged'.\n            Default to 'color'.\n        channel_order (str): The channel order of the output image array,\n            candidates are 'bgr' and 'rgb'. 
Default to 'bgr'.\n\n    Returns:\n        np.ndarray: The converted numpy array\n    \"\"\"\n    channel_order = channel_order.lower()\n    if channel_order not in ['rgb', 'bgr']:\n        raise ValueError('channel order must be either \"rgb\" or \"bgr\"')\n\n    if flag == 'unchanged':\n        array = np.array(img)\n        if array.ndim >= 3 and array.shape[2] >= 3:  # color image\n            array[:, :, :3] = array[:, :, (2, 1, 0)]  # RGB to BGR\n    else:\n        # Handle exif orientation tag\n        if flag in ['color', 'grayscale']:\n            img = ImageOps.exif_transpose(img)\n        # If the image mode is not 'RGB', convert it to 'RGB' first.\n        if img.mode != 'RGB':\n            if img.mode != 'LA':\n                # Most formats except 'LA' can be directly converted to RGB\n                img = img.convert('RGB')\n            else:\n                # When the mode is 'LA', the default conversion will fill in\n                #  the canvas with black, which sometimes shadows black objects\n                #  in the foreground.\n                #\n                # Therefore, a random color (124, 117, 104) is used for canvas\n                img_rgba = img.convert('RGBA')\n                img = Image.new('RGB', img_rgba.size, (124, 117, 104))\n                img.paste(img_rgba, mask=img_rgba.split()[3])  # 3 is alpha\n        if flag in ['color', 'color_ignore_orientation']:\n            array = np.array(img)\n            if channel_order != 'rgb':\n                array = array[:, :, ::-1]  # RGB to BGR\n        elif flag in ['grayscale', 'grayscale_ignore_orientation']:\n            img = img.convert('L')\n            array = np.array(img)\n        else:\n            raise ValueError(\n                'flag must be \"color\", \"grayscale\", \"unchanged\", '\n                f'\"color_ignore_orientation\" or \"grayscale_ignore_orientation\"'\n                f' but got {flag}')\n    return array\n\n\ndef imread(img_or_path: 
Union[np.ndarray, str, Path],\n           flag: str = 'color',\n           channel_order: str = 'bgr',\n           backend: Optional[str] = None,\n           file_client_args: Optional[dict] = None,\n           *,\n           backend_args: Optional[dict] = None) -> np.ndarray:\n    \"\"\"Read an image.\n\n    Args:\n        img_or_path (ndarray or str or Path): Either a numpy array or str or\n            pathlib.Path. If it is a numpy array (loaded image), then\n            it will be returned as is.\n        flag (str): Flags specifying the color type of a loaded image,\n            candidates are `color`, `grayscale`, `unchanged`,\n            `color_ignore_orientation` and `grayscale_ignore_orientation`.\n            By default, `cv2` and `pillow` backend would rotate the image\n            according to its EXIF info unless called with `unchanged` or\n            `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend\n            always ignore image's EXIF info regardless of the flag.\n            The `turbojpeg` backend only supports `color` and `grayscale`.\n        channel_order (str): Order of channel, candidates are `bgr` and `rgb`.\n        backend (str | None): The image decoding backend type. Options are\n            `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.\n            If backend is None, the global imread_backend specified by\n            ``mmcv.use_backend()`` will be used. Default: None.\n        file_client_args (dict, optional): Arguments to instantiate a\n            FileClient. See :class:`mmengine.fileio.FileClient` for details.\n            Default: None. It will be deprecated in future. Please use\n            ``backend_args`` instead.\n            Deprecated in version 2.0.0rc4.\n        backend_args (dict, optional): Instantiates the corresponding file\n            backend. It may contain `backend` key to specify the file\n            backend. 
If it contains, the file backend corresponding to this\n            value will be used and initialized with the remaining values,\n            otherwise the corresponding file backend will be selected\n            based on the prefix of the file path. Defaults to None.\n            New in version 2.0.0rc4.\n\n    Returns:\n        ndarray: Loaded image array.\n\n    Examples:\n        >>> import mmcv\n        >>> img_path = '/path/to/img.jpg'\n        >>> img = mmcv.imread(img_path)\n        >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb',\n        ...     backend='cv2')\n        >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr',\n        ...     backend='pillow')\n        >>> s3_img_path = 's3://bucket/img.jpg'\n        >>> # infer the file backend by the prefix s3\n        >>> img = mmcv.imread(s3_img_path)\n        >>> # manually set the file backend petrel\n        >>> img = mmcv.imread(s3_img_path, backend_args={\n        ...     'backend': 'petrel'})\n        >>> http_img_path = 'http://path/to/img.jpg'\n        >>> img = mmcv.imread(http_img_path)\n        >>> img = mmcv.imread(http_img_path, backend_args={\n        ...     'backend': 'http'})\n    \"\"\"\n    if file_client_args is not None:\n        warnings.warn(\n            '\"file_client_args\" will be deprecated in future. 
'\n            'Please use \"backend_args\" instead', DeprecationWarning)\n        if backend_args is not None:\n            raise ValueError(\n                '\"file_client_args\" and \"backend_args\" cannot be set at the '\n                'same time.')\n\n    if isinstance(img_or_path, Path):\n        img_or_path = str(img_or_path)\n\n    if isinstance(img_or_path, np.ndarray):\n        return img_or_path\n    elif is_str(img_or_path):\n        if file_client_args is not None:\n            file_client = fileio.FileClient.infer_client(\n                file_client_args, img_or_path)\n            img_bytes = file_client.get(img_or_path)\n        else:\n            img_bytes = fileio.get(img_or_path, backend_args=backend_args)\n        return imfrombytes(img_bytes, flag, channel_order, backend)\n    else:\n        raise TypeError('\"img\" must be a numpy array or a str or '\n                        'a pathlib.Path object')\n\n\ndef imfrombytes(content: bytes,\n                flag: str = 'color',\n                channel_order: str = 'bgr',\n                backend: Optional[str] = None) -> np.ndarray:\n    \"\"\"Read an image from bytes.\n\n    Args:\n        content (bytes): Image bytes got from files or other streams.\n        flag (str): Same as :func:`imread`.\n        channel_order (str): The channel order of the output, candidates\n            are 'bgr' and 'rgb'. Default to 'bgr'.\n        backend (str | None): The image decoding backend type. Options are\n            `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is\n            None, the global imread_backend specified by ``mmcv.use_backend()``\n            will be used. 
Default: None.\n\n    Returns:\n        ndarray: Loaded image array.\n\n    Examples:\n        >>> img_path = '/path/to/img.jpg'\n        >>> with open(img_path, 'rb') as f:\n        >>>     img_buff = f.read()\n        >>> img = mmcv.imfrombytes(img_buff)\n        >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb')\n        >>> img = mmcv.imfrombytes(img_buff, backend='pillow')\n        >>> img = mmcv.imfrombytes(img_buff, backend='cv2')\n    \"\"\"\n\n    if backend is None:\n        backend = imread_backend\n    if backend not in supported_backends:\n        raise ValueError(\n            f'backend: {backend} is not supported. Supported '\n            \"backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'\")\n    if backend == 'turbojpeg':\n        img = jpeg.decode(  # type: ignore\n            content, _jpegflag(flag, channel_order))\n        if img.shape[-1] == 1:\n            img = img[:, :, 0]\n        return img\n    elif backend == 'pillow':\n        with io.BytesIO(content) as buff:\n            img = Image.open(buff)\n            img = _pillow2array(img, flag, channel_order)\n        return img\n    elif backend == 'tifffile':\n        with io.BytesIO(content) as buff:\n            img = tifffile.imread(buff)\n        return img\n    else:\n        img_np = np.frombuffer(content, np.uint8)\n        flag = imread_flags[flag] if is_str(flag) else flag\n        img = cv2.imdecode(img_np, flag)\n        if flag == IMREAD_COLOR and channel_order == 'rgb':\n            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)\n        return img\n\n\ndef imwrite(img: np.ndarray,\n            file_path: str,\n            params: Optional[list] = None,\n            auto_mkdir: Optional[bool] = None,\n            file_client_args: Optional[dict] = None,\n            *,\n            backend_args: Optional[dict] = None) -> bool:\n    \"\"\"Write image to file.\n\n    Warning:\n        The parameter `auto_mkdir` will be deprecated in the future and every\n     
   file clients will make directory automatically.\n\n    Args:\n        img (ndarray): Image array to be written.\n        file_path (str): Image file path.\n        params (None or list): Same as opencv :func:`imwrite` interface.\n        auto_mkdir (bool): If the parent folder of `file_path` does not exist,\n            whether to create it automatically. It will be deprecated.\n        file_client_args (dict, optional): Arguments to instantiate a\n            FileClient. See :class:`mmengine.fileio.FileClient` for details.\n            Default: None. It will be deprecated in future. Please use\n            ``backend_args`` instead.\n            Deprecated in version 2.0.0rc4.\n        backend_args (dict, optional): Instantiates the corresponding file\n            backend. It may contain `backend` key to specify the file\n            backend. If it contains, the file backend corresponding to this\n            value will be used and initialized with the remaining values,\n            otherwise the corresponding file backend will be selected\n            based on the prefix of the file path. Defaults to None.\n            New in version 2.0.0rc4.\n\n    Returns:\n        bool: Successful or not.\n\n    Examples:\n        >>> # write to hard disk client\n        >>> ret = mmcv.imwrite(img, '/path/to/img.jpg')\n        >>> # infer the file backend by the prefix s3\n        >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg')\n        >>> # manually set the file backend petrel\n        >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={\n        ...     'backend': 'petrel'})\n    \"\"\"\n    if file_client_args is not None:\n        warnings.warn(\n            '\"file_client_args\" will be deprecated in future. 
'\n            'Please use \"backend_args\" instead', DeprecationWarning)\n        if backend_args is not None:\n            raise ValueError(\n                '\"file_client_args\" and \"backend_args\" cannot be set at the '\n                'same time.')\n\n    assert is_filepath(file_path)\n    file_path = str(file_path)\n    if auto_mkdir is not None:\n        warnings.warn(\n            'The parameter `auto_mkdir` will be deprecated in the future and '\n            'every file clients will make directory automatically.')\n\n    img_ext = osp.splitext(file_path)[-1]\n    # Encode image according to image suffix.\n    # For example, if image path is '/path/your/img.jpg', the encode\n    # format is '.jpg'.\n    flag, img_buff = cv2.imencode(img_ext, img, params)\n\n    if file_client_args is not None:\n        file_client = fileio.FileClient.infer_client(file_client_args,\n                                                     file_path)\n        file_client.put(img_buff.tobytes(), file_path)\n    else:\n        fileio.put(img_buff.tobytes(), file_path, backend_args=backend_args)\n\n    return flag\n"
  },
  {
    "path": "mmcv/image/misc.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional\n\nimport numpy as np\n\nimport mmcv\n\ntry:\n    import torch\nexcept ImportError:\n    torch = None\n\n\ndef tensor2imgs(tensor,\n                mean: Optional[tuple] = None,\n                std: Optional[tuple] = None,\n                to_rgb: bool = True) -> list:\n    \"\"\"Convert tensor to 3-channel images or 1-channel gray images.\n\n    Args:\n        tensor (torch.Tensor): Tensor that contains multiple images, shape (\n            N, C, H, W). :math:`C` can be either 3 or 1.\n        mean (tuple[float], optional): Mean of images. If None,\n            (0, 0, 0) will be used for tensor with 3-channel,\n            while (0, ) for tensor with 1-channel. Defaults to None.\n        std (tuple[float], optional): Standard deviation of images. If None,\n            (1, 1, 1) will be used for tensor with 3-channel,\n            while (1, ) for tensor with 1-channel. Defaults to None.\n        to_rgb (bool, optional): Whether the tensor was converted to RGB\n            format in the first place. If so, convert it back to BGR.\n            For the tensor with 1 channel, it must be False. 
Defaults to True.\n\n    Returns:\n        list[np.ndarray]: A list that contains multiple images.\n    \"\"\"\n\n    if torch is None:\n        raise RuntimeError('pytorch is not installed')\n    assert torch.is_tensor(tensor) and tensor.ndim == 4\n    channels = tensor.size(1)\n    assert channels in [1, 3]\n    if mean is None:\n        mean = (0, ) * channels\n    if std is None:\n        std = (1, ) * channels\n    assert (channels == len(mean) == len(std) == 3) or \\\n        (channels == len(mean) == len(std) == 1 and not to_rgb)\n\n    num_imgs = tensor.size(0)\n    mean = np.array(mean, dtype=np.float32)\n    std = np.array(std, dtype=np.float32)\n    imgs = []\n    for img_id in range(num_imgs):\n        img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)\n        img = mmcv.imdenormalize(\n            img, mean, std, to_bgr=to_rgb).astype(np.uint8)\n        imgs.append(np.ascontiguousarray(img))\n    return imgs\n"
  },
  {
    "path": "mmcv/image/photometric.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nfrom typing import Optional\n\nimport cv2\nimport numpy as np\nfrom mmengine.utils import is_tuple_of\nfrom PIL import Image, ImageEnhance\n\nfrom .colorspace import bgr2gray, gray2bgr\nfrom .io import imread_backend\n\n\ndef imnormalize(img, mean, std, to_rgb=True):\n    \"\"\"Normalize an image with mean and std.\n\n    Args:\n        img (ndarray): Image to be normalized.\n        mean (ndarray): The mean to be used for normalize.\n        std (ndarray): The std to be used for normalize.\n        to_rgb (bool): Whether to convert to rgb.\n\n    Returns:\n        ndarray: The normalized image.\n    \"\"\"\n    img = img.copy().astype(np.float32)\n    return imnormalize_(img, mean, std, to_rgb)\n\n\ndef imnormalize_(img, mean, std, to_rgb=True):\n    \"\"\"Inplace normalize an image with mean and std.\n\n    Args:\n        img (ndarray): Image to be normalized.\n        mean (ndarray): The mean to be used for normalize.\n        std (ndarray): The std to be used for normalize.\n        to_rgb (bool): Whether to convert to rgb.\n\n    Returns:\n        ndarray: The normalized image.\n    \"\"\"\n    # cv2 inplace normalization does not accept uint8\n    assert img.dtype != np.uint8\n    mean = np.float64(mean.reshape(1, -1))\n    stdinv = 1 / np.float64(std.reshape(1, -1))\n    if to_rgb:\n        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace\n    cv2.subtract(img, mean, img)  # inplace\n    cv2.multiply(img, stdinv, img)  # inplace\n    return img\n\n\ndef imdenormalize(img, mean, std, to_bgr=True):\n    assert img.dtype != np.uint8\n    mean = mean.reshape(1, -1).astype(np.float64)\n    std = std.reshape(1, -1).astype(np.float64)\n    img = cv2.multiply(img, std)  # make a copy\n    cv2.add(img, mean, img)  # inplace\n    if to_bgr:\n        cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img)  # inplace\n    return img\n\n\ndef iminvert(img):\n    \"\"\"Invert (negate) an image.\n\n    
Args:\n        img (ndarray): Image to be inverted.\n\n    Returns:\n        ndarray: The inverted image.\n    \"\"\"\n    return np.full_like(img, 255) - img\n\n\ndef solarize(img, thr=128):\n    \"\"\"Solarize an image (invert all pixel values above a threshold)\n\n    Args:\n        img (ndarray): Image to be solarized.\n        thr (int): Threshold for solarizing (0 - 255).\n\n    Returns:\n        ndarray: The solarized image.\n    \"\"\"\n    img = np.where(img < thr, img, 255 - img)\n    return img\n\n\ndef posterize(img, bits):\n    \"\"\"Posterize an image (reduce the number of bits for each color channel)\n\n    Args:\n        img (ndarray): Image to be posterized.\n        bits (int): Number of bits (1 to 8) to use for posterizing.\n\n    Returns:\n        ndarray: The posterized image.\n    \"\"\"\n    shift = 8 - bits\n    img = np.left_shift(np.right_shift(img, shift), shift)\n    return img\n\n\ndef adjust_color(img, alpha=1, beta=None, gamma=0, backend=None):\n    r\"\"\"It blends the source image and its gray image:\n\n    .. math::\n        output = img * alpha + gray\\_img * beta + gamma\n\n    Args:\n        img (ndarray): The input source image.\n        alpha (int | float): Weight for the source image. Default 1.\n        beta (int | float): Weight for the converted gray image.\n            If None, it's assigned the value (1 - `alpha`).\n        gamma (int | float): Scalar added to each sum.\n            Same as :func:`cv2.addWeighted`. Default 0.\n        backend (str | None): The image processing backend type. Options are\n            `cv2`, `pillow`, `None`. If backend is None, the global\n            ``imread_backend`` specified by ``mmcv.use_backend()`` will be\n            used. 
Defaults to None.\n\n    Returns:\n        ndarray: Colored image which has the same size and dtype as input.\n    \"\"\"\n    if backend is None:\n        backend = imread_backend\n    if backend not in ['cv2', 'pillow']:\n        raise ValueError(f'backend: {backend} is not supported.'\n                         f\"Supported backends are 'cv2', 'pillow'\")\n\n    if backend == 'pillow':\n        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n        warnings.warn(\"Only use 'alpha' for pillow backend.\")\n        # Image.fromarray defaultly supports RGB, not BGR.\n        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')\n        enhancer = ImageEnhance.Color(pil_image)\n        pil_image = enhancer.enhance(alpha)\n        return np.array(pil_image, dtype=img.dtype)[..., ::-1]\n    else:\n        gray_img = bgr2gray(img)\n        gray_img = np.tile(gray_img[..., None], [1, 1, 3])\n        if beta is None:\n            beta = 1 - alpha\n        colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)\n        if not colored_img.dtype == np.uint8:\n            # Note when the dtype of `img` is not the default `np.uint8`\n            # (e.g. 
np.float32), the value in `colored_img` got from cv2\n            # is not guaranteed to be in range [0, 255], so here clip\n            # is needed.\n            colored_img = np.clip(colored_img, 0, 255)\n        return colored_img.astype(img.dtype)\n\n\ndef imequalize(img):\n    \"\"\"Equalize the image histogram.\n\n    This function applies a non-linear mapping to the input image,\n    in order to create a uniform distribution of grayscale values\n    in the output image.\n\n    Args:\n        img (ndarray): Image to be equalized.\n\n    Returns:\n        ndarray: The equalized image.\n    \"\"\"\n\n    def _scale_channel(im, c):\n        \"\"\"Scale the data in the corresponding channel.\"\"\"\n        im = im[:, :, c]\n        # Compute the histogram of the image channel.\n        histo = np.histogram(im, 256, (0, 255))[0]\n        # For computing the step, filter out the nonzeros.\n        nonzero_histo = histo[histo > 0]\n        step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255\n        if not step:\n            lut = np.array(range(256))\n        else:\n            # Compute the cumulative sum, shifted by step // 2\n            # and then normalized by step.\n            lut = (np.cumsum(histo) + (step // 2)) // step\n            # Shift lut, prepending with 0.\n            lut = np.concatenate([[0], lut[:-1]], 0)\n            # handle potential integer overflow\n            lut[lut > 255] = 255\n        # If step is zero, return the original image.\n        # Otherwise, index from lut.\n        return np.where(np.equal(step, 0), im, lut[im])\n\n    # Scales each channel independently and then stacks\n    # the result.\n    s1 = _scale_channel(img, 0)\n    s2 = _scale_channel(img, 1)\n    s3 = _scale_channel(img, 2)\n    equalized_img = np.stack([s1, s2, s3], axis=-1)\n    return equalized_img.astype(img.dtype)\n\n\ndef adjust_brightness(img, factor=1., backend=None):\n    \"\"\"Adjust image brightness.\n\n    This function controls the 
brightness of an image. An\n    enhancement factor of 0.0 gives a black image.\n    A factor of 1.0 gives the original image. This function\n    blends the source image and the degenerated black image:\n\n    .. math::\n        output = img * factor + degenerated * (1 - factor)\n\n    Args:\n        img (ndarray): Image to be brightened.\n        factor (float): A value controls the enhancement.\n            Factor 1.0 returns the original image, lower\n            factors mean less color (brightness, contrast,\n            etc), and higher values more. Default 1.\n        backend (str | None): The image processing backend type. Options are\n            `cv2`, `pillow`, `None`. If backend is None, the global\n            ``imread_backend`` specified by ``mmcv.use_backend()`` will be\n            used. Defaults to None.\n\n    Returns:\n        ndarray: The brightened image.\n    \"\"\"\n    if backend is None:\n        backend = imread_backend\n    if backend not in ['cv2', 'pillow']:\n        raise ValueError(f'backend: {backend} is not supported.'\n                         f\"Supported backends are 'cv2', 'pillow'\")\n\n    if backend == 'pillow':\n        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n        # Image.fromarray defaultly supports RGB, not BGR.\n        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')\n        enhancer = ImageEnhance.Brightness(pil_image)\n        pil_image = enhancer.enhance(factor)\n        return np.array(pil_image, dtype=img.dtype)[..., ::-1]\n    else:\n        degenerated = np.zeros_like(img)\n        # Note manually convert the dtype to np.float32, to\n        # achieve as close results as PIL.ImageEnhance.Brightness.\n        # Set beta=1-factor, and gamma=0\n        brightened_img = cv2.addWeighted(\n            img.astype(np.float32), factor, degenerated.astype(np.float32),\n            1 - factor, 0)\n        brightened_img = np.clip(brightened_img, 0, 255)\n        return 
brightened_img.astype(img.dtype)\n\n\ndef adjust_contrast(img, factor=1., backend=None):\n    \"\"\"Adjust image contrast.\n\n    This function controls the contrast of an image. An\n    enhancement factor of 0.0 gives a solid grey\n    image. A factor of 1.0 gives the original image. It\n    blends the source image and the degenerated mean image:\n\n    .. math::\n        output = img * factor + degenerated * (1 - factor)\n\n    Args:\n        img (ndarray): Image to be contrasted. BGR order.\n        factor (float): Same as :func:`mmcv.adjust_brightness`.\n        backend (str | None): The image processing backend type. Options are\n            `cv2`, `pillow`, `None`. If backend is None, the global\n            ``imread_backend`` specified by ``mmcv.use_backend()`` will be\n            used. Defaults to None.\n\n    Returns:\n        ndarray: The contrasted image.\n    \"\"\"\n    if backend is None:\n        backend = imread_backend\n    if backend not in ['cv2', 'pillow']:\n        raise ValueError(f'backend: {backend} is not supported.'\n                         f\"Supported backends are 'cv2', 'pillow'\")\n\n    if backend == 'pillow':\n        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n        # Image.fromarray defaultly supports RGB, not BGR.\n        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')\n        enhancer = ImageEnhance.Contrast(pil_image)\n        pil_image = enhancer.enhance(factor)\n        return np.array(pil_image, dtype=img.dtype)[..., ::-1]\n    else:\n        gray_img = bgr2gray(img)\n        hist = np.histogram(gray_img, 256, (0, 255))[0]\n        mean = round(np.sum(gray_img) / np.sum(hist))\n        degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)\n        degenerated = gray2bgr(degenerated)\n        contrasted_img = cv2.addWeighted(\n            img.astype(np.float32), factor, degenerated.astype(np.float32),\n            1 - factor, 0)\n        contrasted_img = 
np.clip(contrasted_img, 0, 255)\n        return contrasted_img.astype(img.dtype)\n\n\ndef auto_contrast(img, cutoff=0):\n    \"\"\"Auto adjust image contrast.\n\n    This function maximize (normalize) image contrast by first removing cutoff\n    percent of the lightest and darkest pixels from the histogram and remapping\n    the image so that the darkest pixel becomes black (0), and the lightest\n    becomes white (255).\n\n    Args:\n        img (ndarray): Image to be contrasted. BGR order.\n        cutoff (int | float | tuple): The cutoff percent of the lightest and\n            darkest pixels to be removed. If given as tuple, it shall be\n            (low, high). Otherwise, the single value will be used for both.\n            Defaults to 0.\n\n    Returns:\n        ndarray: The contrasted image.\n    \"\"\"\n\n    def _auto_contrast_channel(im, c, cutoff):\n        im = im[:, :, c]\n        # Compute the histogram of the image channel.\n        histo = np.histogram(im, 256, (0, 255))[0]\n        # Remove cut-off percent pixels from histo\n        histo_sum = np.cumsum(histo)\n        cut_low = histo_sum[-1] * cutoff[0] // 100\n        cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100\n        histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low\n        histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)\n\n        # Compute mapping\n        low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]\n        # If all the values have been cut off, return the origin img\n        if low >= high:\n            return im\n        scale = 255.0 / (high - low)\n        offset = -low * scale\n        lut = np.array(range(256))\n        lut = lut * scale + offset\n        lut = np.clip(lut, 0, 255)\n        return lut[im]\n\n    if isinstance(cutoff, (int, float)):\n        cutoff = (cutoff, cutoff)\n    else:\n        assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \\\n            f'float or tuple, but got {type(cutoff)} 
instead.'\n    # Auto adjusts contrast for each channel independently and then stacks\n    # the result.\n    s1 = _auto_contrast_channel(img, 0, cutoff)\n    s2 = _auto_contrast_channel(img, 1, cutoff)\n    s3 = _auto_contrast_channel(img, 2, cutoff)\n    contrasted_img = np.stack([s1, s2, s3], axis=-1)\n    return contrasted_img.astype(img.dtype)\n\n\ndef adjust_sharpness(img, factor=1., kernel=None):\n    \"\"\"Adjust image sharpness.\n\n    This function controls the sharpness of an image. An\n    enhancement factor of 0.0 gives a blurred image. A\n    factor of 1.0 gives the original image. And a factor\n    of 2.0 gives a sharpened image. It blends the source\n    image and the degenerated mean image:\n\n    .. math::\n        output = img * factor + degenerated * (1 - factor)\n\n    Args:\n        img (ndarray): Image to be sharpened. BGR order.\n        factor (float): Same as :func:`mmcv.adjust_brightness`.\n        kernel (np.ndarray, optional): Filter kernel to be applied on the img\n            to obtain the degenerated img. Defaults to None.\n\n    Note:\n        No value sanity check is enforced on the kernel set by users. 
So with\n        an inappropriate kernel, the ``adjust_sharpness`` may fail to perform\n        the function its name indicates but end up performing whatever\n        transform determined by the kernel.\n\n    Returns:\n        ndarray: The sharpened image.\n    \"\"\"\n\n    if kernel is None:\n        # adopted from PIL.ImageFilter.SMOOTH\n        kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13\n    assert isinstance(kernel, np.ndarray), \\\n        f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'\n    assert kernel.ndim == 2, \\\n        f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'\n\n    degenerated = cv2.filter2D(img, -1, kernel)\n    sharpened_img = cv2.addWeighted(\n        img.astype(np.float32), factor, degenerated.astype(np.float32),\n        1 - factor, 0)\n    sharpened_img = np.clip(sharpened_img, 0, 255)\n    return sharpened_img.astype(img.dtype)\n\n\ndef adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):\n    \"\"\"AlexNet-style PCA jitter.\n\n    This data augmentation is proposed in `ImageNet Classification with Deep\n    Convolutional Neural Networks\n    <https://dl.acm.org/doi/pdf/10.1145/3065386>`_.\n\n    Args:\n        img (ndarray): Image to be adjusted lighting. 
BGR order.\n        eigval (ndarray): the eigenvalue of the convariance matrix of pixel\n            values, respectively.\n        eigvec (ndarray): the eigenvector of the convariance matrix of pixel\n            values, respectively.\n        alphastd (float): The standard deviation for distribution of alpha.\n            Defaults to 0.1\n        to_rgb (bool): Whether to convert img to rgb.\n\n    Returns:\n        ndarray: The adjusted image.\n    \"\"\"\n    assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \\\n        f'eigval and eigvec should both be of type np.ndarray, got ' \\\n        f'{type(eigval)} and {type(eigvec)} instead.'\n\n    assert eigval.ndim == 1 and eigvec.ndim == 2\n    assert eigvec.shape == (3, eigval.shape[0])\n    n_eigval = eigval.shape[0]\n    assert isinstance(alphastd, float), 'alphastd should be of type float, ' \\\n        f'got {type(alphastd)} instead.'\n\n    img = img.copy().astype(np.float32)\n    if to_rgb:\n        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace\n\n    alpha = np.random.normal(0, alphastd, n_eigval)\n    alter = eigvec \\\n        * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \\\n        * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))\n    alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)\n    img_adjusted = img + alter\n    return img_adjusted\n\n\ndef lut_transform(img, lut_table):\n    \"\"\"Transform array by look-up table.\n\n    The function lut_transform fills the output array with values from the\n    look-up table. 
Indices of the entries are taken from the input array.\n\n    Args:\n        img (ndarray): Image to be transformed.\n        lut_table (ndarray): look-up table of 256 elements; in case of\n            multi-channel input array, the table should either have a single\n            channel (in this case the same table is used for all channels) or\n            the same number of channels as in the input array.\n\n    Returns:\n        ndarray: The transformed image.\n    \"\"\"\n    assert isinstance(img, np.ndarray)\n    assert 0 <= np.min(img) and np.max(img) <= 255\n    assert isinstance(lut_table, np.ndarray)\n    assert lut_table.shape == (256, )\n\n    return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)\n\n\ndef clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):\n    \"\"\"Use CLAHE method to process the image.\n\n    See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].\n    Graphics Gems, 1994:474-485.` for more information.\n\n    Args:\n        img (ndarray): Image to be processed.\n        clip_limit (float): Threshold for contrast limiting. Default: 40.0.\n        tile_grid_size (tuple[int]): Size of grid for histogram equalization.\n            Input image will be divided into equally sized rectangular tiles.\n            It defines the number of tiles in row and column. 
Default: (8, 8).\n\n    Returns:\n        ndarray: The processed image.\n    \"\"\"\n    assert isinstance(img, np.ndarray)\n    assert img.ndim == 2\n    assert isinstance(clip_limit, (float, int))\n    assert is_tuple_of(tile_grid_size, int)\n    assert len(tile_grid_size) == 2\n\n    clahe = cv2.createCLAHE(clip_limit, tile_grid_size)\n    return clahe.apply(np.array(img, dtype=np.uint8))\n\n\ndef adjust_hue(img: np.ndarray,\n               hue_factor: float,\n               backend: Optional[str] = None) -> np.ndarray:\n    \"\"\"Adjust hue of an image.\n\n    The image hue is adjusted by converting the image to HSV and cyclically\n    shifting the intensities in the hue channel (H). The image is then\n    converted back to original image mode.\n\n    `hue_factor` is the amount of shift in H channel and must be in the\n    interval `[-0.5, 0.5]`.\n\n    Modified from\n    https://github.com/pytorch/vision/blob/main/torchvision/\n    transforms/functional.py\n\n    Args:\n        img (ndarray): Image to be adjusted.\n        hue_factor (float):  How much to shift the hue channel. Should be in\n            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in\n            HSV space in positive and negative direction respectively.\n            0 means no shift. Therefore, both -0.5 and 0.5 will give an image\n            with complementary colors while 0 gives the original image.\n        backend (str | None): The image processing backend type. Options are\n            `cv2`, `pillow`, `None`. If backend is None, the global\n            ``imread_backend`` specified by ``mmcv.use_backend()`` will be\n            used. 
Defaults to None.\n\n    Returns:\n        ndarray: Hue adjusted image.\n    \"\"\"\n    if backend is None:\n        backend = imread_backend\n    if backend not in ['cv2', 'pillow']:\n        raise ValueError(f'backend: {backend} is not supported.'\n                         f\"Supported backends are 'cv2', 'pillow'\")\n\n    if not (-0.5 <= hue_factor <= 0.5):\n        raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].')\n    if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})):\n        raise TypeError('img should be ndarray with dim=[2 or 3].')\n\n    if backend == 'pillow':\n        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'\n        # Image.fromarray defaultly supports RGB, not BGR.\n        pil_image = Image.fromarray(img[..., ::-1], mode='RGB')\n        input_mode = pil_image.mode\n        if input_mode in {'L', '1', 'I', 'F'}:\n            return pil_image\n\n        h, s, v = pil_image.convert('HSV').split()\n\n        np_h = np.array(h, dtype=np.uint8)\n        # uint8 addition take cares of rotation across boundaries\n        with np.errstate(over='ignore'):\n            np_h += np.uint8(hue_factor * 255)\n        h = Image.fromarray(np_h, 'L')\n\n        pil_image = Image.merge('HSV', (h, s, v)).convert(input_mode)\n        return np.array(pil_image, dtype=img.dtype)[..., ::-1]\n    else:\n        dtype = img.dtype\n        img = img.astype(np.uint8)\n        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)\n        h, s, v = cv2.split(hsv_img)\n        h = h.astype(np.uint8)\n        # uint8 addition take cares of rotation across boundaries\n        with np.errstate(over='ignore'):\n            h += np.uint8(hue_factor * 255)\n        hsv_img = cv2.merge([h, s, v])\n        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)\n"
  },
  {
    "path": "mmcv/ops/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom mmcv.utils import IS_MLU_AVAILABLE\nfrom .active_rotated_filter import active_rotated_filter\nfrom .assign_score_withk import assign_score_withk\nfrom .ball_query import ball_query\nfrom .bbox import bbox_overlaps\nfrom .bezier_align import BezierAlign, bezier_align\nfrom .bias_act import bias_act\nfrom .border_align import BorderAlign, border_align\nfrom .box_iou_quadri import box_iou_quadri\nfrom .box_iou_rotated import box_iou_rotated\nfrom .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive\nfrom .cc_attention import CrissCrossAttention\nfrom .chamfer_distance import chamfer_distance\nfrom .contour_expand import contour_expand\nfrom .conv2d_gradfix import conv2d, conv_transpose2d\nfrom .convex_iou import convex_giou, convex_iou\nfrom .corner_pool import CornerPool\nfrom .correlation import Correlation\nfrom .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d\nfrom .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,\n                              ModulatedDeformRoIPoolPack, deform_roi_pool)\nfrom .deprecated_wrappers import Conv2d_deprecated as Conv2d\nfrom .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d\nfrom .deprecated_wrappers import Linear_deprecated as Linear\nfrom .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d\nfrom .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d\nfrom .filtered_lrelu import filtered_lrelu\nfrom .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,\n                         sigmoid_focal_loss, softmax_focal_loss)\nfrom .furthest_point_sample import (furthest_point_sample,\n                                    furthest_point_sample_with_dist)\nfrom .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu\nfrom .gather_points import gather_points\nfrom .group_points import GroupAll, QueryAndGroup, grouping_operation\nfrom .info import get_compiler_version, 
get_compiling_cuda_version\nfrom .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d,\n                    nms3d_normal, nms_bev, nms_normal_bev)\nfrom .knn import knn\nfrom .masked_conv import MaskedConv2d, masked_conv2d\nfrom .min_area_polygons import min_area_polygons\nfrom .modulated_deform_conv import (ModulatedDeformConv2d,\n                                    ModulatedDeformConv2dPack,\n                                    modulated_deform_conv2d)\nfrom .multi_scale_deform_attn import MultiScaleDeformableAttention\nfrom .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms\nfrom .pixel_group import pixel_group\nfrom .point_sample import (SimpleRoIAlign, point_sample,\n                           rel_roi_point_to_rel_img_point)\nfrom .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,\n                              points_in_boxes_part)\nfrom .points_in_polygons import points_in_polygons\nfrom .points_sampler import PointsSampler\nfrom .prroi_pool import PrRoIPool, prroi_pool\nfrom .psa_mask import PSAMask\nfrom .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated\nfrom .roi_align import RoIAlign, roi_align\nfrom .roi_align_rotated import RoIAlignRotated, roi_align_rotated\nfrom .roi_pool import RoIPool, roi_pool\nfrom .roiaware_pool3d import RoIAwarePool3d\nfrom .roipoint_pool3d import RoIPointPool3d\nfrom .rotated_feature_align import rotated_feature_align\nfrom .saconv import SAConv2d\nfrom .scatter_points import DynamicScatter, dynamic_scatter\nfrom .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,\n                          SparseConvTranspose3d, SparseInverseConv2d,\n                          SparseInverseConv3d, SubMConv2d, SubMConv3d)\nfrom .sparse_modules import SparseModule, SparseSequential\nfrom .sparse_pool import SparseMaxPool2d, SparseMaxPool3d\nfrom .sparse_structure import SparseConvTensor, scatter_nd\nfrom .sync_bn import SyncBatchNorm\nfrom 
.three_interpolate import three_interpolate\nfrom .three_nn import three_nn\nfrom .tin_shift import TINShift, tin_shift\nfrom .upfirdn2d import filter2d, upfirdn2d, upsample2d\nfrom .voxelize import Voxelization, voxelization\n\n__all__ = [\n    'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',\n    'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',\n    'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',\n    'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',\n    'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',\n    'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d',\n    'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',\n    'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',\n    'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',\n    'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',\n    'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',\n    'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',\n    'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated',\n    'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU',\n    'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated',\n    'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated',\n    'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation',\n    'contour_expand', 'three_nn', 'three_interpolate',\n    'MultiScaleDeformableAttention', 'BorderAlign', 'border_align',\n    'gather_points', 'furthest_point_sample', 'nms_quadri',\n    'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',\n    'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev',\n    'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization',\n    'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d',\n    'SparseConv3d', 'SparseConvTranspose2d', 
'SparseConvTranspose3d',\n    'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d',\n    'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d',\n    'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',\n    'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',\n    'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',\n    'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',\n    'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d',\n    'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align'\n]\n\nif IS_MLU_AVAILABLE:\n    from .deform_conv import DeformConv2dPack_MLU  # noqa:F401\n    from .modulated_deform_conv import \\\n        ModulatedDeformConv2dPack_MLU  # noqa:F401\n    __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU'])\n"
  },
  {
    "path": "mmcv/ops/active_rotated_filter.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Tuple\n\nimport torch\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext',\n    ['active_rotated_filter_forward', 'active_rotated_filter_backward'])\n\n\nclass ActiveRotatedFilterFunction(Function):\n    \"\"\"Encoding the orientation information and generating orientation-\n    sensitive features.\n\n    The details are described in the paper\n    `Align Deep Features for Oriented Object Detection <https://arxiv.org/abs/2008.09397>`_.\n    \"\"\"  # noqa: E501\n\n    @staticmethod\n    def forward(ctx, input: torch.Tensor,\n                indices: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            input (torch.Tensor): Input features with shape\n                [num_output_planes, num_input_planes, num_orientations, H, W].\n            indices (torch.Tensor): Indices with shape\n                [num_orientations, H, W, num_rotations].\n\n        Returns:\n            torch.Tensor: Refined features with shape [num_output_planes *\n            num_rotations, num_input_planes * num_orientations, H, W].\n        \"\"\"\n        ctx.save_for_backward(input, indices)\n        op, ip, o, h, w = input.size()\n        o, h, w, r = indices.size()\n        output = input.new_zeros((op * r, ip * o, h, w))\n        ext_module.active_rotated_filter_forward(input, indices, output)\n\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:\n        \"\"\"\n        Args:\n            grad_out (torch.Tensor): The gradient of output features\n                with shape [num_output_planes * num_rotations,\n                num_input_planes * num_orientations, H, W].\n\n        Returns:\n            torch.Tensor: The gradient of input features with shape\n            [num_output_planes, num_input_planes, num_orientations, H, W].\n        \"\"\"\n        input, indices = ctx.saved_tensors\n        grad_in = torch.zeros_like(input)\n        ext_module.active_rotated_filter_backward(grad_out, indices, grad_in)\n        return grad_in, None\n\n\nactive_rotated_filter = ActiveRotatedFilterFunction.apply\n"
  },
  {
    "path": "mmcv/ops/assign_score_withk.py",
    "content": "from typing import Tuple\n\nimport torch\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])\n\n\nclass AssignScoreWithK(Function):\n    r\"\"\"Perform weighted sum to generate output features according to scores.\n    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/\n    scene_seg/lib/paconv_lib/src/gpu>`_.\n\n    This is a memory-efficient CUDA implementation of assign_scores operation,\n    which first transform all point features with weight bank, then assemble\n    neighbor features with ``knn_idx`` and perform weighted sum of ``scores``.\n\n    See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D for\n        more detailed descriptions.\n\n    Note:\n        This implementation assumes using ``neighbor`` kernel input, which is\n            (point_features - center_features, point_features).\n        See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/\n        pointnet2/paconv.py#L128 for more details.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx,\n                scores: torch.Tensor,\n                point_features: torch.Tensor,\n                center_features: torch.Tensor,\n                knn_idx: torch.Tensor,\n                aggregate: str = 'sum') -> torch.Tensor:\n        \"\"\"\n        Args:\n            scores (torch.Tensor): (B, npoint, K, M), predicted scores to\n                aggregate weight matrices in the weight bank.\n                ``npoint`` is the number of sampled centers.\n                ``K`` is the number of queried neighbors.\n                ``M`` is the number of weight matrices in the weight bank.\n            point_features (torch.Tensor): (B, N, M, out_dim)\n                Pre-computed point features to be aggregated.\n            center_features (torch.Tensor): (B, N, M, out_dim)\n                Pre-computed 
center features to be aggregated.\n            knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.\n                We assume the first idx in each row is the idx of the center.\n            aggregate (str, optional): Aggregation method.\n                Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.\n\n        Returns:\n            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.\n        \"\"\"\n        agg = {'sum': 0, 'avg': 1, 'max': 2}\n\n        B, N, M, out_dim = point_features.size()\n        _, npoint, K, _ = scores.size()\n\n        output = point_features.new_zeros((B, out_dim, npoint, K))\n        ext_module.assign_score_withk_forward(\n            point_features.contiguous(),\n            center_features.contiguous(),\n            scores.contiguous(),\n            knn_idx.contiguous(),\n            output,\n            B=B,\n            N0=N,\n            N1=npoint,\n            M=M,\n            K=K,\n            O=out_dim,\n            aggregate=agg[aggregate])\n\n        ctx.save_for_backward(output, point_features, center_features, scores,\n                              knn_idx)\n        ctx.agg = agg[aggregate]\n\n        return output\n\n    @staticmethod\n    def backward(\n        ctx, grad_out: torch.Tensor\n    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:\n        \"\"\"\n        Args:\n            grad_out (torch.Tensor): (B, out_dim, npoint, K)\n\n        Returns:\n            tuple[torch.Tensor]: A tuple contains five elements. The first one\n            is the gradient of ``scores`` whose shape is (B, npoint, K, M). The\n            second is the gradient of ``point_features`` whose shape is\n            (B, N, M, out_dim). The third is the gradient of\n            ``center_features`` with the shape of (B, N, M, out_dim). 
The last\n            two are ``None``.\n        \"\"\"\n        _, point_features, center_features, scores, knn_idx = ctx.saved_tensors\n\n        agg = ctx.agg\n\n        B, N, M, out_dim = point_features.size()\n        _, npoint, K, _ = scores.size()\n\n        grad_point_features = point_features.new_zeros(point_features.shape)\n        grad_center_features = center_features.new_zeros(center_features.shape)\n        grad_scores = scores.new_zeros(scores.shape)\n\n        ext_module.assign_score_withk_backward(\n            grad_out.contiguous(),\n            point_features.contiguous(),\n            center_features.contiguous(),\n            scores.contiguous(),\n            knn_idx.contiguous(),\n            grad_point_features,\n            grad_center_features,\n            grad_scores,\n            B=B,\n            N0=N,\n            N1=npoint,\n            M=M,\n            K=K,\n            O=out_dim,\n            aggregate=agg)\n\n        return grad_scores, grad_point_features, \\\n            grad_center_features, None, None\n\n\nassign_score_withk = AssignScoreWithK.apply\n"
  },
  {
    "path": "mmcv/ops/ball_query.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional, Tuple\n\nimport torch\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['ball_query_forward', 'stack_ball_query_forward'])\n\n\nclass BallQuery(Function):\n    \"\"\"Find nearby points in spherical space.\"\"\"\n\n    @staticmethod\n    def forward(\n            ctx,\n            min_radius: float,\n            max_radius: float,\n            sample_num: int,\n            xyz: torch.Tensor,\n            center_xyz: torch.Tensor,\n            xyz_batch_cnt: Optional[torch.Tensor] = None,\n            center_xyz_batch_cnt: Optional[torch.Tensor] = None\n    ) -> torch.Tensor:\n        \"\"\"\n        Args:\n            min_radius (float): minimum radius of the balls.\n            max_radius (float): maximum radius of the balls.\n            sample_num (int): maximum number of features in the balls.\n            xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features,\n                or stacked input (N1 + N2 ..., 3).\n            center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball\n                query, or stacked input (M1 + M2 ..., 3).\n            xyz_batch_cnt: (batch_size): Stacked input xyz coordinates nums in\n                each batch, just like (N1, N2, ...). Defaults to None.\n                New in version 1.7.0.\n            center_xyz_batch_cnt: (batch_size): Stacked centers coordinates\n                nums in each batch, just like (M1, M2, ...). Defaults to None.\n                New in version 1.7.0.\n\n        Returns:\n            torch.Tensor: (B, npoint, nsample) tensor with the indices of the\n            features that form the query balls.\n        \"\"\"\n        assert center_xyz.is_contiguous()\n        assert xyz.is_contiguous()\n        assert min_radius < max_radius\n        if xyz_batch_cnt is not None and center_xyz_batch_cnt is not None:\n            assert xyz_batch_cnt.dtype == torch.int\n            assert center_xyz_batch_cnt.dtype == torch.int\n            idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num),\n                                       dtype=torch.int32)\n            ext_module.stack_ball_query_forward(\n                center_xyz,\n                center_xyz_batch_cnt,\n                xyz,\n                xyz_batch_cnt,\n                idx,\n                max_radius=max_radius,\n                nsample=sample_num,\n            )\n        else:\n            B, N, _ = xyz.size()\n            npoint = center_xyz.size(1)\n            idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int32)\n            ext_module.ball_query_forward(\n                center_xyz,\n                xyz,\n                idx,\n                b=B,\n                n=N,\n                m=npoint,\n                min_radius=min_radius,\n                max_radius=max_radius,\n                nsample=sample_num)\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(idx)\n        return idx\n\n    @staticmethod\n    def backward(ctx, a=None) -> Tuple[None, None, None, None]:\n        return None, None, None, None\n\n\nball_query = BallQuery.apply\n"
  },
  {
    "path": "mmcv/ops/bbox.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])\n\n\ndef _bbox_overlaps_cpu(bboxes1: torch.Tensor,\n                       bboxes2: torch.Tensor,\n                       mode: str = 'iou',\n                       aligned: bool = False,\n                       offset: int = 0) -> torch.Tensor:\n    assert mode in ['iou', 'iof']\n\n    if aligned:\n        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]\n        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]\n\n        wh = (rb - lt + offset).clamp(min=0)  # [rows, 2]\n        overlap = wh[:, 0] * wh[:, 1]\n        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (\n            bboxes1[:, 3] - bboxes1[:, 1] + offset)\n\n        if mode == 'iou':\n            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (\n                bboxes2[:, 3] - bboxes2[:, 1] + offset)\n            ious = overlap / (area1 + area2 - overlap)\n        else:\n            ious = overlap / area1\n    else:\n        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]\n        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]\n\n        wh = (rb - lt + offset).clamp(min=0)  # [rows, cols, 2]\n        overlap = wh[:, :, 0] * wh[:, :, 1]\n        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (\n            bboxes1[:, 3] - bboxes1[:, 1] + offset)\n\n        if mode == 'iou':\n            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (\n                bboxes2[:, 3] - bboxes2[:, 1] + offset)\n            ious = overlap / (area1[:, None] + area2 - overlap)\n        else:\n            ious = overlap / (area1[:, None])\n\n    return ious\n\n\ndef bbox_overlaps(bboxes1: torch.Tensor,\n                  bboxes2: torch.Tensor,\n                  mode: str = 'iou',\n                  aligned: bool = False,\n                  offset: int = 0) -> torch.Tensor:\n    \"\"\"Calculate overlap between two sets of bboxes.\n\n    If ``aligned`` is ``False``, then calculate the ious between each bbox\n    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of\n    bboxes1 and bboxes2.\n\n    Args:\n        bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or\n            empty.\n        bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or\n            empty. If aligned is ``True``, then m and n must be equal.\n        mode (str): \"iou\" (intersection over union) or iof (intersection over\n            foreground).\n\n    Returns:\n        torch.Tensor: Return the ious between boxes. If ``aligned`` is\n        ``False``, the shape of ious is (m, n) else (m, ).\n\n    Example:\n        >>> bboxes1 = torch.FloatTensor([\n        >>>     [0, 0, 10, 10],\n        >>>     [10, 10, 20, 20],\n        >>>     [32, 32, 38, 42],\n        >>> ])\n        >>> bboxes2 = torch.FloatTensor([\n        >>>     [0, 0, 10, 20],\n        >>>     [0, 10, 10, 19],\n        >>>     [10, 10, 20, 20],\n        >>> ])\n        >>> bbox_overlaps(bboxes1, bboxes2)\n        tensor([[0.5000, 0.0000, 0.0000],\n                [0.0000, 0.0000, 1.0000],\n                [0.0000, 0.0000, 0.0000]])\n\n    Example:\n        >>> empty = torch.FloatTensor([])\n        >>> nonempty = torch.FloatTensor([\n        >>>     [0, 0, 10, 9],\n        >>> ])\n        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)\n        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)\n        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)\n    \"\"\"\n\n    mode_dict = {'iou': 0, 'iof': 1}\n    assert mode in mode_dict.keys()\n    mode_flag = mode_dict[mode]\n    # Either the boxes are empty or the length of boxes' last dimension is 4\n    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)\n    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)\n    assert offset == 1 or offset == 0\n\n    rows = bboxes1.size(0)\n    cols = bboxes2.size(0)\n\n    if aligned:\n        assert rows == cols\n        ious = bboxes1.new_zeros(rows)\n    else:\n        ious = bboxes1.new_zeros((rows, cols))\n\n    if rows * cols == 0:\n        return ious\n\n    if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots':\n        return _bbox_overlaps_cpu(\n            bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)\n\n    ext_module.bbox_overlaps(\n        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)\n\n    return ious\n"
  },
  {
    "path": "mmcv/ops/bezier_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['bezier_align_forward', 'bezier_align_backward'])\n\n\nclass BezierAlignFunction(Function):\n\n    @staticmethod\n    def forward(ctx,\n                input: torch.Tensor,\n                beziers: torch.Tensor,\n                output_size: Union[int, Tuple[int, int]],\n                spatial_scale: Union[int, float] = 1.0,\n                sampling_ratio: int = 0,\n                aligned: bool = True) -> torch.Tensor:\n        ctx.output_size = _pair(output_size)\n        ctx.spatial_scale = spatial_scale\n        ctx.input_shape = input.size()\n        ctx.sampling_ratio = sampling_ratio\n        ctx.aligned = aligned\n\n        assert beziers.size(1) == 17\n        output_shape = (beziers.size(0), input.size(1), ctx.output_size[0],\n                        ctx.output_size[1])\n        output = input.new_zeros(output_shape)\n        ext_module.bezier_align_forward(\n            input,\n            beziers,\n            output,\n            aligned_height=ctx.output_size[0],\n            aligned_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            aligned=ctx.aligned)\n\n        ctx.save_for_backward(beziers)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output: torch.Tensor):\n        beziers = ctx.saved_tensors[0]\n        grad_input = grad_output.new_zeros(ctx.input_shape)\n        grad_output = grad_output.contiguous()\n        ext_module.bezier_align_backward(\n            grad_output,\n            beziers,\n            grad_input,\n            
aligned_height=ctx.output_size[0],\n            aligned_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            aligned=ctx.aligned)\n        return grad_input, None, None, None, None, None\n\n\nbezier_align = BezierAlignFunction.apply\n\n\nclass BezierAlign(nn.Module):\n    \"\"\"Bezier align pooling layer.\n\n    Args:\n        output_size (tuple): h, w\n        spatial_scale (float): scale the input boxes by this number\n        sampling_ratio (int): number of inputs samples to take for each\n            output sample. 0 to take samples densely for current models.\n        aligned (bool): if False, use the legacy implementation in\n            MMDetection. If True, align the results more perfectly.\n\n    Note:\n        The implementation of BezierAlign is modified from\n        https://github.com/aim-uofa/AdelaiDet\n\n        The meaning of aligned=True:\n\n        Given a continuous coordinate c, its two neighboring pixel\n        indices (in our pixel model) are computed by floor(c - 0.5) and\n        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete\n        indices [0] and [1] (which are sampled from the underlying signal\n        at continuous coordinates 0.5 and 1.5). But the original roi_align\n        (aligned=False) does not subtract the 0.5 when computing\n        neighboring pixel indices and therefore it uses pixels with a\n        slightly incorrect alignment (relative to our pixel model) when\n        performing bilinear interpolation.\n\n        With `aligned=True`,\n        we first appropriately scale the ROI and then shift it by -0.5\n        prior to calling roi_align. 
This produces the correct neighbors;\n\n        The difference does not make a difference to the model's\n        performance if ROIAlign is used together with conv layers.\n    \"\"\"\n\n    def __init__(\n        self,\n        output_size: Tuple,\n        spatial_scale: Union[int, float],\n        sampling_ratio: int,\n        aligned: bool = True,\n    ) -> None:\n        super().__init__()\n\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n        self.sampling_ratio = int(sampling_ratio)\n        self.aligned = aligned\n\n    def forward(self, input: torch.Tensor,\n                beziers: torch.Tensor) -> torch.Tensor:\n        \"\"\"BezierAlign forward.\n\n        Args:\n            inputs (Tensor): input features.\n            beziers (Tensor): beziers for align.\n        \"\"\"\n        return bezier_align(input, beziers, self.output_size,\n                            self.spatial_scale, self.sampling_ratio,\n                            self.aligned)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(output_size={self.output_size}, '\n        s += f'spatial_scale={self.spatial_scale})'\n        s += f'sampling_ratio={self.sampling_ratio})'\n        s += f'aligned={self.aligned})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/bias_act.py",
    "content": "# Modified from\n# https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py\n\n# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors retain all intellectual property\n# and proprietary rights in and to this software, related documentation\n# and any modifications thereto.  Any use, reproduction, disclosure or\n# distribution of this software and related documentation without an express\n# license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n# source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa\n\"\"\"Custom PyTorch ops for efficient bias and activation.\"\"\"\n\nfrom typing import Any, Dict, Optional, Union\n\nimport numpy as np\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['bias_act'])\n\n\nclass EasyDict(dict):\n    \"\"\"Convenience class that behaves like a dict but allows access with the\n    attribute syntax.\"\"\"\n\n    def __getattr__(self, name: str) -> Any:\n        try:\n            return self[name]\n        except KeyError:\n            raise AttributeError(name)\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        self[name] = value\n\n    def __delattr__(self, name: str) -> None:\n        del self[name]\n\n\nactivation_funcs = {\n    'linear':\n    EasyDict(\n        func=lambda x, **_: x,\n        def_alpha=0,\n        def_gain=1,\n        cuda_idx=1,\n        ref='',\n        has_2nd_grad=False),\n    'relu':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.relu(x),\n        def_alpha=0,\n        def_gain=np.sqrt(2),\n        cuda_idx=2,\n        ref='y',\n        has_2nd_grad=False),\n    'lrelu':\n    EasyDict(\n        func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),\n        def_alpha=0.2,\n        def_gain=np.sqrt(2),\n        cuda_idx=3,\n        
ref='y',\n        has_2nd_grad=False),\n    'tanh':\n    EasyDict(\n        func=lambda x, **_: torch.tanh(x),\n        def_alpha=0,\n        def_gain=1,\n        cuda_idx=4,\n        ref='y',\n        has_2nd_grad=True),\n    'sigmoid':\n    EasyDict(\n        func=lambda x, **_: torch.sigmoid(x),\n        def_alpha=0,\n        def_gain=1,\n        cuda_idx=5,\n        ref='y',\n        has_2nd_grad=True),\n    'elu':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.elu(x),\n        def_alpha=0,\n        def_gain=1,\n        cuda_idx=6,\n        ref='y',\n        has_2nd_grad=True),\n    'selu':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.selu(x),\n        def_alpha=0,\n        def_gain=1,\n        cuda_idx=7,\n        ref='y',\n        has_2nd_grad=True),\n    'softplus':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.softplus(x),\n        def_alpha=0,\n        def_gain=1,\n        cuda_idx=8,\n        ref='y',\n        has_2nd_grad=True),\n    'swish':\n    EasyDict(\n        func=lambda x, **_: torch.sigmoid(x) * x,\n        def_alpha=0,\n        def_gain=np.sqrt(2),\n        cuda_idx=9,\n        ref='x',\n        has_2nd_grad=True),\n}\n\nactivation_funcs_musa = {\n    'linear':\n    EasyDict(\n        func=lambda x, **_: x,\n        def_alpha=0,\n        def_gain=1,\n        musa_idx=1,\n        ref='',\n        has_2nd_grad=False),\n    'relu':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.relu(x),\n        def_alpha=0,\n        def_gain=np.sqrt(2),\n        musa_idx=2,\n        ref='y',\n        has_2nd_grad=False),\n    'lrelu':\n    EasyDict(\n        func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),\n        def_alpha=0.2,\n        def_gain=np.sqrt(2),\n        musa_idx=3,\n        ref='y',\n        has_2nd_grad=False),\n    'tanh':\n    EasyDict(\n        func=lambda x, **_: torch.tanh(x),\n        def_alpha=0,\n        def_gain=1,\n        musa_idx=4,\n        
ref='y',\n        has_2nd_grad=True),\n    'sigmoid':\n    EasyDict(\n        func=lambda x, **_: torch.sigmoid(x),\n        def_alpha=0,\n        def_gain=1,\n        musa_idx=5,\n        ref='y',\n        has_2nd_grad=True),\n    'elu':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.elu(x),\n        def_alpha=0,\n        def_gain=1,\n        musa_idx=6,\n        ref='y',\n        has_2nd_grad=True),\n    'selu':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.selu(x),\n        def_alpha=0,\n        def_gain=1,\n        musa_idx=7,\n        ref='y',\n        has_2nd_grad=True),\n    'softplus':\n    EasyDict(\n        func=lambda x, **_: torch.nn.functional.softplus(x),\n        def_alpha=0,\n        def_gain=1,\n        musa_idx=8,\n        ref='y',\n        has_2nd_grad=True),\n    'swish':\n    EasyDict(\n        func=lambda x, **_: torch.sigmoid(x) * x,\n        def_alpha=0,\n        def_gain=np.sqrt(2),\n        musa_idx=9,\n        ref='x',\n        has_2nd_grad=True),\n}\n\n_null_tensor = torch.empty([0])\n\n\ndef bias_act(input: torch.Tensor,\n             bias: Optional[torch.Tensor] = None,\n             dim: int = 1,\n             act: str = 'linear',\n             alpha: Optional[Union[float, int]] = None,\n             gain: Optional[float] = None,\n             clamp: Optional[float] = None,\n             use_custom_op: bool = True):\n    r\"\"\"Fused bias and activation function.\n\n    Adds `bias` to activation tensor `input`, and evaluates activation\n    function `act`, and scales the result by `gain`. Each of the steps is\n    optional.\n\n    In most cases, the fused op is considerably more efficient than performing\n    the same calculation using standard PyTorch ops. It supports first and\n    second order gradients, but not third order gradients.\n\n    Args:\n        input (torch.Tensor): Input activation tensor. 
Can be of any shape.\n        bias (torch.Tensor): Bias vector, or `None` to disable.\n            Must be a 1D tensor of the same type as `input`. The shape must\n            be known, and it must match the dimension of `input` corresponding\n            to `dim`. Defaults to None.\n        dim (int): The dimension in `input` corresponding to the elements of\n            `bias`. The value of `dim` is ignored if `b` is not specified.\n            Defaults to 1.\n        act (str): Name of the activation function to evaluate, or `\"linear\"`\n            to disable. Can be e.g. \"relu\", \"lrelu\", \"tanh\", \"sigmoid\",\n            \"swish\", etc. See `activation_funcs` for a full list. `None` is not\n            allowed. Defaults to `linear`.\n        alpha (float or int): Shape parameter for the activation\n            function, or `None` to use the default. Defaults to None.\n        gain (float): Scaling factor for the output tensor, or `None`\n            to use default. See `activation_funcs` for the default scaling of\n            each activation function. If unsure, consider specifying 1.\n            Defaults to None.\n        clamp (float):  Clamp the output values to `[-clamp, +clamp]`,\n            or `None` to disable the clamping (default). 
Defaults to None.\n        use_custom_op (bool): Whether to use customized op.\n            Defaults to True.\n\n    Returns:\n        torch.Tensor: Tensor of the same shape and datatype as `input`.\n    \"\"\"\n    assert isinstance(input, torch.Tensor)\n    if use_custom_op and input.is_cuda:\n        return _bias_act_cuda(\n            dim=dim, act=act, alpha=alpha, gain=gain,\n            clamp=clamp).apply(input, bias)\n    try:\n        if use_custom_op and input.is_musa:\n            return _bias_act_musa(\n                dim=dim, act=act, alpha=alpha, gain=gain,\n                clamp=clamp).apply(input, bias)\n    except AttributeError:\n        pass\n    return _bias_act_ref(\n        input=input,\n        bias=bias,\n        dim=dim,\n        act=act,\n        alpha=alpha,\n        gain=gain,\n        clamp=clamp)\n\n\ndef _bias_act_ref(input: torch.Tensor,\n                  bias: Optional[torch.Tensor] = None,\n                  dim: int = 1,\n                  act: str = 'linear',\n                  alpha: Optional[Union[float, int]] = None,\n                  gain: Optional[float] = None,\n                  clamp: Optional[float] = None):\n    \"\"\"Slow reference implementation of `bias_act()` using standard PyTorch\n    ops.\n\n    Adds `bias` to activation tensor `input`, and evaluates activation\n    function `act`, and scales the result by `gain`. Each of the steps is\n    optional.\n\n    In most cases, the fused op is considerably more efficient than performing\n    the same calculation using standard PyTorch ops. It supports first and\n    second order gradients, but not third order gradients.\n\n    Args:\n        input (torch.Tensor): Input activation tensor. Can be of any shape.\n        bias (torch.Tensor): Bias vector, or `None` to disable.\n            Must be a 1D tensor of the same type as `input`. The shape must\n            be known, and it must match the dimension of `input` corresponding\n            to `dim`. 
Defaults to None.\n        dim (int): The dimension in `input` corresponding to the elements of\n            `bias`. The value of `dim` is ignored if `b` is not specified.\n            Defaults to 1.\n        act (str): Name of the activation function to evaluate, or `\"linear\"`\n            to disable. Can be e.g. \"relu\", \"lrelu\", \"tanh\", \"sigmoid\",\n            \"swish\", etc. See `activation_funcs` for a full list. `None` is not\n            allowed. Defaults to `linear`.\n        alpha (float or int): Shape parameter for the activation\n            function, or `None` to use the default. Defaults to None.\n        gain (float): Scaling factor for the output tensor, or `None`\n            to use default. See `activation_funcs` for the default scaling of\n            each activation function. If unsure, consider specifying 1.\n            Defaults to None.\n        clamp (float):  Clamp the output values to\n            `[-clamp, +clamp]`, or `None` to disable the clamping (default).\n            Defaults to None.\n\n    Returns:\n        torch.Tensor: Tensor of the same shape and datatype as `input`.\n    \"\"\"\n    assert isinstance(input, torch.Tensor)\n    assert clamp is None or clamp >= 0\n    spec = activation_funcs[act]\n    alpha = float(alpha if alpha is not None else spec.def_alpha)\n    gain = float(gain if gain is not None else spec.def_gain)\n    clamp = float(clamp if clamp is not None else -1)\n\n    # Add bias.\n    if bias is not None:\n        assert isinstance(bias, torch.Tensor) and bias.ndim == 1\n        assert 0 <= dim < input.ndim\n        assert bias.shape[0] == input.shape[dim]\n        input = input + bias.reshape(\n            [-1 if i == dim else 1 for i in range(input.ndim)])\n\n    # Evaluate activation function.\n    alpha = float(alpha)\n    output = spec.func(input, alpha=alpha)\n\n    # Scale by gain.\n    gain = float(gain)\n    if gain != 1:\n        output = output * gain\n\n    # Clamp.\n    if clamp >= 0:\n       
 # pylint: disable=invalid-unary-operand-type\n        output = output.clamp(-clamp, clamp)\n    return output\n\n\n_bias_act_cuda_cache: Dict = dict()\n\n\ndef _bias_act_cuda(dim: int = 1,\n                   act: str = 'linear',\n                   alpha: Optional[Union[float, int]] = None,\n                   gain: Optional[float] = None,\n                   clamp: Optional[float] = None):\n    \"\"\"\"Fast CUDA implementation of `bias_act()` using custom ops.\n\n    Args:\n        dim (int): The dimension in `x` corresponding to the elements of `b`.\n            The value of `dim` is ignored if `b` is not specified.\n            Defaults to 1.\n        act (str): Name of the activation function to evaluate, or `\"linear\"`\n            to disable. Can be e.g. \"relu\", \"lrelu\", \"tanh\", \"sigmoid\",\n            \"swish\", etc. See `activation_funcs` for a full list. `None` is not\n            allowed. Defaults to `linear`.\n        alpha (float | int): Shape parameter for the activation\n            function, or `None` to use the default. Defaults to None.\n        gain (float): Scaling factor for the output tensor, or `None`\n            to use default. See `activation_funcs` for the default scaling of\n            each activation function. If unsure, consider specifying 1.\n            Defaults to None.\n        clamp (float): Clamp the output values to `[-clamp, +clamp]`,\n            or `None` to disable the clamping (default). 
Defaults to None.\n\n    Returns:\n        torch.Tensor: Tensor of the same shape and datatype as `x`.\n    \"\"\"\n    # Parse arguments.\n    assert clamp is None or clamp >= 0\n    spec = activation_funcs[act]\n    alpha = float(alpha if alpha is not None else spec.def_alpha)\n    gain = float(gain if gain is not None else spec.def_gain)\n    clamp = float(clamp if clamp is not None else -1)\n\n    # Lookup from cache.\n    key = (dim, act, alpha, gain, clamp)\n    if key in _bias_act_cuda_cache:\n        return _bias_act_cuda_cache[key]\n\n    # Forward op.\n    class BiasActCuda(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, x, b):  # pylint: disable=arguments-differ\n            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(\n                1) == 1 else torch.contiguous_format\n            x = x.contiguous(memory_format=ctx.memory_format)\n            b = b.contiguous() if b is not None else _null_tensor.to(x.device)\n            y = x\n            if act != 'linear' or gain != 1 or clamp >= 0 or (\n                    b is not _null_tensor.to(x.device)):\n                y = ext_module.bias_act(x, b, _null_tensor.to(x.device),\n                                        _null_tensor.to(x.device),\n                                        _null_tensor.to(x.device), 0, dim,\n                                        spec.cuda_idx, alpha, gain, clamp)\n            ctx.save_for_backward(\n                x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to(\n                    x.device), b if 'x' in spec.ref or spec.has_2nd_grad else\n                _null_tensor.to(x.device),\n                y if 'y' in spec.ref else _null_tensor.to(x.device))\n            return y\n\n        @staticmethod\n        def backward(ctx, dy):  # pylint: disable=arguments-differ\n            dy = dy.contiguous(memory_format=ctx.memory_format)\n            x, b, y = ctx.saved_tensors\n            dx = None\n            db = 
None\n\n            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:\n                dx = dy\n                if act != 'linear' or gain != 1 or clamp >= 0:\n                    dx = BiasActCudaGrad.apply(dy, x, b, y)\n\n            if ctx.needs_input_grad[1]:\n                db = dx.sum([i for i in range(dx.ndim) if i != dim])\n\n            return dx, db\n\n    # Backward op.\n    class BiasActCudaGrad(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, dy, x, b, y):  # pylint: disable=arguments-differ\n            ctx.memory_format = torch.channels_last if dy.ndim > 2 and (\n                dy.stride(1) == 1) else torch.contiguous_format\n            dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1,\n                                     dim, spec.cuda_idx, alpha, gain, clamp)\n            ctx.save_for_backward(\n                dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b,\n                y)\n            return dx\n\n        @staticmethod\n        def backward(ctx, d_dx):  # pylint: disable=arguments-differ\n            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)\n            dy, x, b, y = ctx.saved_tensors\n            d_dy = None\n            d_x = None\n            d_b = None\n            d_y = None\n\n            if ctx.needs_input_grad[0]:\n                d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)\n\n            if spec.has_2nd_grad and (ctx.needs_input_grad[1]\n                                      or ctx.needs_input_grad[2]):\n                d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim,\n                                          spec.cuda_idx, alpha, gain, clamp)\n\n            if spec.has_2nd_grad and ctx.needs_input_grad[2]:\n                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])\n\n            return d_dy, d_x, d_b, d_y\n\n    # Add to cache.\n    _bias_act_cuda_cache[key] = BiasActCuda\n    return BiasActCuda\n\n\n_bias_act_musa_cache: Dict = 
dict()\n\n\ndef _bias_act_musa(dim: int = 1,\n                   act: str = 'linear',\n                   alpha: Optional[Union[float, int]] = None,\n                   gain: Optional[float] = None,\n                   clamp: Optional[float] = None):\n    \"\"\"\"Fast MUSA implementation of `bias_act()` using custom ops.\n\n    Args:\n        dim (int): The dimension in `x` corresponding to the elements of `b`.\n            The value of `dim` is ignored if `b` is not specified.\n            Defaults to 1.\n        act (str): Name of the activation function to evaluate, or `\"linear\"`\n            to disable. Can be e.g. \"relu\", \"lrelu\", \"tanh\", \"sigmoid\",\n            \"swish\", etc. See `activation_funcs_musa` for a full list. `None`\n            is not allowed. Defaults to `linear`.\n        alpha (float | int): Shape parameter for the activation\n            function, or `None` to use the default. Defaults to None.\n        gain (float): Scaling factor for the output tensor, or `None`\n            to use default. See `activation_funcs_musa` for the default scaling\n            of each activation function. If unsure, consider specifying 1.\n            Defaults to None.\n        clamp (float): Clamp the output values to `[-clamp, +clamp]`,\n            or `None` to disable the clamping (default). 
Defaults to None.\n\n    Returns:\n        torch.Tensor: Tensor of the same shape and datatype as `x`.\n    \"\"\"\n    # Parse arguments.\n    assert clamp is None or clamp >= 0\n    spec = activation_funcs_musa[act]\n    alpha = float(alpha if alpha is not None else spec.def_alpha)\n    gain = float(gain if gain is not None else spec.def_gain)\n    clamp = float(clamp if clamp is not None else -1)\n\n    # Lookup from cache.\n    key = (dim, act, alpha, gain, clamp)\n    if key in _bias_act_musa_cache:\n        return _bias_act_musa_cache[key]\n\n    # Forward op.\n    class BiasActMusa(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, x, b):  # pylint: disable=arguments-differ\n            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(\n                1) == 1 else torch.contiguous_format\n            x = x.contiguous(memory_format=ctx.memory_format)\n            b = b.contiguous() if b is not None else _null_tensor.to(x.device)\n            y = x\n            if act != 'linear' or gain != 1 or clamp >= 0 or (\n                    b is not _null_tensor.to(x.device)):\n                y = ext_module.bias_act(x, b, _null_tensor.to(x.device),\n                                        _null_tensor.to(x.device),\n                                        _null_tensor.to(x.device), 0, dim,\n                                        spec.musa_idx, alpha, gain, clamp)\n            ctx.save_for_backward(\n                x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to(\n                    x.device), b if 'x' in spec.ref or spec.has_2nd_grad else\n                _null_tensor.to(x.device),\n                y if 'y' in spec.ref else _null_tensor.to(x.device))\n            return y\n\n        @staticmethod\n        def backward(ctx, dy):  # pylint: disable=arguments-differ\n            dy = dy.contiguous(memory_format=ctx.memory_format)\n            x, b, y = ctx.saved_tensors\n            dx = None\n            db 
= None\n\n            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:\n                dx = dy\n                if act != 'linear' or gain != 1 or clamp >= 0:\n                    dx = BiasActMusaGrad.apply(dy, x, b, y)\n\n            if ctx.needs_input_grad[1]:\n                db = dx.sum([i for i in range(dx.ndim) if i != dim])\n\n            return dx, db\n\n    # Backward op.\n    class BiasActMusaGrad(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, dy, x, b, y):  # pylint: disable=arguments-differ\n            ctx.memory_format = torch.channels_last if dy.ndim > 2 and (\n                dy.stride(1) == 1) else torch.contiguous_format\n            dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1,\n                                     dim, spec.musa_idx, alpha, gain, clamp)\n            ctx.save_for_backward(\n                dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b,\n                y)\n            return dx\n\n        @staticmethod\n        def backward(ctx, d_dx):  # pylint: disable=arguments-differ\n            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)\n            dy, x, b, y = ctx.saved_tensors\n            d_dy = None\n            d_x = None\n            d_b = None\n            d_y = None\n\n            if ctx.needs_input_grad[0]:\n                d_dy = BiasActMusaGrad.apply(d_dx, x, b, y)\n\n            if spec.has_2nd_grad and (ctx.needs_input_grad[1]\n                                      or ctx.needs_input_grad[2]):\n                d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim,\n                                          spec.musa_idx, alpha, gain, clamp)\n\n            if spec.has_2nd_grad and ctx.needs_input_grad[2]:\n                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])\n\n            return d_dy, d_x, d_b, d_y\n\n    # Add to cache.\n    _bias_act_musa_cache[key] = BiasActMusa\n    return BiasActMusa\n"
  },
  {
    "path": "mmcv/ops/border_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n# modified from\n# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py\n\nfrom typing import Tuple\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['border_align_forward', 'border_align_backward'])\n\n\nclass BorderAlignFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, boxes, pool_size):\n        return g.op(\n            'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)\n\n    @staticmethod\n    def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,\n                pool_size: int) -> torch.Tensor:\n        ctx.pool_size = pool_size\n        ctx.input_shape = input.size()\n\n        assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]'\n        assert boxes.size(2) == 4, \\\n            'the last dimension of boxes must be (x1, y1, x2, y2)'\n        assert input.size(1) % 4 == 0, \\\n            'the channel for input feature must be divisible by factor 4'\n\n        # [B, C//4, H*W, 4]\n        output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)\n        output = input.new_zeros(output_shape)\n        # `argmax_idx` only used for backward\n        argmax_idx = input.new_zeros(output_shape).to(torch.int)\n\n        ext_module.border_align_forward(\n            input, boxes, output, argmax_idx, pool_size=ctx.pool_size)\n\n        ctx.save_for_backward(boxes, argmax_idx)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx,\n                 grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:\n        boxes, argmax_idx = ctx.saved_tensors\n        grad_input = grad_output.new_zeros(ctx.input_shape)\n        # complex head architecture may cause grad_output uncontiguous\n        grad_output = 
grad_output.contiguous()\n        ext_module.border_align_backward(\n            grad_output,\n            boxes,\n            argmax_idx,\n            grad_input,\n            pool_size=ctx.pool_size)\n        return grad_input, None, None\n\n\nborder_align = BorderAlignFunction.apply\n\n\nclass BorderAlign(nn.Module):\n    r\"\"\"Border align pooling layer.\n\n    Applies border_align over the input feature based on predicted bboxes.\n    The details were described in the paper\n    `BorderDet: Border Feature for Dense Object Detection\n    <https://arxiv.org/abs/2007.11056>`_.\n\n    For each border line (e.g. top, left, bottom or right) of each box,\n    border_align does the following:\n\n    1. uniformly samples ``pool_size`` +1 positions on this line, involving\n       the start and end points.\n    2. the corresponding features on these points are computed by bilinear\n       interpolation.\n    3. max pooling over all the ``pool_size`` +1 positions are used for\n       computing pooled feature.\n\n    Args:\n        pool_size (int): number of positions sampled over the boxes' borders\n            (e.g. top, bottom, left, right).\n    \"\"\"\n\n    def __init__(self, pool_size: int):\n        super().__init__()\n        self.pool_size = pool_size\n\n    def forward(self, input: torch.Tensor,\n                boxes: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),\n                [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom,\n                right features respectively.\n            boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).\n\n        Returns:\n            torch.Tensor: Pooled features with shape [N,C,H*W,4]. 
The order is\n            (top,left,bottom,right) for the last dimension.\n        \"\"\"\n        return border_align(input, boxes, self.pool_size)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(pool_size={self.pool_size})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/box_iou_quadri.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['box_iou_quadri'])\n\n\ndef box_iou_quadri(bboxes1: torch.Tensor,\n                   bboxes2: torch.Tensor,\n                   mode: str = 'iou',\n                   aligned: bool = False) -> torch.Tensor:\n    \"\"\"Return intersection-over-union (Jaccard index) of boxes.\n\n    Both sets of boxes are expected to be in\n    (x1, y1, ..., x4, y4) format.\n\n    If ``aligned`` is ``False``, then calculate the ious between each bbox\n    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of\n    bboxes1 and bboxes2.\n\n    Args:\n        bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8),\n            indicating (x1, y1, ..., x4, y4) for each row.\n        bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8),\n            indicating (x1, y1, ..., x4, y4) for each row.\n        mode (str): \"iou\" (intersection over union) or iof (intersection over\n            foreground).\n\n    Returns:\n        torch.Tensor: Return the ious between boxes. If ``aligned`` is\n        ``False``, the shape of ious is (N, M) else (N,).\n    \"\"\"\n    assert mode in ['iou', 'iof']\n    mode_dict = {'iou': 0, 'iof': 1}\n    mode_flag = mode_dict[mode]\n    rows = bboxes1.size(0)\n    cols = bboxes2.size(0)\n    if aligned:\n        ious = bboxes1.new_zeros(rows)\n    else:\n        ious = bboxes1.new_zeros(rows * cols)\n    bboxes1 = bboxes1.contiguous()\n    bboxes2 = bboxes2.contiguous()\n    ext_module.box_iou_quadri(\n        bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)\n    if not aligned:\n        ious = ious.view(rows, cols)\n    return ious\n"
  },
  {
    "path": "mmcv/ops/box_iou_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])\n\n\ndef box_iou_rotated(bboxes1: torch.Tensor,\n                    bboxes2: torch.Tensor,\n                    mode: str = 'iou',\n                    aligned: bool = False,\n                    clockwise: bool = True) -> torch.Tensor:\n    \"\"\"Return intersection-over-union (Jaccard index) of boxes.\n\n    Both sets of boxes are expected to be in\n    (x_center, y_center, width, height, angle) format.\n\n    If ``aligned`` is ``False``, then calculate the ious between each bbox\n    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of\n    bboxes1 and bboxes2.\n\n    .. note::\n        The operator assumes:\n\n        1) The positive direction along x axis is left -> right.\n\n        2) The positive direction along y axis is top -> down.\n\n        3) The w border is in parallel with x axis when angle = 0.\n\n        However, there are 2 opposite definitions of the positive angular\n        direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports\n        both definitions and uses CW by default.\n\n        Please set ``clockwise=False`` if you are using the CCW definition.\n\n        The coordinate system when ``clockwise`` is ``True`` (default)\n\n            .. code-block:: none\n\n                0-------------------> x (0 rad)\n                |  A-------------B\n                |  |             |\n                |  |     box     h\n                |  |   angle=0   |\n                |  D------w------C\n                v\n                y (pi/2 rad)\n\n            In such coordination system the rotation matrix is\n\n            .. 
math::\n                \\\\begin{pmatrix}\n                \\\\cos\\\\alpha & -\\\\sin\\\\alpha \\\\\\\\\n                \\\\sin\\\\alpha & \\\\cos\\\\alpha\n                \\\\end{pmatrix}\n\n            The coordinates of the corner point A can be calculated as:\n\n            .. math::\n                P_A=\n                \\\\begin{pmatrix} x_A \\\\\\\\ y_A\\\\end{pmatrix}\n                =\n                \\\\begin{pmatrix} x_{center} \\\\\\\\ y_{center}\\\\end{pmatrix} +\n                \\\\begin{pmatrix}\\\\cos\\\\alpha & -\\\\sin\\\\alpha \\\\\\\\\n                \\\\sin\\\\alpha & \\\\cos\\\\alpha\\\\end{pmatrix}\n                \\\\begin{pmatrix} -0.5w \\\\\\\\ -0.5h\\\\end{pmatrix} \\\\\\\\\n                =\n                \\\\begin{pmatrix} x_{center}-0.5w\\\\cos\\\\alpha+0.5h\\\\sin\\\\alpha\n                \\\\\\\\\n                y_{center}-0.5w\\\\sin\\\\alpha-0.5h\\\\cos\\\\alpha\\\\end{pmatrix}\n\n\n        The coordinate system when ``clockwise`` is ``False``\n\n            .. code-block:: none\n\n                0-------------------> x (0 rad)\n                |  A-------------B\n                |  |             |\n                |  |     box     h\n                |  |   angle=0   |\n                |  D------w------C\n                v\n                y (-pi/2 rad)\n\n            In such coordination system the rotation matrix is\n\n            .. math::\n                \\\\begin{pmatrix}\n                \\\\cos\\\\alpha & \\\\sin\\\\alpha \\\\\\\\\n                -\\\\sin\\\\alpha & \\\\cos\\\\alpha\n                \\\\end{pmatrix}\n\n            The coordinates of the corner point A can be calculated as:\n\n            .. 
math::\n                P_A=\n                \\\\begin{pmatrix} x_A \\\\\\\\ y_A\\\\end{pmatrix}\n                =\n                \\\\begin{pmatrix} x_{center} \\\\\\\\ y_{center}\\\\end{pmatrix} +\n                \\\\begin{pmatrix}\\\\cos\\\\alpha & \\\\sin\\\\alpha \\\\\\\\\n                -\\\\sin\\\\alpha & \\\\cos\\\\alpha\\\\end{pmatrix}\n                \\\\begin{pmatrix} -0.5w \\\\\\\\ -0.5h\\\\end{pmatrix} \\\\\\\\\n                =\n                \\\\begin{pmatrix} x_{center}-0.5w\\\\cos\\\\alpha-0.5h\\\\sin\\\\alpha\n                \\\\\\\\\n                y_{center}+0.5w\\\\sin\\\\alpha-0.5h\\\\cos\\\\alpha\\\\end{pmatrix}\n\n    Args:\n        boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),\n            indicating (x, y, w, h, theta) for each row. Note that theta is in\n            radian.\n        boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),\n            indicating (x, y, w, h, theta) for each row. Note that theta is in\n            radian.\n        mode (str): \"iou\" (intersection over union) or iof (intersection over\n            foreground).\n        clockwise (bool): flag indicating whether the positive angular\n            orientation is clockwise. default True.\n            `New in version 1.4.3.`\n\n    Returns:\n        torch.Tensor: Return the ious betweens boxes. 
If ``aligned`` is\n        ``False``, the shape of ious is (N, M) else (N,).\n    \"\"\"\n    assert mode in ['iou', 'iof']\n    mode_dict = {'iou': 0, 'iof': 1}\n    mode_flag = mode_dict[mode]\n    rows = bboxes1.size(0)\n    cols = bboxes2.size(0)\n    if aligned:\n        ious = bboxes1.new_zeros(rows)\n    else:\n        if bboxes1.device.type == 'mlu':\n            ious = bboxes1.new_zeros([rows, cols])\n        else:\n            ious = bboxes1.new_zeros(rows * cols)\n    if not clockwise:\n        flip_mat = bboxes1.new_ones(bboxes1.shape[-1])\n        flip_mat[-1] = -1\n        bboxes1 = bboxes1 * flip_mat\n        bboxes2 = bboxes2 * flip_mat\n    bboxes1 = bboxes1.contiguous()\n    bboxes2 = bboxes2.contiguous()\n    ext_module.box_iou_rotated(\n        bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)\n    if not aligned:\n        ious = ious.view(rows, cols)\n    return ious\n"
  },
  {
    "path": "mmcv/ops/carafe.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Tuple\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.model import normal_init, xavier_init\nfrom mmengine.registry import MODELS\nfrom torch import Tensor\nfrom torch.autograd import Function\nfrom torch.nn.modules.module import Module\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',\n    'carafe_backward'\n])\n\n\nclass CARAFENaiveFunction(Function):\n\n    @staticmethod\n    def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,\n                 group_size: int, scale_factor: int) -> Tensor:\n        return g.op(\n            'mmcv::MMCVCARAFENaive',\n            features,\n            masks,\n            kernel_size_i=kernel_size,\n            group_size_i=group_size,\n            scale_factor_f=scale_factor)\n\n    @staticmethod\n    def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,\n                group_size: int, scale_factor: int) -> Tensor:\n        assert scale_factor >= 1\n        assert masks.size(1) == kernel_size * kernel_size * group_size\n        assert masks.size(-1) == features.size(-1) * scale_factor\n        assert masks.size(-2) == features.size(-2) * scale_factor\n        assert features.size(1) % group_size == 0\n        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1\n        ctx.kernel_size = kernel_size\n        ctx.group_size = group_size\n        ctx.scale_factor = scale_factor\n        ctx.feature_size = features.size()\n        ctx.mask_size = masks.size()\n\n        n, c, h, w = features.size()\n        output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))\n        ext_module.carafe_naive_forward(\n            features,\n            masks,\n            output,\n            kernel_size=kernel_size,\n            group_size=group_size,\n            
scale_factor=scale_factor)\n\n        if features.requires_grad or masks.requires_grad or \\\n                torch.__version__ == 'parrots':\n            ctx.save_for_backward(features, masks)\n        return output\n\n    @staticmethod\n    def backward(\n            ctx,\n            grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:\n        assert grad_output.is_cuda or grad_output.is_musa\n\n        features, masks = ctx.saved_tensors\n        kernel_size = ctx.kernel_size\n        group_size = ctx.group_size\n        scale_factor = ctx.scale_factor\n\n        grad_input = torch.zeros_like(features)\n        grad_masks = torch.zeros_like(masks)\n        ext_module.carafe_naive_backward(\n            grad_output.contiguous(),\n            features,\n            masks,\n            grad_input,\n            grad_masks,\n            kernel_size=kernel_size,\n            group_size=group_size,\n            scale_factor=scale_factor)\n\n        return grad_input, grad_masks, None, None, None\n\n\ncarafe_naive = CARAFENaiveFunction.apply\n\n\nclass CARAFENaive(Module):\n\n    def __init__(self, kernel_size: int, group_size: int, scale_factor: int):\n        super().__init__()\n\n        assert isinstance(kernel_size, int) and isinstance(\n            group_size, int) and isinstance(scale_factor, int)\n        self.kernel_size = kernel_size\n        self.group_size = group_size\n        self.scale_factor = scale_factor\n\n    def forward(self, features: Tensor, masks: Tensor) -> Tensor:\n        return carafe_naive(features, masks, self.kernel_size, self.group_size,\n                            self.scale_factor)\n\n\nclass CARAFEFunction(Function):\n\n    @staticmethod\n    def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,\n                 group_size: int, scale_factor: int) -> Tensor:\n        return g.op(\n            'mmcv::MMCVCARAFE',\n            features,\n            masks,\n            kernel_size_i=kernel_size,\n            
group_size_i=group_size,\n            scale_factor_f=scale_factor)\n\n    @staticmethod\n    def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,\n                group_size: int, scale_factor: int) -> Tensor:\n        assert scale_factor >= 1\n        assert masks.size(1) == kernel_size * kernel_size * group_size\n        assert masks.size(-1) == features.size(-1) * scale_factor\n        assert masks.size(-2) == features.size(-2) * scale_factor\n        assert features.size(1) % group_size == 0\n        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1\n        ctx.kernel_size = kernel_size\n        ctx.group_size = group_size\n        ctx.scale_factor = scale_factor\n        ctx.feature_size = features.size()\n        ctx.mask_size = masks.size()\n\n        n, c, h, w = features.size()\n        output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))\n        routput = features.new_zeros(output.size(), requires_grad=False)\n        rfeatures = features.new_zeros(features.size(), requires_grad=False)\n        rmasks = masks.new_zeros(masks.size(), requires_grad=False)\n        ext_module.carafe_forward(\n            features,\n            masks,\n            rfeatures,\n            routput,\n            rmasks,\n            output,\n            kernel_size=kernel_size,\n            group_size=group_size,\n            scale_factor=scale_factor)\n\n        if features.requires_grad or masks.requires_grad or \\\n                torch.__version__ == 'parrots':\n            ctx.save_for_backward(features, masks, rfeatures)\n        return output\n\n    @staticmethod\n    def backward(\n            ctx,\n            grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:\n        features, masks, rfeatures = ctx.saved_tensors\n        kernel_size = ctx.kernel_size\n        group_size = ctx.group_size\n        scale_factor = ctx.scale_factor\n\n        rgrad_output = torch.zeros_like(grad_output, requires_grad=False)\n        
rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)\n        rgrad_input = torch.zeros_like(features, requires_grad=False)\n        rgrad_masks = torch.zeros_like(masks, requires_grad=False)\n        grad_input = torch.zeros_like(features, requires_grad=False)\n        grad_masks = torch.zeros_like(masks, requires_grad=False)\n        ext_module.carafe_backward(\n            grad_output.contiguous(),\n            rfeatures,\n            masks,\n            rgrad_output,\n            rgrad_input_hs,\n            rgrad_input,\n            rgrad_masks,\n            grad_input,\n            grad_masks,\n            kernel_size=kernel_size,\n            group_size=group_size,\n            scale_factor=scale_factor)\n        return grad_input, grad_masks, None, None, None\n\n\ncarafe = CARAFEFunction.apply\n\n\nclass CARAFE(Module):\n    \"\"\" CARAFE: Content-Aware ReAssembly of FEatures\n\n    Please refer to `CARAFE: Content-Aware ReAssembly of FEatures\n    <https://arxiv.org/abs/1905.02188>`_ for more details.\n\n    Args:\n        kernel_size (int): reassemble kernel size\n        group_size (int): reassemble group size\n        scale_factor (int): upsample ratio\n\n    Returns:\n        upsampled feature map\n    \"\"\"\n\n    def __init__(self, kernel_size: int, group_size: int, scale_factor: int):\n        super().__init__()\n\n        assert isinstance(kernel_size, int) and isinstance(\n            group_size, int) and isinstance(scale_factor, int)\n        self.kernel_size = kernel_size\n        self.group_size = group_size\n        self.scale_factor = scale_factor\n\n    def forward(self, features: Tensor, masks: Tensor) -> Tensor:\n        return carafe(features, masks, self.kernel_size, self.group_size,\n                      self.scale_factor)\n\n\n@MODELS.register_module(name='carafe')\nclass CARAFEPack(nn.Module):\n    \"\"\"A unified package of CARAFE upsampler that contains: 1) channel\n    compressor 2) content encoder 3) CARAFE 
op.\n\n    Official implementation of ICCV 2019 paper\n    `CARAFE: Content-Aware ReAssembly of FEatures\n    <https://arxiv.org/abs/1905.02188>`_.\n\n    Args:\n        channels (int): input feature channels\n        scale_factor (int): upsample ratio\n        up_kernel (int): kernel size of CARAFE op\n        up_group (int): group size of CARAFE op\n        encoder_kernel (int): kernel size of content encoder\n        encoder_dilation (int): dilation of content encoder\n        compressed_channels (int): output channels of channels compressor\n\n    Returns:\n        upsampled feature map\n    \"\"\"\n\n    def __init__(self,\n                 channels: int,\n                 scale_factor: int,\n                 up_kernel: int = 5,\n                 up_group: int = 1,\n                 encoder_kernel: int = 3,\n                 encoder_dilation: int = 1,\n                 compressed_channels: int = 64):\n        super().__init__()\n        self.channels = channels\n        self.scale_factor = scale_factor\n        self.up_kernel = up_kernel\n        self.up_group = up_group\n        self.encoder_kernel = encoder_kernel\n        self.encoder_dilation = encoder_dilation\n        self.compressed_channels = compressed_channels\n        self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,\n                                            1)\n        self.content_encoder = nn.Conv2d(\n            self.compressed_channels,\n            self.up_kernel * self.up_kernel * self.up_group *\n            self.scale_factor * self.scale_factor,\n            self.encoder_kernel,\n            padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),\n            dilation=self.encoder_dilation,\n            groups=1)\n        self.init_weights()\n\n    def init_weights(self):\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d):\n                xavier_init(m, distribution='uniform')\n        normal_init(self.content_encoder, 
std=0.001)\n\n    def kernel_normalizer(self, mask: Tensor) -> Tensor:\n        mask = F.pixel_shuffle(mask, self.scale_factor)\n        n, mask_c, h, w = mask.size()\n        # use float division explicitly,\n        # to avoid inconsistency while exporting to ONNX\n        mask_channel = int(mask_c / float(self.up_kernel**2))\n        mask = mask.view(n, mask_channel, -1, h, w)\n\n        mask = F.softmax(mask, dim=2, dtype=mask.dtype)\n        mask = mask.view(n, mask_c, h, w).contiguous()\n\n        return mask\n\n    def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:\n        x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)\n        return x\n\n    def forward(self, x: Tensor) -> Tensor:\n        compressed_x = self.channel_compressor(x)\n        mask = self.content_encoder(compressed_x)\n        mask = self.kernel_normalizer(mask)\n\n        x = self.feature_reassemble(x, mask)\n        return x\n"
  },
  {
    "path": "mmcv/ops/cc_attention.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.registry import MODELS\n\nfrom mmcv.cnn import Scale\n\n\ndef NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:\n    \"\"\"Returns a diagonal matrix of size [n, n].\n\n    The diagonal are all \"-inf\". This is for avoiding calculating the\n    overlapped element in the Criss-Cross twice.\n    \"\"\"\n    return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)\n\n\n@MODELS.register_module()\nclass CrissCrossAttention(nn.Module):\n    \"\"\"Criss-Cross Attention Module.\n\n    .. note::\n        Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch\n        to a pure PyTorch and equivalent implementation. For more\n        details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.\n\n        Speed comparison for one forward pass\n\n        - Input size: [2,512,97,97]\n        - Device: 1 NVIDIA GeForce RTX 2080 Ti\n\n        +-----------------------+---------------+------------+---------------+\n        |                       |PyTorch version|CUDA version|Relative speed |\n        +=======================+===============+============+===============+\n        |with torch.no_grad()   |0.00554402 s   |0.0299619 s |5.4x           |\n        +-----------------------+---------------+------------+---------------+\n        |no with torch.no_grad()|0.00562803 s   |0.0301349 s |5.4x           |\n        +-----------------------+---------------+------------+---------------+\n\n    Args:\n        in_channels (int): Channels of the input feature map.\n    \"\"\"\n\n    def __init__(self, in_channels: int) -> None:\n        super().__init__()\n        self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)\n        self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)\n        self.value_conv = nn.Conv2d(in_channels, in_channels, 1)\n        self.gamma = Scale(0.)\n        
self.in_channels = in_channels\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        \"\"\"Forward function of Criss-Cross Attention.\n\n        Args:\n            x (torch.Tensor): Input feature with the shape of\n                (batch_size, in_channels, height, width).\n\n        Returns:\n            torch.Tensor: Output of the layer, with the shape of\n            (batch_size, in_channels, height, width)\n        \"\"\"\n        B, C, H, W = x.size()\n        query = self.query_conv(x)\n        key = self.key_conv(x)\n        value = self.value_conv(x)\n        energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG(\n            H, query.device)\n        energy_H = energy_H.transpose(1, 2)\n        energy_W = torch.einsum('bchw,bchj->bhwj', query, key)\n        attn = F.softmax(\n            torch.cat([energy_H, energy_W], dim=-1), dim=-1)  # [B,H,W,(H+W)]\n        out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H])\n        out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:])\n\n        out = self.gamma(out) + x\n        out = out.contiguous()\n\n        return out\n\n    def __repr__(self) -> str:\n        s = self.__class__.__name__\n        s += f'(in_channels={self.in_channels})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/chamfer_distance.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Sequence, Tuple\n\nimport torch\nfrom torch import Tensor\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])\n\n\nclass ChamferDistanceFunction(Function):\n    \"\"\"This is an implementation of the 2D Chamfer Distance.\n\n    It has been used in the paper `Oriented RepPoints for Aerial Object\n    Detection (CVPR 2022)\n    <https://arxiv.org/abs/2105.11111>`_.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:\n        \"\"\"\n        Args:\n            xyz1 (Tensor): Point set with shape (B, N, 2).\n            xyz2 (Tensor): Point set with shape (B, M, 2).\n\n        Returns:\n            Sequence[Tensor]:\n\n                - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with\n                    shape (B, N).\n                - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with\n                    shape (B, M).\n                - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)\n                    with shape (B, N), which is used to compute the gradient.\n                - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)\n                    with shape (B, M), which is used to compute the gradient.\n        \"\"\"\n        batch_size, n, _ = xyz1.size()\n        _, m, _ = xyz2.size()\n        device = xyz1.device\n        xyz1 = xyz1.contiguous()\n        xyz2 = xyz2.contiguous()\n\n        dist1 = torch.zeros(batch_size, n).type(xyz1.dtype).to(device)\n        dist2 = torch.zeros(batch_size, m).type(xyz2.dtype).to(device)\n        idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)\n        idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)\n\n        ext_module.chamfer_distance_forward(xyz1, xyz2, 
dist1, dist2, idx1,\n                                            idx2)\n        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)\n        return dist1, dist2, idx1, idx2\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx,\n                 grad_dist1: Tensor,\n                 grad_dist2: Tensor,\n                 grad_idx1=None,\n                 grad_idx2=None) -> Tuple[Tensor, Tensor]:\n        \"\"\"\n\n        Args:\n            grad_dist1 (Tensor): Gradient of chamfer distance\n                (xyz1 to xyz2), with the same shape as dist1.\n            grad_dist2 (Tensor): Gradient of chamfer distance\n                (xyz2 to xyz1), with the same shape as dist2.\n\n        Returns:\n            Tuple[Tensor, Tensor]:\n\n            - grad_xyz1 (Tensor): Gradient of the point set xyz1, with \\\n                the same shape as xyz1.\n            - grad_xyz2 (Tensor): Gradient of the point set xyz2, with \\\n                the same shape as xyz2.\n        \"\"\"\n        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors\n        device = grad_dist1.device\n        grad_dist1 = grad_dist1.contiguous()\n        grad_dist2 = grad_dist2.contiguous()\n        grad_xyz1 = torch.zeros(xyz1.size()).type(xyz1.dtype).to(device)\n        grad_xyz2 = torch.zeros(xyz2.size()).type(xyz2.dtype).to(device)\n\n        ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,\n                                             grad_dist1, grad_dist2, grad_xyz1,\n                                             grad_xyz2)\n        return grad_xyz1, grad_xyz2\n\n\nchamfer_distance = ChamferDistanceFunction.apply\n"
  },
  {
    "path": "mmcv/ops/contour_expand.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Union\n\nimport numpy as np\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['contour_expand'])\n\n\ndef contour_expand(kernel_mask: Union[np.array, torch.Tensor],\n                   internal_kernel_label: Union[np.array, torch.Tensor],\n                   min_kernel_area: int, kernel_num: int) -> list:\n    \"\"\"Expand kernel contours so that foreground pixels are assigned into\n    instances.\n\n    Args:\n        kernel_mask (np.array or torch.Tensor): The instance kernel mask with\n            size hxw.\n        internal_kernel_label (np.array or torch.Tensor): The instance internal\n            kernel label with size hxw.\n        min_kernel_area (int): The minimum kernel area.\n        kernel_num (int): The instance kernel number.\n\n    Returns:\n        list: The instance index map with size hxw.\n    \"\"\"\n    assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))\n    assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))\n    assert isinstance(min_kernel_area, int)\n    assert isinstance(kernel_num, int)\n\n    if isinstance(kernel_mask, np.ndarray):\n        kernel_mask = torch.from_numpy(kernel_mask)\n    if isinstance(internal_kernel_label, np.ndarray):\n        internal_kernel_label = torch.from_numpy(internal_kernel_label)\n\n    if torch.__version__ == 'parrots':\n        if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:\n            label = []\n        else:\n            label = ext_module.contour_expand(\n                kernel_mask,\n                internal_kernel_label,\n                min_kernel_area=min_kernel_area,\n                kernel_num=kernel_num)\n            label = label.tolist()  # type: ignore\n    else:\n        label = ext_module.contour_expand(kernel_mask, internal_kernel_label,\n                                          min_kernel_area, kernel_num)\n    
return label\n"
  },
  {
    "path": "mmcv/ops/conv2d_gradfix.py",
    "content": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors retain all intellectual property\n# and proprietary rights in and to this software, related documentation\n# and any modifications thereto.  Any use, reproduction, disclosure or\n# distribution of this software and related documentation without an express\n# license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa\n\"\"\"Custom replacement for `torch.nn.functional.conv2d` that supports\narbitrarily high order gradients with zero performance penalty.\"\"\"\n\nimport contextlib\nimport warnings\nfrom typing import Dict, Optional, Tuple, Union\n\nimport torch\nfrom mmengine.device import is_musa_available\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch\n\nenabled = True\nweight_gradients_disabled = False\n\n\n@contextlib.contextmanager\ndef no_weight_gradients(disable=True):\n    global weight_gradients_disabled\n    old = weight_gradients_disabled\n    if disable:\n        weight_gradients_disabled = True\n    yield\n    weight_gradients_disabled = old\n\n\ndef conv2d(input: torch.Tensor,\n           weight: torch.Tensor,\n           bias: Optional[torch.Tensor] = None,\n           stride: Union[int, Tuple[int, ...]] = 1,\n           padding: Union[int, Tuple[int, ...]] = 0,\n           dilation: Union[int, Tuple[int, ...]] = 1,\n           groups: int = 1):\n    flag = True\n    if digit_version(torch.__version__) >= digit_version('1.10.0'):\n        warnings.warn('Since '\n                      'aten:cudnn_convolution_backward_weight is '\n                      f'not supported in torch=={torch.__version__},'\n                      ' rolling back to `torch.nn.functional.conv2d`')\n        flag = False\n    if _should_use_custom_op(input) and flag:\n        return 
_conv2d_gradfix(\n            transpose=False,\n            weight_shape=weight.shape,\n            stride=stride,\n            padding=padding,\n            output_padding=0,\n            dilation=dilation,\n            groups=groups).apply(input, weight, bias)\n    return torch.nn.functional.conv2d(\n        input=input,\n        weight=weight,\n        bias=bias,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        groups=groups)\n\n\ndef conv_transpose2d(input: torch.Tensor,\n                     weight: torch.Tensor,\n                     bias: Optional[torch.Tensor] = None,\n                     stride: Union[int, Tuple[int, ...]] = 1,\n                     padding: Union[int, Tuple[int, ...]] = 0,\n                     output_padding: Union[int, Tuple[int, ...]] = 0,\n                     groups: int = 1,\n                     dilation: Union[int, Tuple[int, ...]] = 1):\n    if _should_use_custom_op(input):\n        return _conv2d_gradfix(\n            transpose=True,\n            weight_shape=weight.shape,\n            stride=stride,\n            padding=padding,\n            output_padding=output_padding,\n            groups=groups,\n            dilation=dilation).apply(input, weight, bias)\n    return torch.nn.functional.conv_transpose2d(\n        input=input,\n        weight=weight,\n        bias=bias,\n        stride=stride,\n        padding=padding,\n        output_padding=output_padding,\n        groups=groups,\n        dilation=dilation)\n\n\ndef _should_use_custom_op(input):\n    assert isinstance(input, torch.Tensor)\n    if enabled and is_musa_available():\n        return True\n    if (not enabled) or (not torch.backends.cudnn.enabled):\n        return False\n    if input.device.type != 'cuda':\n        return False\n    return True\n\n\ndef _to_tuple(x, ndim):\n    xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim\n    assert len(xs) == ndim\n    assert all(isinstance(x, int) for x in xs)\n    
return xs\n\n\n_conv2d_gradfix_cache: Dict = dict()\n_null_tensor = torch.empty([0])\n\n\ndef _conv2d_gradfix(\n    transpose: bool,\n    weight_shape: Tuple[int, ...],\n    stride: Union[int, Tuple[int, ...]],\n    padding: Union[int, Tuple[int, ...]],\n    output_padding: Union[int, Tuple[int, ...]],\n    dilation: Union[int, Tuple[int, ...]],\n    groups: int,\n):\n    # Parse arguments.\n    ndim = 2\n    weight_shape = tuple(weight_shape)\n    stride = _to_tuple(stride, ndim)\n    padding = _to_tuple(padding, ndim)\n    output_padding = _to_tuple(output_padding, ndim)\n    dilation = _to_tuple(dilation, ndim)\n\n    # Lookup from cache.\n    key = (transpose, weight_shape, stride, padding, output_padding, dilation,\n           groups)\n    if key in _conv2d_gradfix_cache:\n        return _conv2d_gradfix_cache[key]\n\n    # Validate arguments.\n\n    assert groups >= 1\n    assert len(weight_shape) == ndim + 2\n    assert all(stride[i] >= 1 for i in range(ndim))  # type: ignore\n    assert all(padding[i] >= 0 for i in range(ndim))  # type: ignore\n    assert all(dilation[i] >= 0 for i in range(ndim))  # type: ignore\n    if not transpose:\n        assert all(output_padding[i] == 0 for i in range(ndim))  # type: ignore\n    else:  # transpose\n        for i in range(ndim):\n            assert 0 <= output_padding[i] < max(  # type: ignore\n                stride[i],  # type: ignore\n                dilation[i])  # type: ignore\n\n    # Helpers.\n    common_kwargs = dict(\n        stride=stride, padding=padding, dilation=dilation, groups=groups)\n\n    def calc_output_padding(input_shape, output_shape):\n        if transpose:\n            return [0, 0]\n        return [\n            input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] -\n            (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1)\n            for i in range(ndim)\n        ]\n\n    # Forward & backward.\n    class Conv2d(torch.autograd.Function):\n\n        @staticmethod\n      
  def forward(ctx, input, weight, bias):\n            assert weight.shape == weight_shape\n            ctx.save_for_backward(\n                input if weight.requires_grad else _null_tensor,\n                weight if input.requires_grad else _null_tensor,\n            )\n            ctx.input_shape = input.shape\n\n            # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).\n            if (not is_musa_available()\n                ) and weight_shape[2:] == stride == dilation == (\n                    1, 1) and padding == (\n                        0, 0) and torch.cuda.get_device_capability(\n                            input.device) < (8, 0):\n                a = weight.reshape(groups, weight_shape[0] // groups,\n                                   weight_shape[1])\n                b = input.reshape(input.shape[0], groups,\n                                  input.shape[1] // groups, -1)\n                c = (a.transpose(1, 2) if transpose else a) @ b.permute(\n                    1, 2, 0, 3).flatten(2)\n                c = c.reshape(-1, input.shape[0],\n                              *input.shape[2:]).transpose(0, 1)\n                c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(\n                    2).unsqueeze(3)\n                return c.contiguous(\n                    memory_format=(torch.channels_last if input.stride(1) ==\n                                   1 else torch.contiguous_format))\n\n            # General case => cuDNN.\n            if transpose:\n                return torch.nn.functional.conv_transpose2d(\n                    input=input,\n                    weight=weight,\n                    bias=bias,\n                    output_padding=output_padding,\n                    **common_kwargs)\n            return torch.nn.functional.conv2d(\n                input=input, weight=weight, bias=bias, **common_kwargs)\n\n        @staticmethod\n        def backward(ctx, grad_output):\n            input, weight = 
ctx.saved_tensors\n            input_shape = ctx.input_shape\n            grad_input = None\n            grad_weight = None\n            grad_bias = None\n\n            if ctx.needs_input_grad[0]:\n                p = calc_output_padding(\n                    input_shape=input_shape, output_shape=grad_output.shape)\n                op = _conv2d_gradfix(\n                    transpose=(not transpose),\n                    weight_shape=weight_shape,\n                    output_padding=p,\n                    **common_kwargs)\n                grad_input = op.apply(grad_output, weight, None)\n                assert grad_input.shape == input_shape\n\n            if ctx.needs_input_grad[1] and not weight_gradients_disabled:\n                grad_weight = Conv2dGradWeight.apply(grad_output, input)\n                assert grad_weight.shape == weight_shape\n\n            if ctx.needs_input_grad[2]:\n                grad_bias = grad_output.sum([0, 2, 3])\n\n            return grad_input, grad_weight, grad_bias\n\n    # Gradient with respect to the weights.\n    class Conv2dGradWeight(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, grad_output, input):\n            ctx.save_for_backward(\n                grad_output if input.requires_grad else _null_tensor,\n                input if grad_output.requires_grad else _null_tensor,\n            )\n            ctx.grad_output_shape = grad_output.shape\n            ctx.input_shape = input.shape\n\n            # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).\n            if weight_shape[2:] == stride == dilation == (\n                    1, 1) and padding == (0, 0):\n                a = grad_output.reshape(grad_output.shape[0], groups,\n                                        grad_output.shape[1] // groups,\n                                        -1).permute(1, 2, 0, 3).flatten(2)\n                b = input.reshape(input.shape[0], groups,\n                                  input.shape[1] // 
groups,\n                                  -1).permute(1, 2, 0, 3).flatten(2)\n                c = (b @ a.transpose(1, 2) if transpose else\n                     a @ b.transpose(1, 2)).reshape(weight_shape)\n                return c.contiguous(\n                    memory_format=(torch.channels_last if input.stride(1) ==\n                                   1 else torch.contiguous_format))\n\n            # PyTorch consolidated convolution backward API in PR:\n            # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122  # noqa: E501\n            # Enhance the code referring to the discussion:\n            # https://github.com/pytorch/pytorch/issues/74437\n            if digit_version(torch.__version__) >= digit_version('1.11.0'):\n                empty_weight = torch.tensor(\n                    0.0, dtype=input.dtype,\n                    device=input.device).expand(weight_shape)\n                output_padding = calc_output_padding(input.shape,\n                                                     grad_output.shape)\n                return torch.ops.aten.convolution_backward(\n                    grad_output,\n                    input,\n                    empty_weight,\n                    None,\n                    stride=stride,\n                    dilation=dilation,\n                    transposed=transpose,\n                    padding=padding,\n                    groups=groups,\n                    output_padding=output_padding,\n                    output_mask=[0, 1, 0])[1]\n            else:\n                if is_rocm_pytorch():\n                    name = 'aten::miopen_convolution_transpose_backward_weight'\n                    if not transpose:\n                        name = 'aten::miopen_convolution_backward_weight'\n                    flags = [\n                        torch.backends.cudnn.benchmark,\n                        torch.backends.cudnn.deterministic\n                    ]\n                else:\n      
              # General case => cuDNN.\n                    name = ('aten::cudnn_convolution_transpose_backward_weight'\n                            if transpose else\n                            'aten::cudnn_convolution_backward_weight')\n                    flags = [\n                        torch.backends.cudnn.benchmark,\n                        torch.backends.cudnn.deterministic,\n                        torch.backends.cudnn.allow_tf32\n                    ]\n                return torch._C._jit_get_operation(name)(weight_shape,\n                                                         grad_output, input,\n                                                         padding, stride,\n                                                         dilation, groups,\n                                                         *flags)\n\n        @staticmethod\n        def backward(ctx, grad2_grad_weight):\n            grad_output, input = ctx.saved_tensors\n            grad_output_shape = ctx.grad_output_shape\n            input_shape = ctx.input_shape\n            grad2_grad_output = None\n            grad2_input = None\n\n            if ctx.needs_input_grad[0]:\n                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight,\n                                                 None)\n                assert grad2_grad_output.shape == grad_output_shape\n\n            if ctx.needs_input_grad[1]:\n                p = calc_output_padding(\n                    input_shape=input_shape, output_shape=grad_output_shape)\n                op = _conv2d_gradfix(\n                    transpose=(not transpose),\n                    weight_shape=weight_shape,\n                    output_padding=p,\n                    **common_kwargs)\n                grad2_input = op.apply(grad_output, grad2_grad_weight, None)\n                assert grad2_input.shape == input_shape\n\n            return grad2_grad_output, grad2_input\n\n    _conv2d_gradfix_cache[key] = Conv2d\n    return Conv2d\n"
  },
  {
    "path": "mmcv/ops/convex_iou.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Tuple\n\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])\n\n\ndef convex_giou(pointsets: torch.Tensor,\n                polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Return generalized intersection-over-union (Jaccard index) between point\n    sets and polygons.\n\n    Args:\n        pointsets (torch.Tensor): It has shape (N, 18),\n            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.\n        polygons (torch.Tensor): It has shape (N, 8),\n            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.\n\n    Returns:\n        tuple[torch.Tensor, torch.Tensor]: The first element is the gious\n        between point sets and polygons with the shape (N,). The second\n        element is the gradient of point sets with the shape (N, 18).\n    \"\"\"\n    output = pointsets.new_zeros((pointsets.size(0), 19))\n    ext_module.convex_giou(pointsets, polygons, output)\n    convex_giou = output[:, -1]\n    points_grad = output[:, 0:-1]\n    return convex_giou, points_grad\n\n\ndef convex_iou(pointsets: torch.Tensor,\n               polygons: torch.Tensor) -> torch.Tensor:\n    \"\"\"Return intersection-over-union (Jaccard index) between point sets and\n    polygons.\n\n    Args:\n        pointsets (torch.Tensor): It has shape (N, 18),\n            indicating (x1, y1, x2, y2, ..., x9, y9) for each row.\n        polygons (torch.Tensor): It has shape (K, 8),\n            indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.\n\n    Returns:\n        torch.Tensor: Return the ious between point sets and polygons with the\n        shape (N, K).\n    \"\"\"\n    N, K = pointsets.size(0), polygons.size(0)\n    ious = pointsets.new_zeros((N, K))\n    ext_module.convex_iou(pointsets, polygons, ious)\n    return ious\n"
  },
  {
    "path": "mmcv/ops/corner_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nfrom mmengine.utils import digit_version\nfrom torch import Tensor, nn\n\n_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}\n\n\ndef _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:\n    size = x.size(dim)\n    output = x.clone()\n\n    ind = 1\n    while ind < size:\n        if flip:\n            cur_start = 0\n            cur_len = size - ind\n            next_start = ind\n            next_len = size - ind\n        else:\n            cur_start = ind\n            cur_len = size - ind\n            next_start = 0\n            next_len = size - ind\n\n        # max_temp should be cloned for backward computation\n        max_temp = output.narrow(dim, cur_start, cur_len).clone()\n        cur_temp = output.narrow(dim, cur_start, cur_len)\n        next_temp = output.narrow(dim, next_start, next_len)\n\n        cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp)\n\n        ind = ind << 1\n\n    return output\n\n\nclass CornerPool(nn.Module):\n    \"\"\"Corner Pooling.\n\n    Corner Pooling is a new type of pooling layer that helps a\n    convolutional network better localize corners of bounding boxes.\n\n    Please refer to `CornerNet: Detecting Objects as Paired Keypoints\n    <https://arxiv.org/abs/1808.01244>`_ for more details.\n\n    Code is modified from https://github.com/princeton-vl/CornerNet-Lite.\n\n    Args:\n        mode (str): Pooling orientation for the pooling layer\n\n            - 'bottom': Bottom Pooling\n            - 'left': Left Pooling\n            - 'right': Right Pooling\n            - 'top': Top Pooling\n\n    Returns:\n        Feature map after pooling.\n    \"\"\"\n\n    cummax_dim_flip = {\n        'bottom': (2, False),\n        'left': (3, True),\n        'right': (3, False),\n        'top': (2, True),\n    }\n\n    def __init__(self, mode: str):\n        super().__init__()\n        assert mode in self.cummax_dim_flip\n        
self.mode = mode\n\n    def forward(self, x: Tensor) -> Tensor:\n        if (torch.__version__ != 'parrots' and\n                digit_version(torch.__version__) >= digit_version('1.5.0')):\n            dim, flip = self.cummax_dim_flip[self.mode]\n            if flip:\n                x = x.flip(dim)\n            pool_tensor, _ = torch.cummax(x, dim=dim)\n            if flip:\n                pool_tensor = pool_tensor.flip(dim)\n            return pool_tensor\n        else:\n            dim, flip = self.cummax_dim_flip[self.mode]\n            return _corner_pool(x, dim, flip)\n"
  },
  {
    "path": "mmcv/ops/correlation.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Tuple\n\nimport torch\nfrom torch import Tensor, nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['correlation_forward', 'correlation_backward'])\n\n\nclass CorrelationFunction(Function):\n\n    @staticmethod\n    def forward(ctx,\n                input1: Tensor,\n                input2: Tensor,\n                kernel_size: int = 1,\n                max_displacement: int = 1,\n                stride: int = 1,\n                padding: int = 1,\n                dilation: int = 1,\n                dilation_patch: int = 1) -> Tensor:\n\n        ctx.save_for_backward(input1, input2)\n\n        kH, kW = ctx.kernel_size = _pair(kernel_size)\n        patch_size = max_displacement * 2 + 1\n        ctx.patch_size = patch_size\n        dH, dW = ctx.stride = _pair(stride)\n        padH, padW = ctx.padding = _pair(padding)\n        dilationH, dilationW = ctx.dilation = _pair(dilation)\n        dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(\n            dilation_patch)\n\n        output_size = CorrelationFunction._output_size(ctx, input1)\n\n        output = input1.new_zeros(output_size)\n\n        ext_module.correlation_forward(\n            input1,\n            input2,\n            output,\n            kH=kH,\n            kW=kW,\n            patchH=patch_size,\n            patchW=patch_size,\n            padH=padH,\n            padW=padW,\n            dilationH=dilationH,\n            dilationW=dilationW,\n            dilation_patchH=dilation_patchH,\n            dilation_patchW=dilation_patchW,\n            dH=dH,\n            dW=dW)\n\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(\n        ctx, grad_output: Tensor\n    ) -> Tuple[Tensor, Tensor, None, None, 
None, None, None, None]:\n        input1, input2 = ctx.saved_tensors\n\n        kH, kW = ctx.kernel_size\n        patch_size = ctx.patch_size\n        padH, padW = ctx.padding\n        dilationH, dilationW = ctx.dilation\n        dilation_patchH, dilation_patchW = ctx.dilation_patch\n        dH, dW = ctx.stride\n        grad_input1 = torch.zeros_like(input1)\n        grad_input2 = torch.zeros_like(input2)\n\n        ext_module.correlation_backward(\n            grad_output,\n            input1,\n            input2,\n            grad_input1,\n            grad_input2,\n            kH=kH,\n            kW=kW,\n            patchH=patch_size,\n            patchW=patch_size,\n            padH=padH,\n            padW=padW,\n            dilationH=dilationH,\n            dilationW=dilationW,\n            dilation_patchH=dilation_patchH,\n            dilation_patchW=dilation_patchW,\n            dH=dH,\n            dW=dW)\n        return grad_input1, grad_input2, None, None, None, None, None, None\n\n    @staticmethod\n    def _output_size(ctx, input1):\n        iH, iW = input1.size(2), input1.size(3)\n        batch_size = input1.size(0)\n        kH, kW = ctx.kernel_size\n        patch_size = ctx.patch_size\n        dH, dW = ctx.stride\n        padH, padW = ctx.padding\n        dilationH, dilationW = ctx.dilation\n        dilatedKH = (kH - 1) * dilationH + 1\n        dilatedKW = (kW - 1) * dilationW + 1\n\n        oH = int((iH + 2 * padH - dilatedKH) / dH + 1)\n        oW = int((iW + 2 * padW - dilatedKW) / dW + 1)\n\n        output_size = (batch_size, patch_size, patch_size, oH, oW)\n        return output_size\n\n\nclass Correlation(nn.Module):\n    r\"\"\"Correlation operator.\n\n    This correlation operator works for optical flow correlation computation.\n\n    There are two batched tensors with shape :math:`(N, C, H, W)`,\n    and the correlation output's shape is :math:`(N, max\\_displacement \\times\n    2 + 1, max\\_displacement * 2 + 1, H_{out}, W_{out})`\n\n    
where\n\n    .. math::\n        H_{out} = \\left\\lfloor\\frac{H_{in}  + 2 \\times padding -\n            dilation \\times (kernel\\_size - 1) - 1}\n            {stride} + 1\\right\\rfloor\n\n    .. math::\n        W_{out} = \\left\\lfloor\\frac{W_{in}  + 2 \\times padding - dilation\n            \\times (kernel\\_size - 1) - 1}\n            {stride} + 1\\right\\rfloor\n\n    the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding\n    window convolution between input1 and shifted input2,\n\n    .. math::\n        Corr(N_i, dy, dx) =\n        \\sum_{c=0}^{C-1}\n        input1(N_i, c) \\star\n        \\mathcal{S}(input2(N_i, c), dy, dx)\n\n    where :math:`\\star` is the valid 2d sliding window convolution operator,\n    and :math:`\\mathcal{S}` means shifting the input features (padding the\n    margins with zeros), and :math:`dx, dy` are the shifting distances,\n    :math:`dx, dy \\in [-max\\_displacement \\times dilation\\_patch,\n    max\\_displacement \\times dilation\\_patch]`.\n\n    Args:\n        kernel_size (int): The size of sliding window i.e. local neighborhood\n            representing the center points and involved in correlation\n            computation. Defaults to 1.\n        max_displacement (int): The radius for computing correlation volume,\n            but the actual working space can be dilated by dilation_patch.\n            Defaults to 1.\n        stride (int): The stride of the sliding blocks in the input spatial\n            dimensions. Defaults to 1.\n        padding (int): Zero padding added to all four sides of input1.\n            Defaults to 0.\n        dilation (int): The spacing of the local neighborhood that will be\n            involved in correlation. Defaults to 1.\n        dilation_patch (int): The spacing between the positions for which\n            correlation is computed.  
Defaults to 1.\n    \"\"\"\n\n    def __init__(self,\n                 kernel_size: int = 1,\n                 max_displacement: int = 1,\n                 stride: int = 1,\n                 padding: int = 0,\n                 dilation: int = 1,\n                 dilation_patch: int = 1) -> None:\n        super().__init__()\n        self.kernel_size = kernel_size\n        self.max_displacement = max_displacement\n        self.stride = stride\n        self.padding = padding\n        self.dilation = dilation\n        self.dilation_patch = dilation_patch\n\n    def forward(self, input1: Tensor, input2: Tensor) -> Tensor:\n        return CorrelationFunction.apply(input1, input2, self.kernel_size,\n                                         self.max_displacement, self.stride,\n                                         self.padding, self.dilation,\n                                         self.dilation_patch)\n\n    def __repr__(self) -> str:\n        s = self.__class__.__name__\n        s += f'(kernel_size={self.kernel_size}, '\n        s += f'max_displacement={self.max_displacement}, '\n        s += f'stride={self.stride}, '\n        s += f'padding={self.padding}, '\n        s += f'dilation={self.dilation}, '\n        s += f'dilation_patch={self.dilation_patch})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/csrc/README.md",
    "content": "# Code Structure of CUDA operators\n\nThis folder contains all non-python code for MMCV custom ops. Please follow the same architecture if you want to add new ops.\n\n## Directories Tree\n\n```folder\n.\n├── common\n│   ├── box_iou_rotated_utils.hpp\n│   ├── parrots_cpp_helper.hpp\n│   ├── parrots_cuda_helper.hpp\n│   ├── pytorch_cpp_helper.hpp\n│   ├── pytorch_cuda_helper.hpp\n│   ├── pytorch_device_registry.hpp\n│   ├── cuda\n│   │   ├── common_cuda_helper.hpp\n│   │   ├── parrots_cudawarpfunction.cuh\n│   │   ├── ...\n│   │   └── ops_cuda_kernel.cuh\n|   ├── mps\n│   │   ├── MPSLibrary.h\n│   │   ├── ...\n│   │   └── MPSUtils.h\n|   ├── mlu\n│   │   └── ...\n|   └── utils\n│   │   └── ...\n├── parrots\n│   ├── ...\n│   ├── ops.cpp\n│   ├── ops_parrots.cpp\n│   └── ops_pytorch.h\n└── pytorch\n    ├── info.cpp\n    ├── pybind.cpp\n    ├── ...\n    ├── ops.cpp\n    ├── cuda\n    │   ├── ...\n    │   └── ops_cuda.cu\n    ├── cpu\n    │   ├── ...\n    │   └── ops.cpp\n    ├── mps\n    │   ├── ...\n    |   └── op_mps.mm\n    └── mlu\n        ├── ...\n        └── op_mlu.cpp\n```\n\n## Components\n\n- `common`: This directory contains all tools and shared codes.\n  - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax.\n  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.\n  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device.\n  - `utils`: The kernels and utils of spconv.\n- `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory.\n- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.\n  - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. 
The launchers provide the C++ interface to the CUDA implementations of the corresponding custom ops.\n  - `cpu`: This directory contains the CPU implementations of the corresponding custom ops.\n  - `mlu`: This directory contains the launchers for the MLU kernels.\n  - `mps`: MPS ops implementation and launchers.\n\n## How to add new PyTorch ops?\n\n1. (Optional) Add a shared kernel in `common` to support a special hardware platform.\n\n   ```c++\n   // src/common/cuda/new_ops_cuda_kernel.cuh\n\n   template <typename T>\n   __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {\n       // forward here\n   }\n\n   ```\n\n   Add the cuda kernel launcher in `pytorch/cuda`.\n\n   ```c++\n   // src/pytorch/cuda\n   #include <new_ops_cuda_kernel.cuh>\n\n   void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){\n       // initialize\n       at::cuda::CUDAGuard device_guard(input.device());\n       cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n       ...\n       AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n           input.scalar_type(), \"new_ops_forward_cuda_kernel\", ([&] {\n               new_ops_forward_cuda_kernel<scalar_t>\n                   <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                       input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);\n           }));\n       AT_CUDA_CHECK(cudaGetLastError());\n   }\n   ```\n\n2. Register the implementation for different devices.\n\n   ```c++\n   // src/pytorch/cuda/cudabind.cpp\n   ...\n\n   Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){\n       // implement cuda forward here\n       // use `NewOpsForwardCUDAKernelLauncher` here\n   }\n   // declare interface here.\n   Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);\n   // register the implementation for given device (CUDA here).\n   REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);\n   ```\n\n3. Add the ops implementation in the `pytorch` directory. 
Select the implementation according to the device type.\n\n   ```c++\n   // src/pytorch/new_ops.cpp\n   Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){\n       // dispatch the implementation according to the device type of input.\n       DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);\n   }\n   ...\n\n   Tensor new_ops_forward(Tensor input, Tensor output, ...){\n       return new_ops_forward_impl(input, output, ...);\n   }\n   ```\n\n4. Bind the implementation in `pytorch/pybind.cpp`.\n\n   ```c++\n   // src/pytorch/pybind.cpp\n\n   ...\n\n   Tensor new_ops_forward(Tensor input, Tensor output, ...);\n\n   ...\n\n   // bind with pybind11\n   m.def(\"new_ops_forward\", &new_ops_forward, \"new_ops_forward\",\n           py::arg(\"input\"), py::arg(\"output\"), ...);\n\n   ...\n\n   ```\n\n5. Build MMCV again and enjoy the new ops in Python.\n\n   ```python\n   from ..utils import ext_loader\n   ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])\n\n   ...\n\n   ext_module.new_ops_forward(input, output, ...)\n\n   ```\n"
  },
  {
    "path": "mmcv/ops/csrc/common/box_iou_rotated_utils.hpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h\n#pragma once\n#include <cassert>\n#include <cmath>\n\n#if defined(__CUDACC__) || defined(__MUSACC__)\n// Designates functions callable from the host (CPU) and the device (GPU)\n#define HOST_DEVICE __host__ __device__\n#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__\n#else\n#include <algorithm>\n#define HOST_DEVICE\n#define HOST_DEVICE_INLINE HOST_DEVICE inline\n#endif\n\nnamespace {\n\ntemplate <typename T>\nstruct RotatedBox {\n  T x_ctr, y_ctr, w, h, a;\n};\n\ntemplate <typename T>\nstruct Point {\n  T x, y;\n  HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}\n  HOST_DEVICE_INLINE Point operator+(const Point& p) const {\n    return Point(x + p.x, y + p.y);\n  }\n  HOST_DEVICE_INLINE Point& operator+=(const Point& p) {\n    x += p.x;\n    y += p.y;\n    return *this;\n  }\n  HOST_DEVICE_INLINE Point operator-(const Point& p) const {\n    return Point(x - p.x, y - p.y);\n  }\n  HOST_DEVICE_INLINE Point operator*(const T coeff) const {\n    return Point(x * coeff, y * coeff);\n  }\n};\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {\n  return A.x * B.x + A.y * B.y;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T cross_2d(const Point<T>& A, const Point<T>& B) {\n  return A.x * B.y - B.x * A.y;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T>& box,\n                                             Point<T> (&pts)[4]) {\n  // M_PI / 180. 
== 0.01745329251\n  // double theta = box.a * 0.01745329251;\n  // MODIFIED\n  double theta = box.a;\n  T cosTheta2 = (T)cos(theta) * 0.5f;\n  T sinTheta2 = (T)sin(theta) * 0.5f;\n\n  // y: top --> down; x: left --> right\n  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;\n  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;\n  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;\n  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;\n  pts[2].x = 2 * box.x_ctr - pts[0].x;\n  pts[2].y = 2 * box.y_ctr - pts[0].y;\n  pts[3].x = 2 * box.x_ctr - pts[1].x;\n  pts[3].y = 2 * box.y_ctr - pts[1].y;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],\n                                               const Point<T> (&pts2)[4],\n                                               Point<T> (&intersections)[24]) {\n  // Line vector\n  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]\n  Point<T> vec1[4], vec2[4];\n  for (int i = 0; i < 4; i++) {\n    vec1[i] = pts1[(i + 1) % 4] - pts1[i];\n    vec2[i] = pts2[(i + 1) % 4] - pts2[i];\n  }\n\n  // Line test - test all line combos for intersection\n  int num = 0;  // number of intersections\n  for (int i = 0; i < 4; i++) {\n    for (int j = 0; j < 4; j++) {\n      // Solve for 2x2 Ax=b\n      T det = cross_2d<T>(vec2[j], vec1[i]);\n\n      // This takes care of parallel lines\n      if (fabs(det) <= 1e-14) {\n        continue;\n      }\n\n      auto vec12 = pts2[j] - pts1[i];\n\n      T t1 = cross_2d<T>(vec2[j], vec12) / det;\n      T t2 = cross_2d<T>(vec1[i], vec12) / det;\n\n      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {\n        intersections[num++] = pts1[i] + vec1[i] * t1;\n      }\n    }\n  }\n\n  // Check for vertices of rect1 inside rect2\n  {\n    const auto& AB = vec2[0];\n    const auto& DA = vec2[3];\n    auto ABdotAB = dot_2d<T>(AB, AB);\n    auto ADdotAD = dot_2d<T>(DA, DA);\n    for (int i = 0; i < 4; i++) {\n 
     // assume ABCD is the rectangle, and P is the point to be judged\n      // P is inside ABCD iff. P's projection on AB lies within AB\n      // and P's projection on AD lies within AD\n\n      auto AP = pts1[i] - pts2[0];\n\n      auto APdotAB = dot_2d<T>(AP, AB);\n      auto APdotAD = -dot_2d<T>(AP, DA);\n\n      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&\n          (APdotAD <= ADdotAD)) {\n        intersections[num++] = pts1[i];\n      }\n    }\n  }\n\n  // Reverse the check - check for vertices of rect2 inside rect1\n  {\n    const auto& AB = vec1[0];\n    const auto& DA = vec1[3];\n    auto ABdotAB = dot_2d<T>(AB, AB);\n    auto ADdotAD = dot_2d<T>(DA, DA);\n    for (int i = 0; i < 4; i++) {\n      auto AP = pts2[i] - pts1[0];\n\n      auto APdotAB = dot_2d<T>(AP, AB);\n      auto APdotAD = -dot_2d<T>(AP, DA);\n\n      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&\n          (APdotAD <= ADdotAD)) {\n        intersections[num++] = pts2[i];\n      }\n    }\n  }\n\n  return num;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],\n                                          const int& num_in, Point<T> (&q)[24],\n                                          bool shift_to_zero = false) {\n  assert(num_in >= 2);\n\n  // Step 1:\n  // Find point with minimum y\n  // if more than 1 points have the same minimum y,\n  // pick the one with the minimum x.\n  int t = 0;\n  for (int i = 1; i < num_in; i++) {\n    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {\n      t = i;\n    }\n  }\n  auto& start = p[t];  // starting point\n\n  // Step 2:\n  // Subtract starting point from every points (for sorting in the next step)\n  for (int i = 0; i < num_in; i++) {\n    q[i] = p[i] - start;\n  }\n\n  // Swap the starting point to position 0\n  auto tmp = q[0];\n  q[0] = q[t];\n  q[t] = tmp;\n\n  // Step 3:\n  // Sort point 1 ~ num_in according to their relative cross-product values\n  
// (essentially sorting according to angles)\n  // If the angles are the same, sort according to their distance to origin\n  T dist[24];\n  for (int i = 0; i < num_in; i++) {\n    dist[i] = dot_2d<T>(q[i], q[i]);\n  }\n\n#if defined(__CUDACC__) || defined(__MUSACC__)\n  // CUDA version\n  // In the future, we can potentially use thrust\n  // for sorting here to improve speed (though not guaranteed)\n  for (int i = 1; i < num_in - 1; i++) {\n    for (int j = i + 1; j < num_in; j++) {\n      T crossProduct = cross_2d<T>(q[i], q[j]);\n      if ((crossProduct < -1e-6) ||\n          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {\n        auto q_tmp = q[i];\n        q[i] = q[j];\n        q[j] = q_tmp;\n        auto dist_tmp = dist[i];\n        dist[i] = dist[j];\n        dist[j] = dist_tmp;\n      }\n    }\n  }\n#else\n  // CPU version\n  std::sort(q + 1, q + num_in,\n            [](const Point<T>& A, const Point<T>& B) -> bool {\n              T temp = cross_2d<T>(A, B);\n              if (fabs(temp) < 1e-6) {\n                return dot_2d<T>(A, A) < dot_2d<T>(B, B);\n              } else {\n                return temp > 0;\n              }\n            });\n  // compute distance to origin after sort, since the points are now different.\n  for (int i = 0; i < num_in; i++) {\n    dist[i] = dot_2d<T>(q[i], q[i]);\n  }\n#endif\n\n  // Step 4:\n  // Make sure there are at least 2 points (that don't overlap with each other)\n  // in the stack\n  int k;  // index of the non-overlapped second point\n  for (k = 1; k < num_in; k++) {\n    if (dist[k] > 1e-8) {\n      break;\n    }\n  }\n  if (k == num_in) {\n    // We reach the end, which means the convex hull is just one point\n    q[0] = p[t];\n    return 1;\n  }\n  q[1] = q[k];\n  int m = 2;  // 2 points in the stack\n  // Step 5:\n  // Finally we can start the scanning process.\n  // When a non-convex relationship between the 3 points is found\n  // (either concave shape or duplicated points),\n  // we pop the previous 
point from the stack\n  // until the 3-point relationship is convex again, or\n  // until the stack only contains two points\n  for (int i = k + 1; i < num_in; i++) {\n    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {\n      m--;\n    }\n    q[m++] = q[i];\n  }\n\n  // Step 6 (Optional):\n  // In general sense we need the original coordinates, so we\n  // need to shift the points back (reverting Step 2)\n  // But if we're only interested in getting the area/perimeter of the shape\n  // We can simply return.\n  if (!shift_to_zero) {\n    for (int i = 0; i < m; i++) {\n      q[i] += start;\n    }\n  }\n\n  return m;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) {\n  T area = 0;\n#pragma unroll\n  for (int i = 1; i < 3; i++) {\n    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));\n  }\n\n  return area / 2.0;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {\n  if (m <= 2) {\n    return 0;\n  }\n\n  T area = 0;\n  for (int i = 1; i < m - 1; i++) {\n    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));\n  }\n\n  return area / 2.0;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,\n                                                const RotatedBox<T>& box2) {\n  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned\n  // from rotated_rect_intersection_pts\n  Point<T> intersectPts[24], orderedPts[24];\n\n  Point<T> pts1[4];\n  Point<T> pts2[4];\n  get_rotated_vertices<T>(box1, pts1);\n  get_rotated_vertices<T>(box2, pts2);\n\n  int num = get_intersection_points<T>(pts1, pts2, intersectPts);\n\n  if (num <= 2) {\n    return 0.0;\n  }\n\n  // Convex Hull to order the intersection points in clockwise order and find\n  // the contour area.\n  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);\n  return polygon_area<T>(orderedPts, 
num_convex);\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4],\n                                               const Point<T> (&pts2)[4]) {\n  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned\n  // from rotated_rect_intersection_pts\n  Point<T> intersectPts[24], orderedPts[24];\n\n  int num = get_intersection_points<T>(pts1, pts2, intersectPts);\n\n  if (num <= 2) {\n    return 0.0;\n  }\n\n  // Convex Hull to order the intersection points in clockwise order and find\n  // the contour area.\n  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);\n  return polygon_area<T>(orderedPts, num_convex);\n}\n\n}  // namespace\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,\n                                            T const* const box2_raw,\n                                            const int mode_flag) {\n  // shift center to the middle point to achieve higher precision in result\n  RotatedBox<T> box1, box2;\n  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;\n  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;\n  box1.x_ctr = box1_raw[0] - center_shift_x;\n  box1.y_ctr = box1_raw[1] - center_shift_y;\n  box1.w = box1_raw[2];\n  box1.h = box1_raw[3];\n  box1.a = box1_raw[4];\n  box2.x_ctr = box2_raw[0] - center_shift_x;\n  box2.y_ctr = box2_raw[1] - center_shift_y;\n  box2.w = box2_raw[2];\n  box2.h = box2_raw[3];\n  box2.a = box2_raw[4];\n\n  const T area1 = box1.w * box1.h;\n  const T area2 = box2.w * box2.h;\n  if (area1 < 1e-14 || area2 < 1e-14) {\n    return 0.f;\n  }\n\n  const T intersection = rotated_boxes_intersection<T>(box1, box2);\n  T baseS = 1.0;\n  if (mode_flag == 0) {\n    baseS = (area1 + area2 - intersection);\n  } else if (mode_flag == 1) {\n    baseS = area1;\n  }\n  const T iou = intersection / baseS;\n  return iou;\n}\n\ntemplate <typename T>\nHOST_DEVICE_INLINE T 
single_box_iou_quadri(T const* const pts1_raw,\n                                           T const* const pts2_raw,\n                                           const int mode_flag) {\n  // shift center to the middle point to achieve higher precision in result\n  Point<T> pts1[4], pts2[4];\n\n  auto center_shift_x =\n      (pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] +\n       pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) /\n      8.0;\n  auto center_shift_y =\n      (pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] +\n       pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) /\n      8.0;\n  pts1[0].x = pts1_raw[0] - center_shift_x;\n  pts1[0].y = pts1_raw[1] - center_shift_y;\n  pts1[1].x = pts1_raw[2] - center_shift_x;\n  pts1[1].y = pts1_raw[3] - center_shift_y;\n  pts1[2].x = pts1_raw[4] - center_shift_x;\n  pts1[2].y = pts1_raw[5] - center_shift_y;\n  pts1[3].x = pts1_raw[6] - center_shift_x;\n  pts1[3].y = pts1_raw[7] - center_shift_y;\n  pts2[0].x = pts2_raw[0] - center_shift_x;\n  pts2[0].y = pts2_raw[1] - center_shift_y;\n  pts2[1].x = pts2_raw[2] - center_shift_x;\n  pts2[1].y = pts2_raw[3] - center_shift_y;\n  pts2[2].x = pts2_raw[4] - center_shift_x;\n  pts2[2].y = pts2_raw[5] - center_shift_y;\n  pts2[3].x = pts2_raw[6] - center_shift_x;\n  pts2[3].y = pts2_raw[7] - center_shift_y;\n\n  const T area1 = quadri_box_area<T>(pts1);\n  const T area2 = quadri_box_area<T>(pts2);\n  if (area1 < 1e-14 || area2 < 1e-14) {\n    return 0.f;\n  }\n\n  const T intersection = quadri_boxes_intersection<T>(pts1, pts2);\n  T baseS = 1.0;\n  if (mode_flag == 0) {\n    baseS = (area1 + area2 - intersection);\n  } else if (mode_flag == 1) {\n    baseS = area1;\n  }\n  const T iou = intersection / baseS;\n  return iou;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu\n#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH\n#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename scalar_t>\n__global__ void active_rotated_filter_forward_cuda_kernel(\n    const int nthreads, const scalar_t* weight_data, const int* indices_data,\n    const int num_input_planes, const int num_output_planes,\n    const int num_orientations, const int num_rotations, const int nEntry,\n    scalar_t* output_data) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int l = index % nEntry;\n    int j = (index / nEntry) % num_input_planes;\n    int i = index / nEntry / num_input_planes;\n    int k;\n    scalar_t val = *(weight_data + index);\n    for (k = 0; k < num_rotations; k++) {\n      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;\n      scalar_t* target = output_data +\n                         i * (num_rotations * num_input_planes * nEntry) +\n                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;\n      *target = val;\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void active_rotated_filter_backward_cuda_kernel(\n    const int nthreads, const scalar_t* gradWeight_data,\n    const int* indices_data, const int num_input_planes,\n    const int num_output_planes, const int num_orientations,\n    const int num_rotations, const int nEntry, scalar_t* weight_data) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int l = index % nEntry;\n    int j = (index / nEntry) % num_input_planes;\n    int i = index / nEntry / num_input_planes;\n    int k;\n    scalar_t* val = weight_data + index;\n    *val = 0;\n    scalar_t tmp = 0;\n    for (k = 0; k < num_rotations; k++) {\n      int idx = (int)(*(indices_data + l * 
num_rotations + k)) - 1;\n      scalar_t target =\n          *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +\n            k * (num_input_planes * nEntry) + j * (nEntry) + idx);\n      tmp = tmp + target;\n    }\n    *val = tmp;\n  }\n}\n#endif  // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH\n#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)\n// output: fout(B,O,N)\n// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) =  s(b,i,k,m)*p(b,i(k),m,j)\n//       i(k) = idx(b,i,k)\n//      sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)\n//      avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k\n//      max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))\n\ntemplate <typename T>\n__global__ void assign_score_withk_forward_cuda_kernel(\n    const int B, const int N0, const int N1, const int M, const int K,\n    const int O, const int aggregate, const T* points, const T* centers,\n    const T* scores, const int64_t* knn_idx, T* output) {\n  // ----- parallel loop for B, N1, K and O ---------\n  CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {\n    // ------- loop for M ----------\n    const int b = (int)(i / (O * N1 * K));\n    const int o = (int)(i % (O * N1 * K) / (N1 * K));\n    const int n = (int)(i % (N1 * K) / K);\n    const int k = (int)(i % K);\n    const int cn = (int)knn_idx[b * K * N1 + n * K +\n                                0];  // The first neighbor is the center point\n    const int kn = (int)knn_idx[b * K * N1 + n * K + k];\n    if (kn >= N0 ||\n        kn < 0) {  // if index overflows, it is out of the neighborhood range\n      return;\n    }\n    assert(b < B);\n    assert(kn < N0);\n    assert(cn < N0);\n    assert(o < O);\n    assert(n < N1);\n    const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;\n    T val = output[out_idx];\n    for (int m = 0; m < M; m++) {\n      val += points[b * N0 * M * O + kn * M * O + m * O + o] *\n                 scores[b * N1 * K * M + n * K * M + k * M + m] -\n             centers[b * N0 * M * O 
+ cn * M * O + m * O + o] *\n                 scores[b * N1 * K * M + n * K * M + k * M + m];\n    }\n    output[out_idx] = val;\n  }\n}\n\ntemplate <typename T>\n__global__ void assign_score_withk_points_backward_cuda_kernel(\n    const int B, const int N0, const int N, const int M, const int K,\n    const int O, const int aggregate, const T* grad_out, const T* scores,\n    const int64_t* knn_idx, T* grad_points, T* grad_centers) {\n  // ----- parallel loop for B, M, O ---------\n  CUDA_1D_KERNEL_LOOP(i, B * M * O) {\n    int b = (int)(i / (M * O));\n    int m = (int)(i % (M * O) / O);\n    int o = (int)(i % O);\n\n    // ----- loop for N,K ---------\n    for (int n = 0; n < N; n++) {\n      for (int k = 0; k < K; k++) {\n        int kn = knn_idx[b * N * K + n * K + k];\n        int cn = knn_idx[b * N * K + n * K + 0];\n        if (kn >= N0 || kn < 0) {  // if index overflows, it is out of the\n                                   // neighborhood range\n          continue;\n        }\n        atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,\n                  scores[b * N * K * M + n * K * M + k * M + m] *\n                      grad_out[b * O * N * K + o * N * K + n * K + k]);\n        atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,\n                  -scores[b * N * K * M + n * K * M + k * M + m] *\n                      grad_out[b * O * N * K + o * N * K + n * K + k]);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void assign_score_withk_scores_backward_cuda_kernel(\n    const int B, const int N0, const int N, const int M, const int K,\n    const int O, const int aggregate, const T* grad_out, const T* points,\n    const T* centers, const int64_t* knn_idx, T* grad_scores) {\n  // ----- parallel loop for B, N, K, M ---------\n  CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {\n    const int b = (int)(i / (N * M * K));\n    const int n = (int)(i % (N * M * K) / M / K);\n    const int k = (int)(i % (M * K) / M);\n    const int 
m = (int)(i % M);\n    const int cn = knn_idx[b * N * K + n * K + 0];\n    const int kn = knn_idx[b * N * K + n * K + k];\n    if (kn >= N0 ||\n        kn < 0) {  // if index overflows, it is out of the neighborhood range\n      return;\n    }\n\n    // -------------- loop for O ------------------------\n    const int out_idx = b * N * K * M + n * K * M + k * M + m;\n    T val = grad_scores[out_idx];\n    for (int o = 0; o < O; o++) {\n      val += (points[b * N0 * M * O + kn * M * O + m * O + o] -\n              centers[b * N0 * M * O + cn * M * O + m * O + o]) *\n             grad_out[b * O * N * K + o * N * K + n * K + k];\n    }\n    grad_scores[out_idx] = val;\n  }\n}\n\n#endif  // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n#ifndef BALL_QUERY_CUDA_KERNEL_CUH\n#define BALL_QUERY_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void ball_query_forward_cuda_kernel(int b, int n, int m,\n                                               float min_radius,\n                                               float max_radius, int nsample,\n                                               const T* new_xyz, const T* xyz,\n                                               int* idx) {\n  // new_xyz: (B, M, 3)\n  // xyz: (B, N, 3)\n  // output:\n  //      idx: (B, M, nsample)\n  int bs_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b) return;\n\n    new_xyz += bs_idx * m * 3 + pt_idx * 3;\n    xyz += bs_idx * n * 3;\n    idx += bs_idx * m * nsample + pt_idx * nsample;\n\n    float max_radius2 = max_radius * max_radius;\n    float min_radius2 = min_radius * min_radius;\n    T new_x = new_xyz[0];\n    T new_y = new_xyz[1];\n    T new_z = new_xyz[2];\n\n    int cnt = 0;\n    for (int k = 0; k < n; ++k) {\n      T x = xyz[k * 3 + 0];\n      T y = xyz[k * 3 + 1];\n      T z = xyz[k * 3 + 2];\n      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +\n             (new_z - z) * (new_z - z);\n      if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {\n        if (cnt == 0) {\n          for (int l = 0; l < nsample; ++l) {\n            idx[l] = k;\n          }\n        }\n        idx[cnt] = k;\n        ++cnt;\n        if (cnt >= nsample) break;\n      }\n    }\n  }\n}\n\n#endif  // BALL_QUERY_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH\n#define BBOX_OVERLAPS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,\n                                          T& y1, T& x2, T& y2) {\n  x1 = bbox[base];\n  y1 = bbox[base + 1];\n  x2 = bbox[base + 2];\n  y2 = bbox[base + 3];\n}\n\ntemplate <>\n__device__ __forceinline__ void load_bbox<float>(const float* bbox,\n                                                 const int base, float& x1,\n                                                 float& y1, float& x2,\n                                                 float& y2) {\n  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];\n  x1 = bbox_offset.x;\n  y1 = bbox_offset.y;\n  x2 = bbox_offset.z;\n  y2 = bbox_offset.w;\n}\n\ntemplate <typename T>\n__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,\n                                          T* ious, const int num_bbox1,\n                                          const int num_bbox2, const int mode,\n                                          const bool aligned,\n                                          const int offset) {\n  if (aligned) {\n    CUDA_1D_KERNEL_LOOP(index, num_bbox1) {\n      const int b1 = index;\n      const int b2 = index;\n\n      const int base1 = b1 << 2;  // b1 * 4\n      T b1_x1, b1_y1, b1_x2, b1_y2;\n      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);\n      const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);\n\n      const int base2 = b2 << 2;  // b2 * 4\n      T b2_x1, b2_y1, b2_x2, b2_y2;\n      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);\n      const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);\n\n      const T left = fmaxf(b1_x1, b2_x1), right = 
fminf(b1_x2, b2_x2);\n      const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);\n      const T width = fmaxf(right - left + offset, 0.f);\n      const T height = fmaxf(bottom - top + offset, 0.f);\n      const T interS = width * height;\n\n      const T baseS =\n          fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));\n      ious[index] = interS / baseS;\n    }\n  } else {\n    CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {\n      const int b1 = index / num_bbox2;\n      const int b2 = index % num_bbox2;\n\n      const int base1 = b1 << 2;  // b1 * 4\n      T b1_x1, b1_y1, b1_x2, b1_y2;\n      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);\n      const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);\n\n      const int base2 = b2 << 2;  // b2 * 4\n      T b2_x1, b2_y1, b2_x2, b2_y2;\n      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);\n      const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);\n\n      const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);\n      const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);\n      const T width = fmaxf(right - left + offset, 0.f);\n      const T height = fmaxf(bottom - top + offset, 0.f);\n      const T interS = width * height;\n\n      const T baseS =\n          fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));\n      ious[index] = interS / baseS;\n    }\n  }\n}\n\n#if __CUDA_ARCH__ >= 530\n__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,\n                                              const __half x2, const __half y2,\n                                              const __half offset) {\n  const __half half_w = __hadd(__hsub(x2, x1), offset);\n  const __half half_h = __hadd(__hsub(y2, y1), offset);\n  return __hmul(half_w, half_h);\n}\n\n__device__ __forceinline__ __half __half_max(const __half a, const __half b) {\n  return __hge(a, b) ? 
a : b;\n}\n\n__device__ __forceinline__ __half __half_min(const __half a, const __half b) {\n  return __hle(a, b) ? a : b;\n}\n\n// fp16 won't provide much increase when aligned==true. It is useful when\n// aligned==false, which would give you ~40% bonus.\n__device__ void bbox_overlaps_cuda_kernel_half(\n    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,\n    const int num_bbox2, const int mode, const bool aligned, const int offset) {\n  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;\n  const __half h_offset = __int2half_rn(offset);\n  CUDA_1D_KERNEL_LOOP(index, num_output) {\n    const int b1 = aligned ? index : index / num_bbox2;\n    const int b2 = aligned ? index : index % num_bbox2;\n\n    const int base1 = b1 << 2;\n    __half b1_x1, b1_y1, b1_x2, b1_y2;\n    load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);\n    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);\n\n    const int base2 = b2 << 2;\n    __half b2_x1, b2_y1, b2_x2, b2_y2;\n    load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);\n    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);\n\n    const __half left = __half_max(b1_x1, b2_x1),\n                 right = __half_min(b1_x2, b2_x2);\n    const __half top = __half_max(b1_y1, b2_y1),\n                 bottom = __half_min(b1_y2, b2_y2);\n    const __half width =\n        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));\n    const __half height =\n        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));\n    const __half interS = __hmul(width, height);\n\n    const __half baseS = __half_max(\n        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,\n        h_offset);\n    ious[index] = __hdiv(interS, baseS);\n  }\n}\n#endif  // __CUDA_ARCH__ >= 530\n\n#endif  // BBOX_OVERLAPS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu\n#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH\n#define BEZIER_ALIGN_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\ntemplate <typename T>\n__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,\n                          const T u) {\n  return ((1. - u) * (1. - u) * (1. - u) * p0 +\n          3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 +\n          u * u * u * p3);\n}\n\ntemplate <typename T>\n__global__ void bezier_align_forward_cuda_kernel(\n    const int nthreads,\n    const T *bottom_data,  // inputs\n    const T *bottom_rois,  // bottom rois contains the bezier curve\n    T *top_data,           // outputs\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int sampling_ratio, bool aligned, const int channels,\n    const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    // beziers have size Nx(1+8*2) = Nx17\n    const T *offset_bottom_rois = bottom_rois + n * 17;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n\n    // TODO: avoid this by using parallel annotation, for good\n    T p0_x = offset_bottom_rois[1] * spatial_scale;\n    T p0_y = offset_bottom_rois[2] * spatial_scale;\n    T p1_x = offset_bottom_rois[3] * spatial_scale;\n    T p1_y = offset_bottom_rois[4] * spatial_scale;\n    T p2_x = offset_bottom_rois[5] * spatial_scale;\n    T p2_y = offset_bottom_rois[6] * spatial_scale;\n    T p3_x = offset_bottom_rois[7] * spatial_scale;\n    T p3_y = offset_bottom_rois[8] * spatial_scale;\n    T p4_x = offset_bottom_rois[15] * spatial_scale;\n    T p4_y = offset_bottom_rois[16] * spatial_scale;\n    T p5_x = offset_bottom_rois[13] * spatial_scale;\n    T p5_y = offset_bottom_rois[14] * spatial_scale;\n    T p6_x = offset_bottom_rois[11] * spatial_scale;\n    T p6_y = offset_bottom_rois[12] * spatial_scale;\n    T p7_x = offset_bottom_rois[9] * spatial_scale;\n    T p7_y = offset_bottom_rois[10] * spatial_scale;\n\n    // compute the coords\n    const T u = pw / static_cast<T>(pooled_width);\n    const T v = ph / static_cast<T>(pooled_height);\n    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);\n    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);\n    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);\n    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);\n    const T x_center = x1 * v + x0 * (1. - v) - offset;\n    const T y_center = y1 * v + y0 * (1. 
- v) - offset;\n\n    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));\n    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));\n    if (!aligned) {  // for backward-compatibility only\n      roi_width = max(roi_width, (T)1.);\n      roi_height = max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    const T *offset_bottom_data =\n        bottom_data + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceil(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    // When the grid is empty, output zeros == 0/1, instead of NaN.\n    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    T output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1\n    {\n      const T y = y_center - (T)0.5 * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = x_center - (T)0.5 * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,\n                                     index);\n        output_val += val;\n      }\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename T>\n__global__ void bezier_align_backward_cuda_kernel(\n    const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int sampling_ratio, bool aligned, const int channels,\n    const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    // beziers have size Nx(1+8*2) = Nx17\n    const T *offset_bottom_rois = bottom_rois + n * 17;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    T p0_x = offset_bottom_rois[1] * spatial_scale;\n    T p0_y = offset_bottom_rois[2] * spatial_scale;\n    T p1_x = offset_bottom_rois[3] * spatial_scale;\n    T p1_y = offset_bottom_rois[4] * spatial_scale;\n    T p2_x = offset_bottom_rois[5] * spatial_scale;\n    T p2_y = offset_bottom_rois[6] * spatial_scale;\n    T p3_x = offset_bottom_rois[7] * spatial_scale;\n    T p3_y = offset_bottom_rois[8] * spatial_scale;\n    T p4_x = offset_bottom_rois[15] * spatial_scale;\n    T p4_y = offset_bottom_rois[16] * spatial_scale;\n    T p5_x = offset_bottom_rois[13] * spatial_scale;\n    T p5_y = offset_bottom_rois[14] * spatial_scale;\n    T p6_x = offset_bottom_rois[11] * spatial_scale;\n    T p6_y = offset_bottom_rois[12] * spatial_scale;\n    T p7_x = offset_bottom_rois[9] * spatial_scale;\n    T p7_y = offset_bottom_rois[10] * spatial_scale;\n\n    // compute the coords\n    const T u = pw / static_cast<T>(pooled_width);\n    const T v = ph / static_cast<T>(pooled_height);\n    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);\n    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);\n    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);\n    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);\n    const T x_center = x1 * v + x0 * (1. - v) - offset;\n    const T y_center = y1 * v + y0 * (1. 
- v) - offset;\n\n    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));\n    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));\n    if (!aligned) {  // for backward-compatibility only\n      roi_width = max(roi_width, (T)1.);\n      roi_height = max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    T *offset_bottom_diff =\n        bottom_diff + (roi_batch_ind * channels + c) * height * width;\n\n    int top_offset = (n * channels + c) * pooled_height * pooled_width;\n    const T *offset_top_diff = top_diff + top_offset;\n    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceil(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1\n    {\n      const T y = y_center - (T)0.5 * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = x_center - (T)0.5 * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        T g1 = top_diff_this_bin * w1 / count;\n        T g2 = top_diff_this_bin * w2 / count;\n        T g3 = top_diff_this_bin * w3 / count;\n        T g4 = top_diff_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_bottom_diff + y_low * width + x_low,\n                    static_cast<T>(g1));\n          atomicAdd(offset_bottom_diff + y_low * width + x_high,\n                    static_cast<T>(g2));\n          atomicAdd(offset_bottom_diff + y_high * width + x_low,\n                    static_cast<T>(g3));\n          atomicAdd(offset_bottom_diff + y_high * width + x_high,\n                    static_cast<T>(g4));\n        }  // if\n      }  // ix\n    }  // iy\n  }  // CUDA_1D_KERNEL_LOOP\n}  // BezierAlignBackward\n\n#endif  // BEZIER_ALIGN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.\n// the main difference: (1) use `argmax_idx` for fast computing of gradient\n// during the backward. (2) `wh` is directly computed by `boxes`, rather than\n// passing it as argument to forward or backward functions.\n\n#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH\n#define BORDER_ALIGN_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\nenum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };\n\n/*** Forward ***/\ntemplate <typename T>\n__global__ void border_align_forward_cuda_kernel(\n    const int nthreads, const T* input, const T* boxes, T* output,\n    int* argmax_idx, const int channels, const int box_size, const int height,\n    const int width, const int pool_size) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (batch_idx, c_idx, box_idx) is an element paralleled for computing\n    // output, and `extreme_idx` is in range [0,3]\n    int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx;\n    const T *offset_box, *offset_input, *offset_box_x;\n    T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y,\n        val, maxval;\n\n    extreme_idx = threadIdx.y;\n    // shape (N, C, box_size, 4) for output\n    batch_idx = index / channels / box_size;\n    // shape (N, box_size, 4) for boxes\n    box_idx = index % box_size + batch_idx * box_size;\n    c_idx = (index / box_size) % channels;\n\n    offset_box = boxes + box_idx * 4;\n    box_width = *(offset_box + 2) - *offset_box;\n    box_height = *(offset_box + 3) - *(offset_box + 1);\n    offset_output = output + index * 4 + 
extreme_idx;\n    offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;\n    // shape (N, 4C, h, w) for input.\n    // [0,C) for top feature, [C,2C) for left feature,\n    // [2C,3C) for bottom feature, [3C,4C) for right feature\n    offset_input =\n        input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) *\n                    height * width;\n\n    // extreme_idx in [0,1] -> offset_box_x indexed at x1\n    // extreme_idx in [2,3] -> offset_box_x indexed at x2\n    offset_box_x = offset_box + extreme_idx / 2 * 2;\n\n    // (x1,y1) or (x2,y2) for (x,y)\n    x = *offset_box_x;\n    y = *(offset_box_x + 1);\n\n    switch (extreme_idx) {\n      // top\n      case BorderMode::Top:\n        stride = box_width / pool_size;\n        x_stride = stride;\n        y_stride = 0;\n        break;\n      // left\n      case BorderMode::Left:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = stride;\n        break;\n      // bottom\n      case BorderMode::Bottom:\n        stride = box_width / pool_size;\n        x_stride = -stride;\n        y_stride = 0;\n        break;\n      // right\n      case BorderMode::Right:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = -stride;\n        break;\n    }\n\n    // initialize maxval and maxidx with the start position (e.g. 
(x1,y1) or\n    // (x2,y2))\n    maxval = bilinear_interpolate(offset_input, height, width, y, x, index);\n    maxidx = 0;\n\n    // do max_pool along the border\n    for (int i = 1; i <= pool_size; i++) {\n      x += x_stride;\n      y += y_stride;\n      val = bilinear_interpolate(offset_input, height, width, y, x, index);\n      if (val > maxval) {\n        maxval = val;\n        maxidx = i;\n      }\n    }\n\n    // update output and argmax_idx\n    *offset_output = maxval;\n    *offset_argmax_idx = maxidx;\n  }\n}\n\n/*** Backward ***/\ntemplate <typename T>\n__global__ void border_align_backward_cuda_kernel(\n    const int nthreads, const T* grad_output, const T* boxes,\n    const int* argmax_idx, T* grad_input, const int channels,\n    const int box_size, const int height, const int width,\n    const int pool_size) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (batch_idx, c_idx, box_idx) is an element paralleled for computing\n    // output, and `extreme_idx` is in range [0,3]\n    int batch_idx, c_idx, box_idx, extreme_idx;\n    const int* offset_argmax_idx;\n    const T *offset_grad_output, *offset_box, *offset_box_x;\n    T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,\n        y;\n\n    extreme_idx = threadIdx.y;\n    batch_idx = index / channels / box_size;\n    box_idx = index % box_size + batch_idx * box_size;\n    c_idx = (index / box_size) % channels;\n\n    offset_box = boxes + box_idx * 4;\n    box_width = *(offset_box + 2) - *offset_box;\n    box_height = *(offset_box + 3) - *(offset_box + 1);\n    offset_grad_output = grad_output + index * 4 + extreme_idx;\n    offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;\n    // [0,C) for top feature grad, [C,2C) for left feature grad,\n    // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad\n    offset_grad_input = grad_input + (batch_idx * channels * 4 +\n                                      extreme_idx * channels + c_idx) *\n                      
                   height * width;\n\n    // extreme_idx in [0,1] -> offset_box_x indexed at x1\n    // extreme_idx in [2,3] -> offset_box_x indexed at x2\n    offset_box_x = offset_box + extreme_idx / 2 * 2;\n\n    switch (extreme_idx) {\n      // top\n      case BorderMode::Top:\n        stride = box_width / pool_size;\n        x_stride = stride;\n        y_stride = 0;\n        break;\n      // left\n      case BorderMode::Left:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = stride;\n        break;\n      // bottom\n      case BorderMode::Bottom:\n        stride = box_width / pool_size;\n        x_stride = -stride;\n        y_stride = 0;\n        break;\n      // right\n      case BorderMode::Right:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = -stride;\n        break;\n    }\n\n    // get position (x,y) which has maximum value during forward\n    x = *offset_box_x;\n    y = *(offset_box_x + 1);\n    x += x_stride * (T)(*offset_argmax_idx);\n    y += y_stride * (T)(*offset_argmax_idx);\n\n    T w1, w2, w3, w4;\n    int x_low, x_high, y_low, y_high;\n    bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,\n                                  x_high, y_low, y_high, index);\n\n    // update grad_output\n    atomicAdd(offset_grad_input + y_low * width + x_low,\n              *offset_grad_output * w1);\n    atomicAdd(offset_grad_input + y_low * width + x_high,\n              *offset_grad_output * w2);\n    atomicAdd(offset_grad_input + y_high * width + x_low,\n              *offset_grad_output * w3);\n    atomicAdd(offset_grad_input + y_high * width + x_high,\n              *offset_grad_output * w4);\n  }\n}\n\n#endif  // BORDER_ALIGN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#ifndef BOX_IOU_QUADRI_CUDA_CUH\n#define BOX_IOU_QUADRI_CUDA_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n#include \"box_iou_rotated_utils.hpp\"\n\n// 2D block with 32 * 16 = 512 threads per block\nconst int BLOCK_DIM_X = 32;\nconst int BLOCK_DIM_Y = 16;\n\ninline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }\n\ntemplate <typename T>\n__global__ void box_iou_quadri_cuda_kernel(\n    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,\n    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {\n  if (aligned) {\n    CUDA_1D_KERNEL_LOOP(index, n_boxes1) {\n      int b1 = index;\n      int b2 = index;\n\n      int base1 = b1 * 8;\n\n      float block_boxes1[8];\n      float block_boxes2[8];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n      block_boxes1[5] = dev_boxes1[base1 + 5];\n      block_boxes1[6] = dev_boxes1[base1 + 6];\n      block_boxes1[7] = dev_boxes1[base1 + 7];\n\n      int base2 = b2 * 8;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n      block_boxes2[5] = dev_boxes2[base2 + 5];\n      block_boxes2[6] = dev_boxes2[base2 + 6];\n      block_boxes2[7] = dev_boxes2[base2 + 7];\n\n      dev_ious[index] =\n          single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  } else {\n    CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {\n      int b1 = index / n_boxes2;\n      int b2 = index % n_boxes2;\n\n      int base1 = b1 * 
8;\n\n      float block_boxes1[8];\n      float block_boxes2[8];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n      block_boxes1[5] = dev_boxes1[base1 + 5];\n      block_boxes1[6] = dev_boxes1[base1 + 6];\n      block_boxes1[7] = dev_boxes1[base1 + 7];\n\n      int base2 = b2 * 8;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n      block_boxes2[5] = dev_boxes2[base2 + 5];\n      block_boxes2[6] = dev_boxes2[base2 + 6];\n      block_boxes2[7] = dev_boxes2[base2 + 7];\n\n      dev_ious[index] =\n          single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu\n#ifndef BOX_IOU_ROTATED_CUDA_CUH\n#define BOX_IOU_ROTATED_CUDA_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n#include \"box_iou_rotated_utils.hpp\"\n\n// 2D block with 32 * 16 = 512 threads per block\nconst int BLOCK_DIM_X = 32;\nconst int BLOCK_DIM_Y = 16;\n\ninline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }\n\ntemplate <typename T>\n__global__ void box_iou_rotated_cuda_kernel(\n    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,\n    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {\n  if (aligned) {\n    CUDA_1D_KERNEL_LOOP(index, n_boxes1) {\n      int b1 = index;\n      int b2 = index;\n\n      int base1 = b1 * 5;\n\n      float block_boxes1[5];\n      float block_boxes2[5];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n\n      int base2 = b2 * 5;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n\n      dev_ious[index] =\n          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  } else {\n    CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {\n      int b1 = index / n_boxes2;\n      int b2 = index % n_boxes2;\n\n      int base1 = b1 * 5;\n\n      float block_boxes1[5];\n      float block_boxes2[5];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = 
dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n\n      int base2 = b2 * 5;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n\n      dev_ious[index] =\n          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CARAFE_CUDA_KERNEL_CUH\n#define CARAFE_CUDA_KERNEL_CUH\n\n#include <ATen/cuda/DeviceUtils.cuh>\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#ifdef MMCV_WITH_HIP\n#define WARP_SIZE 64\n#else\n#define WARP_SIZE 32\n#endif\n#define THREADS_PER_PIXEL 32\n#define MAX_SHARED_MEMORY 49152\n#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144\n#define MAXIMIZE_KERNEL_SIZE true\n#define kTileDim 32\n#define kBlockRows 8\n#define FULL_MASK 0xffffffff\n\ninline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }\n\n__device__ inline int Loc2Index(const int n, const int c, const int h,\n                                const int w, const int channel_num,\n                                const int height, const int width) {\n  int index = w + (h + (c + n * channel_num) * height) * width;\n  return index;\n}\n#ifndef MMCV_WITH_HIP\n/* TODO: move this to a common place */\ntemplate <typename scalar_t>\n__device__ inline scalar_t min(scalar_t a, scalar_t b) {\n  return a < b ? a : b;\n}\n\ntemplate <typename scalar_t>\n__device__ inline scalar_t max(scalar_t a, scalar_t b) {\n  return a > b ? 
a : b;\n}\n#endif\ntemplate <typename scalar_t>\n__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {\n  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)\n#ifdef MMCV_WITH_HIP\n    val += __shfl_down(val, offset);\n#else\n    val += __shfl_down_sync(FULL_MASK, val, offset);\n#endif\n  return val;\n}\n\ntemplate <>\n__device__ __forceinline__ phalf warpReduceSum(phalf val) {\n  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)\n#ifdef MMCV_WITH_HIP\n    // Using PyTorch's macro for half support\n    __PHALF(val) += WARP_SHFL_DOWN(val, offset);\n#else\n    __PHALF(val) +=\n        __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);\n#endif\n  return val;\n}\n\n// Splits the original matrix into submatrices with size 32 * 32.\n// Each block transposes one submatrix by loading it into shared memory.\n// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/\ntemplate <typename scalar_t>\n__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,\n                                           const int W, const int dh,\n                                           const int dw,\n                                           const scalar_t *__restrict__ X,\n                                           scalar_t *__restrict__ Y) {\n  __shared__ scalar_t tile[kTileDim][kTileDim + 1];\n  const int n = blockIdx.x / (dh * dw);\n  const int k = blockIdx.x % (dh * dw);\n  const int r = k / dw;\n  const int c = k % dw;\n  const int offset = n * H * W;\n  int x = c * kTileDim + threadIdx.x;\n  int y = r * kTileDim + threadIdx.y;\n  if (x < W) {\n    for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {\n      tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];\n    }\n  }\n  __syncthreads();\n  x = r * kTileDim + threadIdx.x;\n  y = c * kTileDim + threadIdx.y;\n  if (x < H) {\n    for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {\n      Y[offset + (y + i) 
* H + x] = tile[threadIdx.x][threadIdx.y + i];\n    }\n  }\n}\ntemplate <typename scalar_t>\n__global__ void CARAFEForward(\n    const int num_kernels, const scalar_t *__restrict__ bottom_data,\n    const scalar_t *__restrict__ bottom_masks, const int kernel_size,\n    const int group_size, const int scale_factor, const int channels,\n    const int down_height, const int down_width, const int height,\n    const int width, const int mask_channels, scalar_t *__restrict__ top_data) {\n#if MAXIMIZE_KERNEL_SIZE\n  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];\n#else\n  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];\n#endif\n\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;\n  const int split_id = threadIdx.x % THREADS_PER_PIXEL;\n  index = index / THREADS_PER_PIXEL;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n\n  const int down_pw = pw / scale_factor;\n  const int down_ph = ph / scale_factor;\n\n  const int start_w = down_pw - (kernel_size - 1) / 2;\n  const int end_w = down_pw + (kernel_size - 1) / 2 + 1;\n  const int start_h = down_ph - (kernel_size - 1) / 2;\n  const int end_h = down_ph + (kernel_size - 1) / 2 + 1;\n  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {\n    int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);\n    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];\n  }\n  __syncthreads();\n\n  const int channels_per_group = ceilf(channels / (float)group_size);\n#pragma unroll\n  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {\n    int mask_group = c / channels_per_group;\n    scalar_t output_val = 0;\n#pragma unroll\n    for (int iy = start_h; iy < end_h; iy++) {\n#pragma unroll\n      for (int ix = start_w; ix < end_w; ix++) {\n        if (iy < 0 || iy > down_height - 1 || ix < 0 || 
ix > down_width - 1) {\n          continue;\n        }\n        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;\n        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;\n        int feat_index =\n            Loc2Index(n, iy, ix, c, down_height, down_width, channels);\n\n        output_val += bottom_data[feat_index] *\n                      shared_mask[mask_c * WARP_SIZE + pixel_id];\n      }\n    }\n\n    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);\n    top_data[top_index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void CARAFEBackward_Feature(\n    const int num_kernels, const scalar_t *__restrict__ top_diff,\n    const scalar_t *__restrict__ bottom_masks, const int kernel_size,\n    const int group_size, const int scale_factor, const int channels,\n    const int down_height, const int down_width, const int height,\n    const int width, const int mask_channels,\n    scalar_t *__restrict__ bottom_diff) {\n#if MAXIMIZE_KERNEL_SIZE\n  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];\n#else\n  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];\n#endif\n\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n\n  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;\n  const int split_id = threadIdx.x % THREADS_PER_PIXEL;\n  // (n, c, ph, pw) is an element in the bottom_data\n  index = index / THREADS_PER_PIXEL;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n\n  const int start_w = pw - (kernel_size - 1) * scale_factor / 2;\n  const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;\n  const int start_h = ph - (kernel_size - 1) * scale_factor / 2;\n  const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;\n  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {\n 
   const int mask_w = (c % kernel_size) * scale_factor;\n    const int mask_h = (c / kernel_size % kernel_size) * scale_factor;\n    const int mask_x = start_w + mask_w;\n    const int mask_y = start_h + mask_h;\n    if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {\n      shared_mask[c * WARP_SIZE + pixel_id] = 0;\n      continue;\n    }\n    const int mask_group = c / (kernel_size * kernel_size);\n    const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;\n    int mask_index =\n        Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);\n    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];\n  }\n  __syncthreads();\n  const int channels_per_group = ceilf(channels / (float)group_size);\n#pragma unroll\n  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {\n    int mask_group = c / channels_per_group;\n    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);\n    scalar_t output_val = 0;\n#pragma unroll\n    for (int iy = start_h; iy < end_h; iy += scale_factor) {\n#pragma unroll\n      for (int ix = start_w; ix < end_w; ix += scale_factor) {\n        if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {\n          continue;\n        }\n        int mask_iy =\n            (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;\n        int mask_ix =\n            (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;\n        int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);\n        output_val +=\n            shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];\n      }\n    }\n    bottom_diff[top_index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void FeatureSum(const int num_kernels,\n                           const scalar_t *__restrict__ input_data,\n                           const int 
scale_factor, const int channels,\n                           const int height, const int width,\n                           scalar_t *__restrict__ output_data) {\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n  const int split_id = threadIdx.x % THREADS_PER_PIXEL;\n  index = index / THREADS_PER_PIXEL;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {\n    scalar_t output_val = 0;\n    for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {\n      for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {\n        int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,\n                                 width * scale_factor, channels);\n        output_val += input_data[input_id];\n      }\n    }\n    const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);\n    output_data[output_id] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void CARAFEBackward_Mask(const int num_kernels,\n                                    const scalar_t *__restrict__ top_diff,\n                                    const scalar_t *__restrict__ bottom_data,\n                                    const int kernel_size, const int group_size,\n                                    const int scale_factor, const int channels,\n                                    const int down_height, const int down_width,\n                                    const int height, const int width,\n                                    const int mask_channels,\n                                    scalar_t *__restrict__ mask_diff) {\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n\n  const int lane_id = index % WARP_SIZE;\n  index = index / WARP_SIZE;\n  const int mask_c = index % mask_channels;\n  // (n, c, ph, 
pw) is an element in the bottom_data\n  index = index / mask_channels;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n\n  const int down_pw = pw / scale_factor;\n  const int down_ph = ph / scale_factor;\n\n  const int mask_group = mask_c / (kernel_size * kernel_size);\n  const int mask_loc = mask_c % (kernel_size * kernel_size);\n\n  const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;\n  const int offset_y =\n      mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;\n\n  const int down_x = down_pw + offset_x;\n  const int down_y = down_ph + offset_y;\n\n  scalar_t output_val = 0;\n\n  if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&\n      down_x <= down_width - 1) {\n    const int channels_per_mask = ceilf(channels / (float)group_size);\n    const int start = channels_per_mask * mask_group;\n    const int end = min(channels_per_mask * (mask_group + 1), channels);\n    for (int c = start + lane_id; c < end; c += WARP_SIZE) {\n      int bottom_id =\n          Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);\n      int top_id = Loc2Index(n, ph, pw, c, height, width, channels);\n      output_val += top_diff[top_id] * bottom_data[bottom_id];\n    }\n  }\n#ifdef MMCV_WITH_HIP\n  __syncthreads();\n#else\n  __syncwarp();\n#endif\n  output_val = warpReduceSum(output_val);\n  if (lane_id == 0) {\n    const int mask_id =\n        Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);\n    mask_diff[mask_id] = output_val;\n  }\n}\n\n#endif  // CARAFE_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH\n#define CARAFE_NAIVE_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n__device__ inline int Loc2Index(const int n, const int c, const int h,\n                                const int w, const int channel_num,\n                                const int height, const int width) {\n  int index = w + (h + (c + n * channel_num) * height) * width;\n  return index;\n}\n\ntemplate <typename scalar_t>\n__global__ void carafe_naive_forward_cuda_kernel(\n    const int nthreads, const scalar_t *bottom_data,\n    const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,\n    const int group_size, const int scale_factor, const int channels,\n    const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the bottom_data\n    int pw = index % width;\n    int ph = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int mask_channels = kernel_size * kernel_size * group_size;\n    int mask_group = c / (channels / group_size);\n\n    int down_pw = pw / scale_factor;\n    int down_ph = ph / scale_factor;\n    int down_width = width / scale_factor;\n    int down_height = height / scale_factor;\n    int start_w = down_pw - (kernel_size - 1) / 2;\n    int end_w = down_pw + (kernel_size - 1) / 2 + 1;\n    int start_h = down_ph - (kernel_size - 1) / 2;\n    int end_h = down_ph + (kernel_size - 1) / 2 + 1;\n\n    scalar_t output_val = 0;\n    for (int iy = start_h; iy < end_h; iy++) {\n      for (int ix = start_w; ix < end_w; ix++) {\n        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {\n          continue;\n        }\n        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;\n        int mask_ix = ix - down_pw + 
(kernel_size - 1) / 2;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;\n        int feat_index =\n            Loc2Index(n, c, iy, ix, channels, down_height, down_width);\n        int mask_index =\n            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);\n        output_val += bottom_data[feat_index] * bottom_masks[mask_index];\n      }\n    }\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void carafe_naive_backward_cuda_kernel(\n    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,\n    const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,\n    const int kernel_size, const int group_size, const int scale_factor,\n    const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the bottom_data\n    int pw = index % width;\n    int ph = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int mask_channels = kernel_size * kernel_size * group_size;\n    int mask_group = c / (channels / group_size);\n\n    int down_pw = pw / scale_factor;\n    int down_ph = ph / scale_factor;\n    int down_width = width / scale_factor;\n    int down_height = height / scale_factor;\n    int start_w = down_pw - (kernel_size - 1) / 2;\n    int end_w = down_pw + (kernel_size - 1) / 2 + 1;\n    int start_h = down_ph - (kernel_size - 1) / 2;\n    int end_h = down_ph + (kernel_size - 1) / 2 + 1;\n\n    for (int iy = start_h; iy < end_h; iy++) {\n      for (int ix = start_w; ix < end_w; ix++) {\n        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {\n          continue;\n        }\n        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;\n        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) 
* kernel_size + mask_ix;\n        int feat_index =\n            Loc2Index(n, c, iy, ix, channels, down_height, down_width);\n        int mask_index =\n            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);\n        atomicAdd(bottom_diff + feat_index,\n                  bottom_masks[mask_index] * top_diff[index]);\n        atomicAdd(mask_diff + mask_index,\n                  bottom_data[feat_index] * top_diff[index]);\n      }\n    }\n  }\n}\n\n#endif  // CARAFE_NAIVE_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu\n#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH\n#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144\n\ntemplate <typename scalar_t>\n__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,\n                                                     const scalar_t* xyz, int m,\n                                                     const scalar_t* xyz2,\n                                                     scalar_t* result,\n                                                     int* result_i) {\n  __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];\n  for (int i = blockIdx.x; i < b; i += gridDim.x) {\n    for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {\n      int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;\n      for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {\n        buf[j] = xyz2[(i * m + k2) * 2 + j];\n      }\n      __syncthreads();\n      for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {\n        scalar_t x1 = xyz[(i * n + j) * 2 + 0];\n        scalar_t y1 = xyz[(i * n + j) * 2 + 1];\n        int best_i = 0;\n        scalar_t best = 1e10;\n        int end_ka = end_k & (~3);\n        if (end_ka == THREADS_PER_BLOCK) {\n          for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {\n#pragma unroll\n            for (int j = 0; j < 4; ++j) {\n              scalar_t x2 = buf[(k + j) * 2] - x1;\n              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;\n              scalar_t d = x2 * x2 + y2 * y2;\n              if (d < best) {\n                best = d;\n                best_i = k + k2 + j;\n              }\n            }\n          }\n        } else {\n          for (int k = 0; k < end_ka; k += 4) 
{\n#pragma unroll\n            for (int j = 0; j < 4; ++j) {\n              scalar_t x2 = buf[(k + j) * 2] - x1;\n              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;\n              scalar_t d = x2 * x2 + y2 * y2;\n              if (d < best) {\n                best = d;\n                best_i = k + k2 + j;\n              }\n            }\n          }\n        }\n        for (int k = end_ka; k < end_k; k++) {\n          scalar_t x2 = buf[k * 2 + 0] - x1;\n          scalar_t y2 = buf[k * 2 + 1] - y1;\n          scalar_t d = x2 * x2 + y2 * y2;\n          if (k == 0 || d < best) {\n            best = d;\n            best_i = k + k2;\n          }\n        }\n        if (k2 == 0 || result[(i * n + j)] > best) {\n          result[(i * n + j)] = best;\n          result_i[(i * n + j)] = best_i;\n        }\n      }\n      __syncthreads();\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void chamfer_distance_backward_cuda_kernel(\n    int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,\n    const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,\n    scalar_t* grad_xyz2) {\n  for (int i = blockIdx.x; i < b; i += gridDim.x) {\n    for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {\n      scalar_t x1 = xyz1[(i * n + j) * 2 + 0];\n      scalar_t y1 = xyz1[(i * n + j) * 2 + 1];\n      int j2 = idx1[i * n + j];\n      scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];\n      scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];\n      scalar_t g = grad_dist1[i * n + j] * 2;\n      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));\n      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));\n      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));\n      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));\n    }\n  }\n}\n#endif  // CHAMFER_DISTANCE_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp",
    "content": "#ifndef COMMON_CUDA_HELPER\n#define COMMON_CUDA_HELPER\n\n#include <cuda.h>\n\n#define CUDA_1D_KERNEL_LOOP(i, n)                              \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \\\n       i += blockDim.x * gridDim.x)\n\n#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \\\n  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \\\n       i += blockDim.x * gridDim.x)                                 \\\n    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \\\n         j += blockDim.y * gridDim.y)\n\n#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \\\n  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \\\n    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)\n\n#define THREADS_PER_BLOCK 512\n\ninline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {\n  int optimal_block_num = (N + num_threads - 1) / num_threads;\n  int max_block_num = 4096;\n  return min(optimal_block_num, max_block_num);\n}\n\ntemplate <typename T>\n__device__ T bilinear_interpolate(const T* input, const int height,\n                                  const int width, T y, T x,\n                                  const int index /* index for debug only*/) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  int y_low = (int)y;\n  int x_low = (int)x;\n  int y_high;\n  int x_high;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. 
- lx;\n  // do bilinear interpolation\n  T v1 = input[y_low * width + x_low];\n  T v2 = input[y_low * width + x_high];\n  T v3 = input[y_high * width + x_low];\n  T v4 = input[y_high * width + x_high];\n  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  return val;\n}\n\ntemplate <typename T>\n__device__ void bilinear_interpolate_gradient(\n    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,\n    int& x_low, int& x_high, int& y_low, int& y_high,\n    const int index /* index for debug only*/) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) {\n    // empty\n    w1 = w2 = w3 = w4 = 0.;\n    x_low = x_high = y_low = y_high = -1;\n    return;\n  }\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  y_low = (int)y;\n  x_low = (int)x;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. - lx;\n\n  // reference in forward\n  // T v1 = input[y_low * width + x_low];\n  // T v2 = input[y_low * width + x_high];\n  // T v3 = input[y_high * width + x_low];\n  // T v4 = input[y_high * width + x_high];\n  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  return;\n}\n#endif  // COMMON_CUDA_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CONVEX_IOU_CUDA_KERNEL_CUH\n#define CONVEX_IOU_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#define MAXN 100\n#define NMAX 512\n__device__ const double EPS = 1E-8;\n\n__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }\n\nstruct Point {\n  double x, y;\n  __device__ Point() {}\n  __device__ Point(double x, double y) : x(x), y(y) {}\n};\n\n__device__ inline bool point_same(Point& a, Point& b) {\n  return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;\n}\n\n__device__ inline void swap1(Point* a, Point* b) {\n  Point temp;\n  temp.x = a->x;\n  temp.y = a->y;\n\n  a->x = b->x;\n  a->y = b->y;\n\n  b->x = temp.x;\n  b->y = temp.y;\n}\n\n__device__ inline void reverse1(Point* a, const int n) {\n  for (int i = 0; i < (n - 1) / 2.0; i++) {\n    Point* j = &(a[i]);\n    Point* k = &(a[n - 1 - i]);\n    swap1(j, k);\n  }\n}\n\n__device__ inline double cross(Point o, Point a, Point b) {\n  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);\n}\n\n__device__ inline double dis(Point a, Point b) {\n  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);\n}\n__device__ inline double area(Point* ps, int n) {\n  ps[n] = ps[0];\n  double res = 0;\n  for (int i = 0; i < n; i++) {\n    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;\n  }\n  return res / 2.0;\n}\n__device__ inline double polygon_area_grad(Point* ps, int n,\n                                           int* polygon_to_pred_index,\n                                           int n_pred, double* grad_C) {\n  ps[n] = ps[0];\n  double partion_grad[4 * 30 + 2];\n  double res = 0;\n  for (int i = 0; i < n; i++) {\n    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;\n    partion_grad[i * 4 + 2] = ps[i + 1].y;\n    partion_grad[i * 4 + 3] = -ps[i + 1].x;\n    if (i != n - 1) {\n      partion_grad[i * 4 + 4] = -ps[i].y;\n    
  partion_grad[i * 4 + 5] = ps[i].x;\n    } else {\n      partion_grad[0] = -ps[i].y;\n      partion_grad[1] = ps[i].x;\n    }\n  }\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < n_pred; j++) {\n      if (i == polygon_to_pred_index[j]) {\n        grad_C[2 * polygon_to_pred_index[j + n_pred]] =\n            (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;\n        break;\n      }\n    }\n    for (int j = 0; j < n_pred; j++) {\n      if (i == polygon_to_pred_index[j]) {\n        grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =\n            (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;\n        break;\n      }\n    }\n  }\n\n  return res / 2.0;\n}\n\n__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,\n                                double* cut_grad, int m, int n, int i) {\n  double s1, s2;\n  double s2_s1_2;\n  double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;\n  double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;\n  s1 = cross(a, b, c);\n  s2 = cross(a, b, d);\n\n  ds1_dxc = -(b.y - a.y);\n  ds1_dyc = b.x - a.x;\n  ds2_dxd = ds1_dxc;\n  ds2_dyd = ds1_dyc;\n  s2_s1_2 = (s2 - s1) * (s2 - s1);\n\n  if (sig(s1) == 0 && sig(s2) == 0) return 2;\n  if (sig(s2 - s1) == 0) return 0;\n\n  dxp_dxc =\n      ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /\n      (s2_s1_2);\n  dxp_dyc =\n      ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /\n      (s2_s1_2);\n  dxp_dxd =\n      ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /\n      (s2_s1_2);\n  dxp_dyd =\n      ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /\n      (s2_s1_2);\n\n  dyp_dxc =\n      ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /\n      (s2_s1_2);\n  dyp_dyc =\n      ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /\n      (s2_s1_2);\n  dyp_dxd =\n      ((c.y * ds2_dxd - 0) * (s2 - s1) - 
(c.y * s2 - d.y * s1) * (ds2_dxd)) /\n      (s2_s1_2);\n  dyp_dyd =\n      ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /\n      (s2_s1_2);\n\n  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);\n  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);\n  if (i == n - 1) {\n    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;\n    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;\n    cut_grad[4 * n * m + 0] = dxp_dxd;  // + dyp_dxd;\n    cut_grad[4 * n * m + 1] = dyp_dxd;\n    cut_grad[4 * n * m + 2] = dxp_dyd;  // + dyp_dyd;\n    cut_grad[4 * n * m + 3] = dyp_dyd;\n  } else {\n    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;\n    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;\n    cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd;  // + dyp_dxd;\n    cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;\n    cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd;  // + dyp_dyd;\n    cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;\n  }\n\n  return 1;\n}\n__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,\n                                   double* cut_grad) {\n  Point pp[MAXN];\n  double ccur_grad[MAXN] = {};\n  int m = 0;\n  p[n] = p[0];\n  int k = n;\n  for (int i = 0; i < n; i++) {\n    if (sig(cross(a, b, p[i])) > 0) {\n      pp[m] = p[i];\n      ccur_grad[4 * n * m + 4 * i] = 1.0;\n      ccur_grad[4 * n * m + 4 * i + 3] = 1.0;\n      m++;\n    }\n    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {\n      lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);\n      m++;\n    }\n  }\n\n  n = 0;\n  for (int i = 0; i < m; i++) {\n    if (!i || !(point_same(pp[i], pp[i - 1]))) {\n      p[n] = pp[i];\n      for (int j = 0; j < 4 * k; j++) {\n        cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];\n      }\n      
n++;\n    }\n  }\n\n  while (n > 1 && point_same(p[n - 1], p[0])) n--;\n}\n\n__device__ inline double intersectArea(Point a, Point b, Point c, Point d,\n                                       double* grad_AB, int order,\n                                       int convex_n) {\n  Point o(0, 0);\n  int res_flag = 0;\n  int s1 = sig(cross(o, a, b));\n  int s2 = sig(cross(o, c, d));\n  if (s1 == 0 || s2 == 0) return 0.0;\n  if (s1 == -1) {\n    Point* i = &a;\n    Point* j = &b;\n    swap1(i, j);\n    res_flag = 1;\n  }\n  if (s2 == -1) {\n    Point* i = &c;\n    Point* j = &d;\n    swap1(i, j);\n  }\n  Point p[10] = {o, a, b};\n  int n = 3, n0 = 3, n1, n2, n3;\n  double cut_grad1[MAXN] = {};\n  double cut_grad2[MAXN] = {};\n  double cut_grad3[MAXN] = {};\n  double p1_p_grad[10][10] = {};\n  double p2_p1_grad[10][10] = {};\n  double p3_p2_grad[10][10] = {};\n\n  double p3_p1_grad[10][10] = {};\n  double p3_p_grad[10][10] = {};\n\n  // 1\n  polygon_cut(p, n, o, c, cut_grad1);\n  n1 = n;\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < 4 * n0; j++) {\n      if (!(j % 2)) {\n        p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];\n      } else {\n        p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];\n      }\n    }\n  }\n\n  // 2\n  polygon_cut(p, n, c, d, cut_grad2);\n  n2 = n;\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < 4 * n1; j++) {\n      if (!(j % 2)) {\n        p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];\n      } else {\n        p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];\n      }\n    }\n  }\n  // 3\n  polygon_cut(p, n, d, o, cut_grad3);\n  n3 = n;\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < 4 * n2; j++) {\n      if (!(j % 2)) {\n        p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];\n      } else {\n        p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];\n      }\n    }\n  }\n\n  // mul\n  //  p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)\n  for (int i = 0; i < 2 * 
n3; i++) {\n    for (int j = 0; j < 2 * n1; j++) {\n      double sum = 0.0;\n      for (int m = 0; m < 2 * n2; m++) {\n        sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];\n      }\n      p3_p1_grad[i][j] = sum;\n    }\n  }\n\n  // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)\n  for (int i = 0; i < 2 * n3; i++) {\n    for (int j = 0; j < 2 * n0; j++) {\n      double sum = 0.0;\n      for (int m = 0; m < 2 * n1; m++) {\n        sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];\n      }\n      p3_p_grad[i][j] = sum;\n    }\n  }\n\n  // calculate S_grad\n  int polygon_index_box_index[20];\n  double grad_polygon[20];\n  double S_grad[6];\n\n  for (int i = 0; i < n3; i++) {\n    polygon_index_box_index[i] = i;\n    polygon_index_box_index[i + n3] = i;\n  }\n\n  double res =\n      polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);\n\n  if (s1 * s2 == -1) {\n    for (int j = 0; j < 2 * 3; j++) {\n      double sum = 0.0;\n      for (int m = 0; m < 2 * n3; m++) {\n        sum = sum - grad_polygon[m] * p3_p_grad[m][j];\n      }\n      S_grad[j] = sum;\n    }\n\n    if (order != convex_n - 1) {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[2 * order + 2] += S_grad[2];\n        grad_AB[2 * order + 3] += S_grad[3];\n\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[2 * order + 2] += S_grad[4];\n        grad_AB[2 * order + 3] += S_grad[5];\n      }\n    } else {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[0] += S_grad[2];\n        grad_AB[1] += S_grad[3];\n\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[0] += S_grad[4];\n        grad_AB[1] += S_grad[5];\n      }\n    }\n    res = -res;\n  } else {\n    for (int j = 0; j < 2 * 3; j++) {\n      
double sum = 0.0;\n      for (int m = 0; m < 2 * n3; m++) {\n        sum = sum + grad_polygon[m] * p3_p_grad[m][j];\n      }\n      S_grad[j] = sum;\n    }\n\n    if (order != convex_n - 1) {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[2 * order + 2] += S_grad[2];\n        grad_AB[2 * order + 3] += S_grad[3];\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[2 * order + 2] += S_grad[4];\n        grad_AB[2 * order + 3] += S_grad[5];\n      }\n    } else {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[0] += S_grad[2];\n        grad_AB[1] += S_grad[3];\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[0] += S_grad[4];\n        grad_AB[1] += S_grad[5];\n      }\n    }\n  }\n  return res;\n}\n\n__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,\n                                        double* grad_AB) {\n  if (area(ps1, n1) < 0) reverse1(ps1, n1);\n  if (area(ps2, n2) < 0) reverse1(ps2, n2);\n  ps1[n1] = ps1[0];\n  ps2[n2] = ps2[0];\n  double res = 0;\n  for (int i = 0; i < n1; i++) {\n    for (int j = 0; j < n2; j++) {\n      res +=\n          intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);\n    }\n  }\n  return res;\n}\n\n__device__ inline void Jarvis(Point* in_poly, int& n_poly) {\n  Point p_max, p_k;\n  int max_index, k_index;\n  int Stack[NMAX] = {}, top1, top2;\n  double sign;\n  Point right_point[10], left_point[10];\n\n  for (int i = 0; i < n_poly; i++) {\n    if (in_poly[i].y < in_poly[0].y ||\n        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {\n      Point* j = &(in_poly[0]);\n      Point* k = &(in_poly[i]);\n      swap1(j, k);\n    }\n    if (i == 0) {\n      p_max = in_poly[0];\n      
max_index = 0;\n    }\n    if (in_poly[i].y > p_max.y ||\n        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {\n      p_max = in_poly[i];\n      max_index = i;\n    }\n  }\n\n  if (max_index == 0) {\n    max_index = 1;\n    p_max = in_poly[max_index];\n  }\n\n  k_index = 0, Stack[0] = 0, top1 = 0;\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);\n      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >\n                                         dis(in_poly[Stack[top1]], p_k)))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top1++;\n    Stack[top1] = k_index;\n  }\n  for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];\n\n  k_index = 0, Stack[0] = 0, top2 = 0;\n\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);\n      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >\n                                        dis(in_poly[Stack[top2]], p_k))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top2++;\n    Stack[top2] = k_index;\n  }\n  for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];\n\n  for (int i = 0; i < top1 + top2; i++) {\n    if (i <= top1) {\n      in_poly[i] = right_point[i];\n    } else {\n      in_poly[i] = left_point[top2 - (i - top1)];\n    }\n  }\n  n_poly = top1 + top2;\n}\n\n__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,\n                                           int n2, double* grad_C) {\n  Point polygon[MAXN];\n  int n = n1 + n2, n_poly = 0;\n  for (int i = 0; i < n1; i++) {\n    for (int j = 0; j < n - n1; j++) {\n      if (point_same(ps1[i], ps2[j])) {\n        for (int k = j; k < n - n1 - 1; k++) {\n          ps2[k] = ps2[k + 1];\n 
       }\n        n2--;\n        break;\n      }\n    }\n  }\n  n_poly = n1 + n2;\n  for (int i = 0; i < n_poly; i++) {\n    if (i < n1) {\n      polygon[i] = ps1[i];\n    } else {\n      polygon[i] = ps2[i - n1];\n    }\n  }\n\n  Jarvis(polygon, n_poly);\n\n  int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,\n                                   -1, -1, -1, -1, -1, -1, -1, -1, -1};\n  int n_pred = 0;\n  for (int i = 0; i < n_poly; i++) {\n    for (int j = 0; j < n1; j++) {\n      if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {\n        polygon_to_pred_index[n_pred] = i;\n        polygon_to_pred_index[n_pred + n1] = j;\n        n_pred += 1;\n        break;\n      }\n    }\n  }\n  if (n_pred == 0) {\n    double polygon_area = fabs(area(polygon, n_poly));\n    for (int i = 0; i < 18; i++) {\n      grad_C[i] = 0.0;\n    }\n    return polygon_area;\n  } else {\n    double polygon_area =\n        polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);\n    if (polygon_area < 0) {\n      for (int i = 0; i < 18; i++) {\n        grad_C[i] = -grad_C[i];\n      }\n    }\n    return fabs(polygon_area);\n  }\n}\n\n// convex_find and get the polygon_index_box_index\n__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,\n                                        int* points_to_convex_ind) {\n  int n_input = n_poly;\n  Point input_poly[20];\n  for (int i = 0; i < n_input; i++) {\n    input_poly[i].x = in_poly[i].x;\n    input_poly[i].y = in_poly[i].y;\n  }\n  Point p_max, p_k;\n  int max_index, k_index;\n  int Stack[20], top1, top2;\n  double sign;\n  Point right_point[10], left_point[10];\n\n  for (int i = 0; i < n_poly; i++) {\n    if (in_poly[i].y < in_poly[0].y ||\n        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {\n      Point* j = &(in_poly[0]);\n      Point* k = &(in_poly[i]);\n      swap1(j, k);\n    }\n    if (i == 0) {\n      p_max = in_poly[0];\n      max_index = 0;\n    }\n    if 
(in_poly[i].y > p_max.y ||\n        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {\n      p_max = in_poly[i];\n      max_index = i;\n    }\n  }\n  if (max_index == 0) {\n    max_index = 1;\n    p_max = in_poly[max_index];\n  }\n\n  k_index = 0, Stack[0] = 0, top1 = 0;\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);\n      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >\n                                         dis(in_poly[Stack[top1]], p_k)))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top1++;\n    Stack[top1] = k_index;\n  }\n  for (int i = 0; i <= top1; i++) {\n    right_point[i] = in_poly[Stack[i]];\n  }\n\n  k_index = 0, Stack[0] = 0, top2 = 0;\n\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);\n      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >\n                                        dis(in_poly[Stack[top2]], p_k))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top2++;\n    Stack[top2] = k_index;\n  }\n\n  for (int i = top2 - 1; i >= 0; i--) {\n    left_point[i] = in_poly[Stack[i]];\n  }\n\n  for (int i = 0; i < top1 + top2; i++) {\n    if (i <= top1) {\n      in_poly[i] = right_point[i];\n    } else {\n      in_poly[i] = left_point[top2 - (i - top1)];\n    }\n  }\n  n_poly = top1 + top2;\n  for (int i = 0; i < n_poly; i++) {\n    for (int j = 0; j < n_input; j++) {\n      if (point_same(in_poly[i], input_poly[j])) {\n        points_to_convex_ind[i] = j;\n        break;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__device__ inline float devrIoU(T const* const p, T const* const q,\n                                T* point_grad, const int idx) {\n  Point ps1[MAXN], ps2[MAXN];\n\n  Point 
convex[MAXN];\n  for (int i = 0; i < 9; i++) {\n    convex[i].x = (double)p[i * 2];\n    convex[i].y = (double)p[i * 2 + 1];\n  }\n  int n_convex = 9;\n  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};\n  Jarvis_and_index(convex, n_convex, points_to_convex_ind);\n\n  int n1 = n_convex;\n  int n2 = 4;\n\n  for (int i = 0; i < n1; i++) {\n    ps1[i].x = (double)convex[i].x;\n    ps1[i].y = (double)convex[i].y;\n  }\n\n  for (int i = 0; i < n2; i++) {\n    ps2[i].x = (double)q[i * 2];\n    ps2[i].y = (double)q[i * 2 + 1];\n  }\n\n  int polygon_index_box_index[18];\n  for (int i = 0; i < n1; i++) {\n    polygon_index_box_index[i] = i;\n    polygon_index_box_index[i + n1] = i;\n  }\n\n  double grad_A[18] = {};\n  double grad_AB[18] = {};\n  double grad_C[18] = {};\n\n  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);\n  double S_pred =\n      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);\n  if (S_pred < 0) {\n    for (int i = 0; i < n_convex * 2; i++) {\n      grad_A[i] = -grad_A[i];\n    }\n  }\n  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;\n\n  double iou = inter_area / union_area;\n  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);\n\n  //    printf(\"%d:live\\n\", idx);\n  double rot_giou = iou - (polygon_area - union_area) / polygon_area;\n\n  float grad_point_temp[18] = {};\n\n  for (int i = 0; i < n_convex; i++) {\n    int grad_point = points_to_convex_ind[i];\n    grad_point_temp[2 * grad_point] =\n        (float)((union_area + inter_area) / (union_area * union_area) *\n                    grad_AB[2 * i] -\n                iou / union_area * grad_A[2 * i] -\n                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -\n                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);\n    grad_point_temp[2 * grad_point + 1] =\n        (float)((union_area + inter_area) / (union_area * union_area) *\n                    grad_AB[2 * i + 1] -\n         
       iou / union_area * grad_A[2 * i + 1] -\n                1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -\n                (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);\n  }\n\n  for (int i = 0; i < 9; i++) {\n    point_grad[2 * i] = grad_point_temp[2 * i];\n    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];\n  }\n  return (float)rot_giou;\n}\n\ntemplate <typename T>\n__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,\n                                        const int gt_n_boxes, const T* ex_boxes,\n                                        const T* gt_boxes, T* point_grad) {\n  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {\n    const T* cur_box = ex_boxes + index * 18;\n    const T* cur_gt_box = gt_boxes + index * 8;\n    T* cur_grad = point_grad + index * 19;\n    T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);\n    cur_grad[18] = giou;\n  }\n}\n\n__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {\n  double s1, s2;\n  s1 = cross(a, b, c);\n  s2 = cross(a, b, d);\n  if (sig(s1) == 0 && sig(s2) == 0) return 2;\n  if (sig(s2 - s1) == 0) return 0;\n  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);\n  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);\n  return 1;\n}\n\n__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {\n  Point pp[MAXN];\n  int m = 0;\n  p[n] = p[0];\n  for (int i = 0; i < n; i++) {\n    if (sig(cross(a, b, p[i])) > 0) {\n      pp[m] = p[i];\n      m++;\n    }\n    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {\n      lineCross(a, b, p[i], p[i + 1], pp[m]);\n      m++;\n    }\n  }\n  n = 0;\n  for (int i = 0; i < m; i++) {\n    if (!i || !(point_same(pp[i], pp[i - 1]))) {\n      p[n] = pp[i];\n      n++;\n    }\n  }\n\n  while (n > 1 && point_same(p[n - 1], p[0])) n--;\n}\n\n__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {\n  Point o(0, 0);\n  int s1 = sig(cross(o, a, b));\n  int s2 = sig(cross(o, c, d));\n  if (s1 
== 0 || s2 == 0) return 0.0;\n  if (s1 == -1) {\n    Point* i = &a;\n    Point* j = &b;\n    swap1(i, j);\n  }\n  if (s2 == -1) {\n    Point* i = &c;\n    Point* j = &d;\n    swap1(i, j);\n  }\n  Point p[10] = {o, a, b};\n  int n = 3;\n\n  polygon_cut(p, n, o, c);\n  polygon_cut(p, n, c, d);\n  polygon_cut(p, n, d, o);\n  double res = area(p, n);\n  if (s1 * s2 == -1) res = -res;\n  return res;\n}\n__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,\n                                        int n2) {\n  if (area(ps1, n1) < 0) reverse1(ps1, n1);\n  if (area(ps2, n2) < 0) reverse1(ps2, n2);\n  ps1[n1] = ps1[0];\n  ps2[n2] = ps2[0];\n  double res = 0;\n  for (int i = 0; i < n1; i++) {\n    for (int j = 0; j < n2; j++) {\n      res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);\n    }\n  }\n  return res;\n}\n\ntemplate <typename T>\n__device__ inline float devrIoU(T const* const p, T const* const q) {\n  Point ps1[MAXN], ps2[MAXN];\n  Point convex[MAXN];\n  for (int i = 0; i < 9; i++) {\n    convex[i].x = (double)p[i * 2];\n    convex[i].y = (double)p[i * 2 + 1];\n  }\n  int n_convex = 9;\n  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};\n  Jarvis_and_index(convex, n_convex, points_to_convex_ind);\n  int n1 = n_convex;\n  for (int i = 0; i < n1; i++) {\n    ps1[i].x = (double)convex[i].x;\n    ps1[i].y = (double)convex[i].y;\n  }\n  int n2 = 4;\n  for (int i = 0; i < n2; i++) {\n    ps2[i].x = (double)q[i * 2];\n    ps2[i].y = (double)q[i * 2 + 1];\n  }\n  double inter_area = intersectAreaO(ps1, n1, ps2, n2);\n  double S_pred = area(ps1, n1);\n  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;\n  double iou = inter_area / union_area;\n  return (float)iou;\n}\n\ntemplate <typename T>\n__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,\n                                       const int gt_n_boxes, const T* ex_boxes,\n                                       const T* gt_boxes, T* iou) {\n  
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {\n    const T* cur_box = ex_boxes + index * 18;\n    for (int i = 0; i < gt_n_boxes; i++) {\n      iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);\n    }\n  }\n}\n#endif  // CONVEX_IOU_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/correlation_cuda.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu\n// Original licence: Under MIT License\n\n#ifndef CORRELATION_CUDA\n#define CORRELATION_CUDA\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#include <cuda.h>\n#include <cuda_runtime.h>\n// Using <torch/extension.h> is recommended in the official documentation in\n// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.\n// However, we use <torch/types.h> for compatibility with CUDA 9.0\n// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.\n#include <torch/types.h>\n\n#include <iostream>\n#include <vector>\n\nusing namespace torch;\n\n#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>\n#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>\n#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)\n\n#define WARP_SIZE 32\n#define FULL_MASK 0xffffffff\n\ntemplate <typename scalar_t>\n__global__ void correlation_forward_cuda_kernel(\n    const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,\n    int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,\n    int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,\n    int oH, int oW) {\n  const int iH = rInput1.size(1);\n  const int iW = rInput1.size(2);\n  const int C = rInput1.size(3);\n\n  const int n = blockIdx.x;\n  const int h = blockIdx.y * blockDim.y + threadIdx.y;\n  const int w = blockIdx.z * blockDim.z + threadIdx.z;\n\n  if (h >= oH || w >= oW) return;\n\n  const int thread = threadIdx.x;\n\n  const int start_i = -padH + h * dH;\n  const int start_j = -padW + w * dW;\n\n  const int patchRadH = dilation_patchH * (patchH - 1) / 2;\n  const int patchRadW = dilation_patchW * 
(patchW - 1) / 2;\n\n  for (int ph = 0; ph < patchH; ++ph) {\n    int ph_dilated = ph * dilation_patchH - patchRadH;\n    for (int pw = 0; pw < patchW; ++pw) {\n      int pw_dilated = pw * dilation_patchW - patchRadW;\n      scalar_t prod_sum = 0.0f;\n      for (int i = 0; i < kH; ++i) {\n        int i1 = start_i + i * dilationH;\n        int i2 = i1 + ph_dilated;\n        if (WITHIN_BOUNDS(i1, i2, iH, iH)) {\n          for (int j = 0; j < kW; ++j) {\n            int j1 = start_j + j * dilationW;\n            int j2 = j1 + pw_dilated;\n            if (WITHIN_BOUNDS(j1, j2, iW, iW)) {\n              for (int c = thread; c < C; c += WARP_SIZE) {\n                scalar_t v1 = rInput1[n][i1][j1][c];\n                scalar_t v2 = rInput2[n][i2][j2][c];\n                prod_sum += v1 * v2;\n              }\n            }\n          }\n        }\n      }\n      // accumulate\n      for (int offset = 16; offset > 0; offset /= 2)\n#ifdef MMCV_WITH_HIP\n        prod_sum += __shfl_down(float(prod_sum), offset);\n#else\n        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);\n#endif\n      if (thread == 0) {\n        output[n][ph][pw][h][w] = prod_sum;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void correlation_backward_cuda_kernel_input1(\n    const TensorAcc5R grad_output, const TensorAcc4R input2,\n    TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,\n    const int patchW, const int padH, const int padW, const int dilationH,\n    const int dilationW, const int dilation_patchH, const int dilation_patchW,\n    const int dH, const int dW) {\n  const int iH = input2.size(1);\n  const int iW = input2.size(2);\n  const int C = input2.size(3);\n\n  const int H = grad_output.size(3);\n  const int W = grad_output.size(4);\n\n  const int patchRadH = (patchH - 1) / 2;\n  const int patchRadW = (patchW - 1) / 2;\n\n  const int n = blockIdx.x;\n  const int h = blockIdx.y;\n  const int w = blockIdx.z;\n\n  const int h_2 = 
h + padH;\n  const int w_2 = w + padW;\n  const int min_h = h_2 - kH * dilationH;\n  const int min_w = w_2 - kW * dilationW;\n\n  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];\n  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);\n  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {\n    const int ph = i / patchW;\n    const int pw = i % patchW;\n    int i1 = h + dilation_patchH * (ph - patchRadH);\n    int j1 = w + dilation_patchW * (pw - patchRadW);\n\n    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n      scalar_t grad_val = 0.0f;\n      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {\n        int i2 = (h_3) / dH;\n        if (i2 * dH != h_3) continue;\n        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {\n          int j2 = (w_3) / dW;\n          if (j2 * dW != w_3) continue;\n          if (WITHIN_BOUNDS(i2, j2, H, W)) {\n            grad_val += grad_output[n][ph][pw][i2][j2];\n          }\n        }\n      }\n      grad_cache[i] = grad_val;\n    }\n  }\n  __syncthreads();\n\n  for (int c = threadIdx.x; c < C; c += blockDim.x) {\n    scalar_t grad_input_val = 0.0f;\n    for (int ph = 0; ph < patchH; ++ph) {\n      int i1 = h + dilation_patchH * (ph - patchRadH);\n      for (int pw = 0; pw < patchW; ++pw) {\n        int j1 = w + dilation_patchW * (pw - patchRadW);\n        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n          grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];\n        }\n      }\n    }\n    grad_input1[n][c][h][w] = grad_input_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void correlation_backward_cuda_kernel_input2(\n    const TensorAcc5R grad_output, const TensorAcc4R input1,\n    TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,\n    int padW, int dilationH, int dilationW, int dilation_patchH,\n    int dilation_patchW, int dH, int dW) {\n  const int iH = input1.size(1);\n  const int iW = input1.size(2);\n  const int C = 
input1.size(3);\n\n  const int patchRadH = (patchH - 1) / 2;\n  const int patchRadW = (patchW - 1) / 2;\n\n  const int H = grad_output.size(3);\n  const int W = grad_output.size(4);\n\n  const int dilatedKH = kH * dilationH;\n  const int dilatedKW = kW * dilationW;\n\n  const int n = blockIdx.x;\n  const int h = blockIdx.y;\n  const int w = blockIdx.z;\n\n  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];\n  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);\n  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {\n    const int ph = i / patchW;\n    const int pw = i % patchW;\n    int i1 = h - dilation_patchH * (ph - patchRadH);\n    int j1 = w - dilation_patchW * (pw - patchRadW);\n\n    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n      scalar_t grad_val = 0.0f;\n\n      const int h_2 = i1 + padH;\n      const int w_2 = j1 + padW;\n      const int min_h = h_2 - dilatedKH;\n      const int min_w = w_2 - dilatedKW;\n\n      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {\n        int i2 = (h_3) / dH;\n        if (i2 * dH != h_3) continue;\n        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {\n          int j2 = (w_3) / dW;\n          if (j2 * dW != w_3) continue;\n          if (WITHIN_BOUNDS(i2, j2, H, W)) {\n            grad_val += grad_output[n][ph][pw][i2][j2];\n          }\n        }\n      }\n      grad_cache[i] = grad_val;\n    }\n  }\n  __syncthreads();\n\n  for (int c = threadIdx.x; c < C; c += blockDim.x) {\n    scalar_t grad_input_val = 0.0f;\n    for (int ph = 0; ph < patchH; ++ph) {\n      int i1 = h - dilation_patchH * (ph - patchRadH);\n      for (int pw = 0; pw < patchW; ++pw) {\n        int j1 = w - dilation_patchW * (pw - patchRadW);\n        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n          grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];\n        }\n      }\n    }\n    grad_input2[n][c][h][w] = grad_input_val;\n  }\n}\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh",
    "content": "/*!\n ******************* BEGIN Caffe Copyright Notice and Disclaimer\n *****************\n *\n * COPYRIGHT\n *\n * All contributions by the University of California:\n * Copyright (c) 2014-2017 The Regents of the University of California (Regents)\n * All rights reserved.\n *\n * All other contributions:\n * Copyright (c) 2014-2017, the respective contributors\n * All rights reserved.\n *\n * Caffe uses a shared copyright model: each contributor holds copyright over\n * their contributions to Caffe. The project versioning records all such\n * contribution and copyright details. If a contributor wants to further mark\n * their specific copyright on a particular contribution, they should indicate\n * their copyright solely in the commit message of the change when it is\n * committed.\n *\n * LICENSE\n *\n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted provided that the following conditions are met:\n *\n * 1. Redistributions of source code must retain the above copyright notice,\n *this list of conditions and the following disclaimer.\n * 2. Redistributions in binary form must reproduce the above copyright notice,\n * this list of conditions and the following disclaimer in the documentation\n * and/or other materials provided with the distribution.\n *\n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n *\n * CONTRIBUTION AGREEMENT\n *\n * By contributing to the BVLC/caffe repository through pull-request, comment,\n * or otherwise, the contributor releases their content to the\n * license and copyright terms herein.\n *\n ***************** END Caffe Copyright Notice and Disclaimer\n *********************\n *\n * Copyright (c) 2018 Microsoft\n * Licensed under The MIT License [see LICENSE for details]\n * \\file modulated_deformable_im2col.cuh\n * \\brief Function definitions of converting an image to\n * column matrix based on kernel, padding, dilation, and offset.\n * These functions are mainly used in deformable convolution operators.\n * \\ref: https://arxiv.org/abs/1703.06211\n * \\author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng\n */\n\n// modified from\n// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu\n\n#ifndef DEFORM_CONV_CUDA_KERNEL_CUH\n#define DEFORM_CONV_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\ntemplate <typename T>\n__device__ T deformable_im2col_bilinear(const T *input, const int data_width,\n                                        const int height, const int 
width, T h,\n                                        T w) {\n  if (h <= -1 || height <= h || w <= -1 || width <= w) {\n    return 0;\n  }\n\n  int h_low = floorf(h);\n  int w_low = floorf(w);\n  int h_high = h_low + 1;\n  int w_high = w_low + 1;\n\n  T lh = h - h_low;\n  T lw = w - w_low;\n  T hh = 1 - lh, hw = 1 - lw;\n\n  T v1 = 0;\n  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];\n  T v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n    v2 = input[h_low * data_width + w_high];\n  T v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n    v3 = input[h_high * data_width + w_low];\n  T v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n    v4 = input[h_high * data_width + w_high];\n\n  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename T>\n__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,\n                                 const int w, const int height,\n                                 const int width) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n  if (h == argmax_h_low && w == argmax_w_low)\n    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);\n  if (h == argmax_h_low && w == argmax_w_high)\n    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);\n  if (h == argmax_h_high && w == argmax_w_low)\n    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);\n  if (h == argmax_h_high && w == argmax_w_high)\n    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);\n  return weight;\n}\n\ntemplate <typename T>\n__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,\n                                   const int width, const T *im_data,\n         
                          const int data_width, const int bp_dir) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n\n  if (bp_dir == 0) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += -1 * (argmax_w - argmax_w_low) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_w - argmax_w_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  } else if (bp_dir == 1) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  }\n\n  return weight;\n}\n\ntemplate <typename T>\n__global__ void deformable_im2col_gpu_kernel(\n    const int n, const T *data_im, const T 
*data_offset, const int height,\n    const int width, const int kernel_h, const int kernel_w, const int pad_h,\n    const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int num_channels, const int deformable_group, const int height_col,\n    const int width_col, T *data_col) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    // index index of output matrix\n    const int w_col = index % width_col;\n    const int h_col = (index / width_col) % height_col;\n    const int b_col = (index / width_col / height_col) % batch_size;\n    const int c_im = (index / width_col / height_col) / batch_size;\n    const int c_col = c_im * kernel_h * kernel_w;\n\n    // compute deformable group index\n    const int deformable_group_index = c_im / channel_per_deformable_group;\n\n    const int h_in = h_col * stride_h - pad_h;\n    const int w_in = w_col * stride_w - pad_w;\n    T *data_col_ptr =\n        data_col +\n        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;\n    const T *data_im_ptr =\n        data_im + (b_col * num_channels + c_im) * height * width;\n    const T *data_offset_ptr =\n        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        const int data_offset_h_ptr =\n            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;\n        const int data_offset_w_ptr =\n            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +\n            w_col;\n        const T offset_h = data_offset_ptr[data_offset_h_ptr];\n        const T offset_w = data_offset_ptr[data_offset_w_ptr];\n        T val = static_cast<T>(0);\n        const T h_im = h_in + i * dilation_h + offset_h;\n        const T w_im = w_in + j * 
dilation_w + offset_w;\n        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)\n          val = deformable_im2col_bilinear(data_im_ptr, width, height, width,\n                                           h_im, w_im);\n        *data_col_ptr = val;\n        data_col_ptr += batch_size * height_col * width_col;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void deformable_col2im_gpu_kernel(\n    const int n, const T *data_col, const T *data_offset, const int channels,\n    const int height, const int width, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int deformable_group, const int height_col, const int width_col,\n    T *grad_im) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    const int j = (index / width_col / height_col / batch_size) % kernel_w;\n    const int i =\n        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;\n    const int c =\n        index / width_col / height_col / batch_size / kernel_w / kernel_h;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / channel_per_deformable_group;\n\n    int w_out = index % width_col;\n    int h_out = (index / width_col) % height_col;\n    int b = (index / width_col / height_col) % batch_size;\n    int w_in = w_out * stride_w - pad_w;\n    int h_in = h_out * stride_h - pad_h;\n\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const int data_offset_h_ptr =\n        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;\n    const int data_offset_w_ptr =\n        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;\n    const T offset_h = 
data_offset_ptr[data_offset_h_ptr];\n    const T offset_w = data_offset_ptr[data_offset_w_ptr];\n    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;\n    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;\n\n    const T cur_top_grad = data_col[index];\n    const int cur_h = (int)cur_inv_h_data;\n    const int cur_w = (int)cur_inv_w_data;\n    for (int dy = -2; dy <= 2; dy++) {\n      for (int dx = -2; dx <= 2; dx++) {\n        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&\n            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&\n            abs(cur_inv_w_data - (cur_w + dx)) < 1) {\n          int cur_bottom_grad_pos =\n              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;\n          T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,\n                                         cur_h + dy, cur_w + dx, height, width);\n          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void deformable_col2im_coord_gpu_kernel(\n    const int n, const T *data_col, const T *data_im, const T *data_offset,\n    const int channels, const int height, const int width, const int kernel_h,\n    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,\n    const int stride_w, const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int offset_channels, const int deformable_group, const int height_col,\n    const int width_col, T *grad_offset) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    T val = 0;\n    int w = index % width_col;\n    int h = (index / width_col) % height_col;\n    int c = (index / width_col / height_col) % offset_channels;\n    int b = (index / width_col / height_col) / offset_channels;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / (2 * kernel_h * kernel_w);\n    const 
int col_step = kernel_h * kernel_w;\n    int cnt = 0;\n    const T *data_col_ptr = data_col + deformable_group_index *\n                                           channel_per_deformable_group *\n                                           batch_size * width_col * height_col;\n    const T *data_im_ptr =\n        data_im + (b * deformable_group + deformable_group_index) *\n                      channel_per_deformable_group / kernel_h / kernel_w *\n                      height * width;\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;\n\n    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;\n         col_c += col_step) {\n      const int col_pos =\n          (((col_c * batch_size + b) * height_col) + h) * width_col + w;\n      const int bp_dir = offset_c % 2;\n\n      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;\n      int i =\n          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;\n      int w_out = col_pos % width_col;\n      int h_out = (col_pos / width_col) % height_col;\n      int w_in = w_out * stride_w - pad_w;\n      int h_in = h_out * stride_h - pad_h;\n      const int data_offset_h_ptr =\n          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);\n      const int data_offset_w_ptr =\n          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +\n           w_out);\n      const T offset_h = data_offset_ptr[data_offset_h_ptr];\n      const T offset_w = data_offset_ptr[data_offset_w_ptr];\n      T inv_h = h_in + i * dilation_h + offset_h;\n      T inv_w = w_in + j * dilation_w + offset_w;\n      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)\n        inv_h = inv_w = -2;\n      const T weight = get_coordinate_weight(inv_h, 
inv_w, height, width,\n                                             data_im_ptr + cnt * height * width,\n                                             width, bp_dir);\n      val += weight * data_col_ptr[col_pos];\n      cnt += 1;\n    }\n\n    grad_offset[index] = val;\n  }\n}\n\n#endif  // DEFORM_CONV_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH\n#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void deform_roi_pool_forward_cuda_kernel(\n    const int nthreads, const T* input, const T* rois, const T* offset,\n    T* output, const int pooled_height, const int pooled_width,\n    const T spatial_scale, const int sampling_ratio, const T gamma,\n    const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;\n    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;\n    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;\n    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    const T* offset_input =\n        input + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_height / pooled_height));\n    int roi_bin_grid_w =\n        (sampling_ratio > 0)\n            ? 
sampling_ratio\n            : static_cast<int>(ceilf(roi_width / pooled_width));\n\n    // Compute roi offset\n    if (offset != NULL) {\n      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +\n                              ph * pooled_width + pw;\n      T offset_roi_w = gamma * roi_width * offset_cur_w[0];\n      T offset_roi_h =\n          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];\n      roi_start_w += offset_roi_w;\n      roi_start_h += offset_roi_h;\n    }\n\n    // We do average pooling inside a bin\n    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);\n    T output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n      const T y = roi_start_h + ph * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = roi_start_w + pw * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n        T val = bilinear_interpolate(offset_input, height, width, y, x, index);\n        output_val += val;\n      }\n    }\n    output[index] = output_val / count;\n  }\n}\n\ntemplate <typename T>\n__global__ void deform_roi_pool_backward_cuda_kernel(\n    const int nthreads, const T* grad_output, const T* input, const T* rois,\n    const T* offset, T* grad_input, T* grad_offset, const int pooled_height,\n    const int pooled_width, const T spatial_scale, const int sampling_ratio,\n    const T gamma, const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + 
n * 5;\n    int roi_batch_ind = offset_rois[0];\n    const T* offset_input =\n        input + ((roi_batch_ind * channels + c) * height * width);\n    T* offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    // Do not using rounding; this implementation detail is critical\n    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;\n    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;\n    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;\n    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_height / pooled_height));\n    int roi_bin_grid_w =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_width / pooled_width));\n\n    // Compute roi offset\n    if (offset != NULL) {\n      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +\n                              ph * pooled_width + pw;\n      T offset_roi_w = gamma * roi_width * offset_cur_w[0];\n      T offset_roi_h =\n          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];\n      roi_start_w += offset_roi_w;\n      roi_start_h += offset_roi_h;\n    }\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n    const T grad_output_this_bin = grad_output[index] / count;\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n      const T y = roi_start_h + ph * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = roi_start_w + pw * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_grad_input + y_low * width + x_low,\n                    grad_output_this_bin * w1);\n          atomicAdd(offset_grad_input + y_low * width + x_high,\n                    grad_output_this_bin * w2);\n          atomicAdd(offset_grad_input + y_high * width + x_low,\n                    grad_output_this_bin * w3);\n          atomicAdd(offset_grad_input + y_high * width + x_high,\n                    grad_output_this_bin * w4);\n          if (offset != NULL) {\n            T input_00 = offset_input[y_low * width + x_low];\n            T input_10 = offset_input[y_low * width + x_high];\n            T input_01 = offset_input[y_high * width + x_low];\n            T input_11 = offset_input[y_high * width + x_high];\n            T ogx = gamma * roi_width * grad_output_this_bin *\n                    (input_11 * (y - y_low) + input_10 * (y_high - y) +\n                     input_01 * (y_low - y) + input_00 * (y - y_high));\n            T ogy = gamma * roi_height * grad_output_this_bin *\n                    (input_11 * (x - x_low) + input_01 * (x_high - x) +\n                     input_10 * (x_low - x) + input_00 * (x - x_high));\n            
atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +\n                          ph * pooled_width + pw,\n                      ogx);\n            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +\n                          pooled_width * pooled_height + ph * pooled_width + pw,\n                      ogy);\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif  // DEFORM_ROI_POOL_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Adapted from\n// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#define MAX_NUM_VERT_IDX 9\n#define INTERSECTION_OFFSET 8\n#define EPSILON 1e-8\n\ninline int opt_n_thread(int work_size) {\n  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);\n  return max(min(1 << pow_2, THREADS_PER_BLOCK), 1);\n}\n\n/*\ncompare normalized vertices (vertices around (0,0))\nif vertex1 < vertex2 return true.\norder: minimum at x-aixs, become larger in anti-clockwise direction\n*/\n__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {\n  if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON)\n    return false;  // if equal, return false\n\n  if (y1 > 0 && y2 < 0) return true;\n  if (y1 < 0 && y2 > 0) return false;\n\n  float n1 = x1 * x1 + y1 * y1 + EPSILON;\n  float n2 = x2 * x2 + y2 * y2 + EPSILON;\n  float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2;\n\n  if (y1 > 0 && y2 > 0) {\n    if (diff > EPSILON)\n      return true;\n    else\n      return false;\n  }\n  if (y1 < 0 && y2 < 0) {\n    if (diff < EPSILON)\n      return true;\n    else\n      return false;\n  }\n  return false;\n}\n\n__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel(\n    int b, int n, int m, const float *__restrict__ vertices,\n    const bool *__restrict__ mask, const int *__restrict__ num_valid,\n    int *__restrict__ idx) {\n  int batch_idx = blockIdx.x;\n  vertices += batch_idx * n * m * 2;\n  mask += batch_idx * n * m;\n  num_valid += batch_idx * n;\n  idx += batch_idx * n * MAX_NUM_VERT_IDX;\n\n  int index = threadIdx.x;  // index of polygon\n  int stride = blockDim.x;\n  for (int i = index; i < n; i += stride) {\n    int pad;  // index of arbitrary invalid intersection point (not box corner!)\n    for (int j = 
INTERSECTION_OFFSET; j < m; ++j) {\n      if (!mask[i * m + j]) {\n        pad = j;\n        break;\n      }\n    }\n    if (num_valid[i] < 3) {\n      // not enough vertices, take an invalid intersection point\n      // (zero padding)\n      for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {\n        idx[i * MAX_NUM_VERT_IDX + j] = pad;\n      }\n    } else {\n      // sort the valid vertices\n      // note the number of valid vertices is known\n      // note: check that num_valid[i] < MAX_NUM_VERT_IDX\n      for (int j = 0; j < num_valid[i]; ++j) {\n        // initialize with a \"big\" value\n        float x_min = 1;\n        float y_min = -EPSILON;\n        int i_take = 0;\n        int i2;\n        float x2, y2;\n        if (j != 0) {\n          i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];\n          x2 = vertices[i * m * 2 + i2 * 2 + 0];\n          y2 = vertices[i * m * 2 + i2 * 2 + 1];\n        }\n        for (int k = 0; k < m; ++k) {\n          float x = vertices[i * m * 2 + k * 2 + 0];\n          float y = vertices[i * m * 2 + k * 2 + 1];\n          if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {\n            if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {\n              x_min = x;\n              y_min = y;\n              i_take = k;\n            }\n          }\n        }\n        idx[i * MAX_NUM_VERT_IDX + j] = i_take;\n      }\n      // duplicate the first idx\n      idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];\n\n      // pad zeros\n      for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {\n        idx[i * MAX_NUM_VERT_IDX + j] = pad;\n      }\n\n      // for corner case: the two boxes are exactly the same.\n      // in this case, idx would have duplicate elements, which makes the\n      // shoelace formula broken because of the definition, the duplicate\n      // elements only appear in the first 8 positions (they are \"corners in\n      // box\", not \"intersection of edges\")\n      if 
(num_valid[i] == 8) {\n        int counter = 0;\n        for (int j = 0; j < 4; ++j) {\n          int check = idx[i * MAX_NUM_VERT_IDX + j];\n          for (int k = 4; k < INTERSECTION_OFFSET; ++k) {\n            if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;\n          }\n        }\n        if (counter == 4) {\n          idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];\n          for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {\n            idx[i * MAX_NUM_VERT_IDX + j] = pad;\n          }\n        }\n      }\n\n      // TODO: still might need to cover some other corner cases :(\n    }\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH\n#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,\n                         int idx1, int idx2) {\n  const float v1 = dists[idx1], v2 = dists[idx2];\n  const int i1 = dists_i[idx1], i2 = dists_i[idx2];\n  dists[idx1] = max(v1, v2);\n  dists_i[idx1] = v2 > v1 ? i2 : i1;\n}\n\ntemplate <unsigned int block_size>\n__global__ void furthest_point_sampling_forward_cuda_kernel(\n    int b, int n, int m, const float *__restrict__ dataset,\n    float *__restrict__ temp, int *__restrict__ idxs) {\n  // dataset: (B, N, 3)\n  // tmp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  if (m <= 0) return;\n  __shared__ float dists[block_size];\n  __shared__ int dists_i[block_size];\n\n  int batch_index = blockIdx.x;\n  dataset += batch_index * n * 3;\n  temp += batch_index * n;\n  idxs += batch_index * m;\n\n  int tid = threadIdx.x;\n  const int stride = block_size;\n\n  int old = 0;\n  if (threadIdx.x == 0) idxs[0] = old;\n\n  __syncthreads();\n  for (int j = 1; j < m; j++) {\n    int besti = 0;\n    float best = -1;\n    float x1 = dataset[old * 3 + 0];\n    float y1 = dataset[old * 3 + 1];\n    float z1 = dataset[old * 3 + 2];\n    for (int k = tid; k < n; k += stride) {\n      float x2, y2, z2;\n      x2 = dataset[k * 3 + 0];\n      y2 = dataset[k * 3 + 1];\n      z2 = dataset[k * 3 + 2];\n      // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);\n      // if (mag <= 1e-3)\n      // continue;\n\n      float d =\n          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);\n      float d2 = min(d, temp[k]);\n      temp[k] = d2;\n      besti = d2 > best ? k : besti;\n      best = d2 > best ? 
d2 : best;\n    }\n    dists[tid] = best;\n    dists_i[tid] = besti;\n    __syncthreads();\n\n#pragma unroll\n    for (int block_size_thres = 1024; block_size_thres >= 2;\n         block_size_thres >>= 1) {\n      const int tid_thres = block_size_thres / 2;\n      if (block_size >= block_size_thres && tid < tid_thres) {\n        __update(dists, dists_i, tid, tid + tid_thres);\n      }\n      __syncthreads();\n    }\n\n    old = dists_i[0];\n    if (tid == 0) idxs[j] = old;\n  }\n}\n\n// Modified from\n// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu\ntemplate <unsigned int block_size>\n__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel(\n    int b, int n, int m, const float *__restrict__ dataset,\n    float *__restrict__ temp, int *__restrict__ idxs) {\n  // dataset: (B, N, N)\n  // tmp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  if (m <= 0) return;\n  __shared__ float dists[block_size];\n  __shared__ int dists_i[block_size];\n\n  int batch_index = blockIdx.x;\n  dataset += batch_index * n * n;\n  temp += batch_index * n;\n  idxs += batch_index * m;\n\n  int tid = threadIdx.x;\n  const int stride = block_size;\n\n  int old = 0;\n  if (threadIdx.x == 0) idxs[0] = old;\n\n  __syncthreads();\n  for (int j = 1; j < m; j++) {\n    int besti = 0;\n    float best = -1;\n    // float x1 = dataset[old * 3 + 0];\n    // float y1 = dataset[old * 3 + 1];\n    // float z1 = dataset[old * 3 + 2];\n    for (int k = tid; k < n; k += stride) {\n      // float x2, y2, z2;\n      // x2 = dataset[k * 3 + 0];\n      // y2 = dataset[k * 3 + 1];\n      // z2 = dataset[k * 3 + 2];\n\n      // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) *\n      // (z2 - z1);\n      float d = dataset[old * n + k];\n\n      float d2 = min(d, temp[k]);\n      temp[k] = d2;\n      besti = d2 > best ? k : besti;\n      best = d2 > best ? 
d2 : best;\n    }\n    dists[tid] = best;\n    dists_i[tid] = besti;\n    __syncthreads();\n\n#pragma unroll\n    for (int block_size_thres = 1024; block_size_thres >= 2;\n         block_size_thres >>= 1) {\n      const int tid_thres = block_size_thres / 2;\n      if (block_size >= block_size_thres && tid < tid_thres) {\n        __update(dists, dists_i, tid, tid + tid_thres);\n      }\n      __syncthreads();\n    }\n\n    old = dists_i[0];\n    if (tid == 0) idxs[j] = old;\n  }\n}\n\n#endif  // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef GATHER_POINTS_CUDA_KERNEL_CUH\n#define GATHER_POINTS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#define TOTAL_THREADS 1024\n\ntemplate <typename T>\n__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,\n                                                  const T *points,\n                                                  const int *__restrict__ idx,\n                                                  T *out) {\n  // points: (B, C, N)\n  // idx: (B, M)\n  // output:\n  //      out: (B, C, M)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    out += bs_idx * c * m + c_idx * m + pt_idx;\n    idx += bs_idx * m + pt_idx;\n    points += bs_idx * c * n + c_idx * n;\n    out[0] = points[idx[0]];\n  }\n}\n\ntemplate <typename T>\n__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,\n                                                   const T *grad_out,\n                                                   const int *__restrict__ idx,\n                                                   T *grad_points) {\n  // grad_out: (B, C, M)\n  // idx: (B, M)\n  // output:\n  //      grad_points: (B, C, N)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    grad_out += bs_idx * c * m + c_idx * m + pt_idx;\n    idx += bs_idx * m + pt_idx;\n    grad_points += bs_idx * c * n + c_idx * n;\n\n    atomicAdd(grad_points + idx[0], grad_out[0]);\n  }\n}\n\n#endif  // GATHER_POINTS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#ifndef GROUP_POINTS_CUDA_KERNEL_CUH\n#define GROUP_POINTS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void group_points_forward_cuda_kernel(int b, int c, int n,\n                                                 int npoints, int nsample,\n                                                 const T *points,\n                                                 const int *__restrict__ idx,\n                                                 T *out) {\n  // points: (B, C, N)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      out: (B, C, npoints, nsample)\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    int pt_idx = index / nsample;\n    int sample_idx = index % nsample;\n\n    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;\n    int in_idx = bs_idx * c * n + c_idx * n + idx[0];\n    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +\n                  pt_idx * nsample + sample_idx;\n\n    out[out_idx] = points[in_idx];\n  }\n}\n\ntemplate <typename T>\n__global__ void group_points_backward_cuda_kernel(int b, int c, int n,\n                                                  int npoints, int nsample,\n                                                  const T *grad_out,\n                                                  const int *__restrict__ idx,\n                                                  T *grad_points) {\n  // grad_out: (B, C, npoints, nsample)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      grad_points: (B, C, N)\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(index, 
npoints * nsample) {\n    int pt_idx = index / nsample;\n    if (bs_idx >= b || c_idx >= c) return;\n\n    int sample_idx = index % nsample;\n    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +\n                pt_idx * nsample + sample_idx;\n    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;\n\n    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);\n  }\n}\n\n#endif  // GROUP_POINTS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef IOU3D_CUDA_KERNEL_CUH\n#define IOU3D_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\nconst int THREADS_PER_BLOCK_IOU3D = 16;\nconst int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;\n__device__ const float EPS = 1e-8;\n\nstruct Point {\n  float x, y;\n  __device__ Point() {}\n  __device__ Point(double _x, double _y) { x = _x, y = _y; }\n\n  __device__ void set(float _x, float _y) {\n    x = _x;\n    y = _y;\n  }\n\n  __device__ Point operator+(const Point &b) const {\n    return Point(x + b.x, y + b.y);\n  }\n\n  __device__ Point operator-(const Point &b) const {\n    return Point(x - b.x, y - b.y);\n  }\n};\n\n__device__ inline float cross(const Point &a, const Point &b) {\n  return a.x * b.y - a.y * b.x;\n}\n\n__device__ inline float cross(const Point &p1, const Point &p2,\n                              const Point &p0) {\n  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);\n}\n\n__device__ int check_rect_cross(const Point &p1, const Point &p2,\n                                const Point &q1, const Point &q2) {\n  int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&\n            min(q1.x, q2.x) <= max(p1.x, p2.x) &&\n            min(p1.y, p2.y) <= max(q1.y, q2.y) &&\n            min(q1.y, q2.y) <= max(p1.y, p2.y);\n  return ret;\n}\n\n__device__ inline int check_in_box2d(const float *box, const Point &p) {\n  // params: box (7) [x, y, z, dx, dy, dz, heading]\n  const float MARGIN = 1e-2;\n\n  float center_x = box[0], center_y = box[1];\n  // rotate the point in the opposite direction of box\n  float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);\n  float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);\n  float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;\n\n  return (fabs(rot_x) < box[3] / 2 + MARGIN &&\n          fabs(rot_y) 
< box[4] / 2 + MARGIN);\n}\n\n__device__ inline int intersection(const Point &p1, const Point &p0,\n                                   const Point &q1, const Point &q0,\n                                   Point &ans_point) {\n  // fast exclusion\n  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;\n\n  // check cross standing\n  float s1 = cross(q0, p1, p0);\n  float s2 = cross(p1, q1, p0);\n  float s3 = cross(p0, q1, q0);\n  float s4 = cross(q1, p1, q0);\n\n  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;\n\n  // calculate intersection of two lines\n  float s5 = cross(q1, p1, p0);\n  if (fabs(s5 - s1) > EPS) {\n    ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);\n    ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);\n\n  } else {\n    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;\n    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;\n    float D = a0 * b1 - a1 * b0;\n\n    ans_point.x = (b0 * c1 - b1 * c0) / D;\n    ans_point.y = (a1 * c0 - a0 * c1) / D;\n  }\n\n  return 1;\n}\n\n__device__ inline void rotate_around_center(const Point &center,\n                                            const float angle_cos,\n                                            const float angle_sin, Point &p) {\n  float new_x =\n      (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x;\n  float new_y =\n      (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;\n  p.set(new_x, new_y);\n}\n\n__device__ inline int point_cmp(const Point &a, const Point &b,\n                                const Point &center) {\n  return atan2(a.y - center.y, a.x - center.x) >\n         atan2(b.y - center.y, b.x - center.x);\n}\n\n__device__ inline float box_overlap(const float *box_a, const float *box_b) {\n  // params box_a: [x, y, z, dx, dy, dz, heading]\n  // params box_b: [x, y, z, dx, dy, dz, heading]\n\n  float a_angle = box_a[6], b_angle = box_b[6];\n  float a_dx_half = box_a[3] / 2, b_dx_half = 
box_b[3] / 2,\n        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;\n  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;\n  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;\n  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;\n  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;\n\n  Point center_a(box_a[0], box_a[1]);\n  Point center_b(box_b[0], box_b[1]);\n\n  Point box_a_corners[5];\n  box_a_corners[0].set(a_x1, a_y1);\n  box_a_corners[1].set(a_x2, a_y1);\n  box_a_corners[2].set(a_x2, a_y2);\n  box_a_corners[3].set(a_x1, a_y2);\n\n  Point box_b_corners[5];\n  box_b_corners[0].set(b_x1, b_y1);\n  box_b_corners[1].set(b_x2, b_y1);\n  box_b_corners[2].set(b_x2, b_y2);\n  box_b_corners[3].set(b_x1, b_y2);\n\n  // get oriented corners\n  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);\n  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);\n\n  for (int k = 0; k < 4; k++) {\n    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);\n    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);\n  }\n\n  box_a_corners[4] = box_a_corners[0];\n  box_b_corners[4] = box_b_corners[0];\n\n  // get intersection of lines\n  Point cross_points[16];\n  Point poly_center;\n  int cnt = 0, flag = 0;\n\n  poly_center.set(0, 0);\n  for (int i = 0; i < 4; i++) {\n    for (int j = 0; j < 4; j++) {\n      flag = intersection(box_a_corners[i + 1], box_a_corners[i],\n                          box_b_corners[j + 1], box_b_corners[j],\n                          cross_points[cnt]);\n      if (flag) {\n        poly_center = poly_center + cross_points[cnt];\n        cnt++;\n      }\n    }\n  }\n\n  // check corners\n  for (int k = 0; k < 4; k++) {\n    if (check_in_box2d(box_a, box_b_corners[k])) {\n      poly_center = poly_center + box_b_corners[k];\n      cross_points[cnt] = box_b_corners[k];\n      cnt++;\n    }\n    if (check_in_box2d(box_b, box_a_corners[k])) {\n   
   poly_center = poly_center + box_a_corners[k];\n      cross_points[cnt] = box_a_corners[k];\n      cnt++;\n    }\n  }\n\n  poly_center.x /= cnt;\n  poly_center.y /= cnt;\n\n  // sort the points of polygon\n  Point temp;\n  for (int j = 0; j < cnt - 1; j++) {\n    for (int i = 0; i < cnt - j - 1; i++) {\n      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {\n        temp = cross_points[i];\n        cross_points[i] = cross_points[i + 1];\n        cross_points[i + 1] = temp;\n      }\n    }\n  }\n\n  // get the overlap areas\n  float area = 0;\n  for (int k = 0; k < cnt - 1; k++) {\n    area += cross(cross_points[k] - cross_points[0],\n                  cross_points[k + 1] - cross_points[0]);\n  }\n\n  return fabs(area) / 2.0;\n}\n\n__device__ inline float iou_bev(const float *box_a, const float *box_b) {\n  // params box_a: [x, y, z, dx, dy, dz, heading]\n  // params box_b: [x, y, z, dx, dy, dz, heading]\n  float sa = box_a[3] * box_a[4];\n  float sb = box_b[3] * box_b[4];\n  float s_overlap = box_overlap(box_a, box_b);\n  return s_overlap / fmaxf(sa + sb - s_overlap, EPS);\n}\n\n__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(\n    const int num_a, const float *boxes_a, const int num_b,\n    const float *boxes_b, float *ans_overlap) {\n  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]\n  CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {\n    if (a_idx >= num_a || b_idx >= num_b) {\n      return;\n    }\n\n    const float *cur_box_a = boxes_a + a_idx * 7;\n    const float *cur_box_b = boxes_b + b_idx * 7;\n    float cur_overlap = box_overlap(cur_box_a, cur_box_b);\n    ans_overlap[a_idx * num_b + b_idx] = cur_overlap;\n  }\n}\n\n__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num,\n                                                const float nms_overlap_thresh,\n                                                const float *boxes,\n                      
                          unsigned long long *mask) {\n  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params: mask (N, N/THREADS_PER_BLOCK_NMS)\n  const int blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {\n    // if (row_start > col_start) return;\n\n    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n\n    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];\n\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 7 + 0] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];\n      block_boxes[threadIdx.x * 7 + 1] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];\n      block_boxes[threadIdx.x * 7 + 2] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];\n      block_boxes[threadIdx.x * 7 + 3] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];\n      block_boxes[threadIdx.x * 7 + 4] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];\n      block_boxes[threadIdx.x * 7 + 5] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];\n      block_boxes[threadIdx.x * 7 + 6] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;\n      const float *cur_box = boxes + cur_box_idx * 7;\n\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        if 
(iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks =\n          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n      mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n__device__ inline float iou_normal(float const *const a, float const *const b) {\n  // params: a: [x, y, z, dx, dy, dz, heading]\n  // params: b: [x, y, z, dx, dy, dz, heading]\n\n  float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),\n        right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);\n  float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),\n        bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);\n  float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);\n  float interS = width * height;\n  float Sa = a[3] * a[4];\n  float Sb = b[3] * b[4];\n  return interS / fmaxf(Sa + Sb - interS, EPS);\n}\n\n__global__ void iou3d_nms3d_normal_forward_cuda_kernel(\n    const int boxes_num, const float nms_overlap_thresh, const float *boxes,\n    unsigned long long *mask) {\n  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params: mask (N, N/THREADS_PER_BLOCK_NMS)\n\n  const int blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {\n    // if (row_start > col_start) return;\n\n    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n\n    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];\n\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 7 + 0] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];\n      block_boxes[threadIdx.x * 7 + 1] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + 
threadIdx.x) * 7 + 1];\n      block_boxes[threadIdx.x * 7 + 2] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];\n      block_boxes[threadIdx.x * 7 + 3] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];\n      block_boxes[threadIdx.x * 7 + 4] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];\n      block_boxes[threadIdx.x * 7 + 5] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];\n      block_boxes[threadIdx.x * 7 + 6] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;\n      const float *cur_box = boxes + cur_box_idx * 7;\n\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks =\n          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n      mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n#endif  // IOU3D_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap\n#ifndef KNN_CUDA_KERNEL_CUH\n#define KNN_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ninline __device__ void swap_float(float *x, float *y) {\n  float tmp = *x;\n  *x = *y;\n  *y = tmp;\n}\n\ninline __device__ void swap_int(int *x, int *y) {\n  int tmp = *x;\n  *x = *y;\n  *y = tmp;\n}\n\n__device__ void reheap(float *dist, int *idx, int k) {\n  int root = 0;\n  int child = root * 2 + 1;\n  while (child < k) {\n    if (child + 1 < k && dist[child + 1] > dist[child]) child++;\n    if (dist[root] > dist[child]) return;\n    swap_float(&dist[root], &dist[child]);\n    swap_int(&idx[root], &idx[child]);\n    root = child;\n    child = root * 2 + 1;\n  }\n}\n\n__device__ void heap_sort(float *dist, int *idx, int k) {\n  int i;\n  for (i = k - 1; i > 0; i--) {\n    swap_float(&dist[0], &dist[i]);\n    swap_int(&idx[0], &idx[i]);\n    reheap(dist, idx, i);\n  }\n}\n\n// input: xyz (b, n, 3) new_xyz (b, m, 3)\n// output: idx (b, m, nsample) dist2 (b, m, nsample)\ntemplate <typename T>\n__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,\n                                        const T *xyz, const T *new_xyz,\n                                        int *__restrict__ idx, T *dist2) {\n  int bs_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b) return;\n\n    new_xyz += bs_idx * m * 3 + pt_idx * 3;\n    xyz += bs_idx * n * 3;\n    idx += bs_idx * m * nsample + pt_idx * nsample;\n    dist2 += bs_idx * m * nsample + pt_idx * nsample;\n\n    T new_x = new_xyz[0];\n    T new_y = new_xyz[1];\n    T new_z = new_xyz[2];\n\n    float best_dist[100];\n    int best_idx[100];\n    for (int i = 0; i < nsample; i++) {\n      best_dist[i] = 1e10;\n      best_idx[i] = 0;\n    }\n    
for (int i = 0; i < n; i++) {\n      T x = xyz[i * 3 + 0];\n      T y = xyz[i * 3 + 1];\n      T z = xyz[i * 3 + 2];\n      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +\n             (new_z - z) * (new_z - z);\n      if (d2 < best_dist[0]) {\n        best_dist[0] = d2;\n        best_idx[0] = i;\n        reheap(best_dist, best_idx, nsample);\n      }\n    }\n    heap_sort(best_dist, best_idx, nsample);\n    for (int i = 0; i < nsample; i++) {\n      idx[i] = best_idx[i];\n      dist2[i] = best_dist[i];\n    }\n  }\n}\n\n#endif  // KNN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH\n#define MASKED_CONV2D_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename scalar_t>\n__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,\n                                    const int height, const int width,\n                                    const int kernel_h, const int kernel_w,\n                                    const int pad_h, const int pad_w,\n                                    const int64_t *mask_h_idx,\n                                    const int64_t *mask_w_idx,\n                                    const int mask_cnt, scalar_t *data_col) {\n  // mask_cnt * channels\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    const int m_index = index % mask_cnt;\n    const int h_col = mask_h_idx[m_index];\n    const int w_col = mask_w_idx[m_index];\n    const int c_im = index / mask_cnt;\n    const int c_col = c_im * kernel_h * kernel_w;\n    const int h_offset = h_col - pad_h;\n    const int w_offset = w_col - pad_w;\n    scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;\n    for (int i = 0; i < kernel_h; ++i) {\n      int h_im = h_offset + i;\n      for (int j = 0; j < kernel_w; ++j) {\n        int w_im = w_offset + j;\n        if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {\n          *data_col_ptr =\n              (scalar_t)data_im[(c_im * height + h_im) * width + w_im];\n        } else {\n          *data_col_ptr = 0.0;\n        }\n        data_col_ptr += mask_cnt;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,\n                                    const int height, const int width,\n                                    const int channels,\n                                    const int64_t *mask_h_idx,\n                          
          const int64_t *mask_w_idx,\n                                    const int mask_cnt, scalar_t *data_im) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    const int m_index = index % mask_cnt;\n    const int h_im = mask_h_idx[m_index];\n    const int w_im = mask_w_idx[m_index];\n    const int c_im = index / mask_cnt;\n    // copy the column value back to its masked (h_im, w_im) image position\n    data_im[(c_im * height + h_im) * width + w_im] = data_col[index];\n  }\n}\n\n#endif  // MASKED_CONV2D_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH\n#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n#define MAXN 20\n__device__ const float PI = 3.1415926;\n\nstruct Point {\n  float x, y;\n  __device__ Point() {}\n  __device__ Point(float x, float y) : x(x), y(y) {}\n};\n\n__device__ inline void swap1(Point *a, Point *b) {\n  Point temp;\n  temp.x = a->x;\n  temp.y = a->y;\n\n  a->x = b->x;\n  a->y = b->y;\n\n  b->x = temp.x;\n  b->y = temp.y;\n}\n__device__ inline float cross(Point o, Point a, Point b) {\n  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);\n}\n\n__device__ inline float dis(Point a, Point b) {\n  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);\n}\n__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {\n  float convex_points[2][MAXN];\n  for (int j = 0; j < n_points; j++) {\n    convex_points[0][j] = ps[j].x;\n  }\n  for (int j = 0; j < n_points; j++) {\n    convex_points[1][j] = ps[j].y;\n  }\n\n  Point edges[MAXN];\n  float edges_angles[MAXN];\n  float unique_angles[MAXN];\n  int n_edges = n_points - 1;\n  int n_unique = 0;\n  int unique_flag = 0;\n\n  for (int i = 0; i < n_edges; i++) {\n    edges[i].x = ps[i + 1].x - ps[i].x;\n    edges[i].y = ps[i + 1].y - ps[i].y;\n  }\n  for (int i = 0; i < n_edges; i++) {\n    edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);\n    if (edges_angles[i] >= 0) {\n      edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);\n    } else {\n      edges_angles[i] =\n          edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);\n    }\n  }\n  unique_angles[0] = edges_angles[0];\n  n_unique += 1;\n  for (int i = 1; i < n_edges; i++) {\n    for (int j = 0; j < n_unique; j++) {\n      if (edges_angles[i] == unique_angles[j]) {\n        unique_flag += 1;\n      
}\n    }\n    if (unique_flag == 0) {\n      unique_angles[n_unique] = edges_angles[i];\n      n_unique += 1;\n      unique_flag = 0;\n    } else {\n      unique_flag = 0;\n    }\n  }\n\n  float minarea = 1e12;\n  for (int i = 0; i < n_unique; i++) {\n    float R[2][2];\n    float rot_points[2][MAXN];\n    R[0][0] = cos(unique_angles[i]);\n    R[0][1] = sin(unique_angles[i]);\n    R[1][0] = -sin(unique_angles[i]);\n    R[1][1] = cos(unique_angles[i]);\n    // R x Points\n    for (int m = 0; m < 2; m++) {\n      for (int n = 0; n < n_points; n++) {\n        float sum = 0.0;\n        for (int k = 0; k < 2; k++) {\n          sum = sum + R[m][k] * convex_points[k][n];\n        }\n        rot_points[m][n] = sum;\n      }\n    }\n\n    // xmin;\n    float xmin, ymin, xmax, ymax;\n    xmin = 1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {\n        continue;\n      } else {\n        if (rot_points[0][j] < xmin) {\n          xmin = rot_points[0][j];\n        }\n      }\n    }\n    // ymin\n    ymin = 1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {\n        continue;\n      } else {\n        if (rot_points[1][j] < ymin) {\n          ymin = rot_points[1][j];\n        }\n      }\n    }\n    // xmax\n    xmax = -1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {\n        continue;\n      } else {\n        if (rot_points[0][j] > xmax) {\n          xmax = rot_points[0][j];\n        }\n      }\n    }\n    // ymax\n    ymax = -1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {\n        continue;\n      } else {\n        if (rot_points[1][j] > ymax) {\n          ymax = rot_points[1][j];\n        }\n      }\n    }\n    float area = (xmax - xmin) * (ymax - ymin);\n    if (area < minarea) {\n      minarea = area;\n      minbox[0] = 
unique_angles[i];\n      minbox[1] = xmin;\n      minbox[2] = ymin;\n      minbox[3] = xmax;\n      minbox[4] = ymax;\n    }\n  }\n}\n\n// convex_find\n__device__ inline void Jarvis(Point *in_poly, int &n_poly) {\n  int n_input = n_poly;\n  Point input_poly[20];\n  for (int i = 0; i < n_input; i++) {\n    input_poly[i].x = in_poly[i].x;\n    input_poly[i].y = in_poly[i].y;\n  }\n  Point p_max, p_k;\n  int max_index, k_index;\n  int Stack[20], top1, top2;\n  // float sign;\n  double sign;\n  Point right_point[10], left_point[10];\n\n  for (int i = 0; i < n_poly; i++) {\n    if (in_poly[i].y < in_poly[0].y ||\n        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {\n      Point *j = &(in_poly[0]);\n      Point *k = &(in_poly[i]);\n      swap1(j, k);\n    }\n    if (i == 0) {\n      p_max = in_poly[0];\n      max_index = 0;\n    }\n    if (in_poly[i].y > p_max.y ||\n        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {\n      p_max = in_poly[i];\n      max_index = i;\n    }\n  }\n  if (max_index == 0) {\n    max_index = 1;\n    p_max = in_poly[max_index];\n  }\n\n  k_index = 0, Stack[0] = 0, top1 = 0;\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);\n      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >\n                                         dis(in_poly[Stack[top1]], p_k)))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top1++;\n    Stack[top1] = k_index;\n  }\n\n  for (int i = 0; i <= top1; i++) {\n    right_point[i] = in_poly[Stack[i]];\n  }\n\n  k_index = 0, Stack[0] = 0, top2 = 0;\n\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);\n      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >\n                        
                dis(in_poly[Stack[top2]], p_k))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top2++;\n    Stack[top2] = k_index;\n  }\n\n  for (int i = top2 - 1; i >= 0; i--) {\n    left_point[i] = in_poly[Stack[i]];\n  }\n\n  for (int i = 0; i < top1 + top2; i++) {\n    if (i <= top1) {\n      in_poly[i] = right_point[i];\n    } else {\n      in_poly[i] = left_point[top2 - (i - top1)];\n    }\n  }\n  n_poly = top1 + top2;\n}\n\ntemplate <typename T>\n__device__ inline void Findminbox(T const *const p, T *minpoints) {\n  Point ps1[MAXN];\n  Point convex[MAXN];\n  for (int i = 0; i < 9; i++) {\n    convex[i].x = p[i * 2];\n    convex[i].y = p[i * 2 + 1];\n  }\n  int n_convex = 9;\n  Jarvis(convex, n_convex);\n  int n1 = n_convex;\n  for (int i = 0; i < n1; i++) {\n    ps1[i].x = convex[i].x;\n    ps1[i].y = convex[i].y;\n  }\n  ps1[n1].x = convex[0].x;\n  ps1[n1].y = convex[0].y;\n\n  float minbbox[5] = {0};\n  minBoundingRect(ps1, n1 + 1, minbbox);\n  float angle = minbbox[0];\n  float xmin = minbbox[1];\n  float ymin = minbbox[2];\n  float xmax = minbbox[3];\n  float ymax = minbbox[4];\n  float R[2][2];\n\n  R[0][0] = cos(angle);\n  R[0][1] = sin(angle);\n  R[1][0] = -sin(angle);\n  R[1][1] = cos(angle);\n\n  minpoints[0] = xmax * R[0][0] + ymin * R[1][0];\n  minpoints[1] = xmax * R[0][1] + ymin * R[1][1];\n  minpoints[2] = xmin * R[0][0] + ymin * R[1][0];\n  minpoints[3] = xmin * R[0][1] + ymin * R[1][1];\n  minpoints[4] = xmin * R[0][0] + ymax * R[1][0];\n  minpoints[5] = xmin * R[0][1] + ymax * R[1][1];\n  minpoints[6] = xmax * R[0][0] + ymax * R[1][0];\n  minpoints[7] = xmax * R[0][1] + ymax * R[1][1];\n}\n\ntemplate <typename T>\n__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,\n                                              const T *ex_boxes, T *minbox) {\n  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {\n    const T *cur_box = ex_boxes + index * 18;\n    T *cur_min_box = minbox + index * 8;\n    Findminbox(cur_box, 
cur_min_box);\n  }\n}\n\n#endif  // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh",
    "content": "/*!\n ******************* BEGIN Caffe Copyright Notice and Disclaimer\n *****************\n *\n * COPYRIGHT\n *\n * All contributions by the University of California:\n * Copyright (c) 2014-2017 The Regents of the University of California (Regents)\n * All rights reserved.\n *\n * All other contributions:\n * Copyright (c) 2014-2017, the respective contributors\n * All rights reserved.\n *\n * Caffe uses a shared copyright model: each contributor holds copyright over\n * their contributions to Caffe. The project versioning records all such\n * contribution and copyright details. If a contributor wants to further mark\n * their specific copyright on a particular contribution, they should indicate\n * their copyright solely in the commit message of the change when it is\n * committed.\n *\n * LICENSE\n *\n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted provided that the following conditions are met:\n *\n * 1. Redistributions of source code must retain the above copyright notice,\n *this list of conditions and the following disclaimer.\n * 2. Redistributions in binary form must reproduce the above copyright notice,\n * this list of conditions and the following disclaimer in the documentation\n * and/or other materials provided with the distribution.\n *\n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n *\n * CONTRIBUTION AGREEMENT\n *\n * By contributing to the BVLC/caffe repository through pull-request, comment,\n * or otherwise, the contributor releases their content to the\n * license and copyright terms herein.\n *\n ***************** END Caffe Copyright Notice and Disclaimer\n *********************\n *\n * Copyright (c) 2018 Microsoft\n * Licensed under The MIT License [see LICENSE for details]\n * \\file modulated_deformable_im2col.cuh\n * \\brief Function definitions of converting an image to\n * column matrix based on kernel, padding, dilation, and offset.\n * These functions are mainly used in deformable convolution operators.\n * \\ref: https://arxiv.org/abs/1703.06211\n * \\author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng\n */\n\n// modified from\n// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu\n\n#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH\n#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\ntemplate <typename T>\n__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,\n                                  const int height, 
const int width, T h, T w) {\n  int h_low = floorf(h);\n  int w_low = floorf(w);\n  int h_high = h_low + 1;\n  int w_high = w_low + 1;\n\n  T lh = h - h_low;\n  T lw = w - w_low;\n  T hh = 1 - lh, hw = 1 - lw;\n\n  T v1 = 0;\n  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];\n  T v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n    v2 = input[h_low * data_width + w_high];\n  T v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n    v3 = input[h_high * data_width + w_low];\n  T v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n    v4 = input[h_high * data_width + w_high];\n\n  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename T>\n__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,\n                                      const int w, const int height,\n                                      const int width) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n  if (h == argmax_h_low && w == argmax_w_low)\n    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);\n  if (h == argmax_h_low && w == argmax_w_high)\n    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);\n  if (h == argmax_h_high && w == argmax_w_low)\n    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);\n  if (h == argmax_h_high && w == argmax_w_high)\n    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);\n  return weight;\n}\n\ntemplate <typename T>\n__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,\n                                        const int height, const int width,\n                                        const T *im_data, const int data_width,\n                                
        const int bp_dir) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n\n  if (bp_dir == 0) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += -1 * (argmax_w - argmax_w_low) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_w - argmax_w_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  } else if (bp_dir == 1) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  }\n\n  return weight;\n}\n\ntemplate <typename T>\n__global__ void modulated_deformable_im2col_gpu_kernel(\n    const int n, const T *data_im, const T *data_offset, const T *data_mask,\n    
const int height, const int width, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int num_channels, const int deformable_group, const int height_col,\n    const int width_col, T *data_col) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    // index: linear index of the output matrix\n    const int w_col = index % width_col;\n    const int h_col = (index / width_col) % height_col;\n    const int b_col = (index / width_col / height_col) % batch_size;\n    const int c_im = (index / width_col / height_col) / batch_size;\n    const int c_col = c_im * kernel_h * kernel_w;\n\n    // compute deformable group index\n    const int deformable_group_index = c_im / channel_per_deformable_group;\n\n    const int h_in = h_col * stride_h - pad_h;\n    const int w_in = w_col * stride_w - pad_w;\n\n    T *data_col_ptr =\n        data_col +\n        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;\n    const T *data_im_ptr =\n        data_im + (b_col * num_channels + c_im) * height * width;\n    const T *data_offset_ptr =\n        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    const T *data_mask_ptr =\n        data_mask + (b_col * deformable_group + deformable_group_index) *\n                        kernel_h * kernel_w * height_col * width_col;\n\n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        const int data_offset_h_ptr =\n            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;\n        const int data_offset_w_ptr =\n            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +\n            w_col;\n        const int data_mask_hw_ptr =\n            ((i * kernel_w + j) * height_col + h_col) * 
width_col + w_col;\n        const T offset_h = data_offset_ptr[data_offset_h_ptr];\n        const T offset_w = data_offset_ptr[data_offset_w_ptr];\n        const T mask = data_mask_ptr[data_mask_hw_ptr];\n        T val = static_cast<T>(0);\n        const T h_im = h_in + i * dilation_h + offset_h;\n        const T w_im = w_in + j * dilation_w + offset_w;\n        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)\n          val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im,\n                                     w_im);\n        *data_col_ptr = val * mask;\n        data_col_ptr += batch_size * height_col * width_col;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void modulated_deformable_col2im_gpu_kernel(\n    const int n, const T *data_col, const T *data_offset, const T *data_mask,\n    const int channels, const int height, const int width, const int kernel_h,\n    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,\n    const int stride_w, const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int deformable_group, const int height_col, const int width_col,\n    T *grad_im) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    const int j = (index / width_col / height_col / batch_size) % kernel_w;\n    const int i =\n        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;\n    const int c =\n        index / width_col / height_col / batch_size / kernel_w / kernel_h;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / channel_per_deformable_group;\n\n    int w_out = index % width_col;\n    int h_out = (index / width_col) % height_col;\n    int b = (index / width_col / height_col) % batch_size;\n    int w_in = w_out * stride_w - pad_w;\n    int h_in = h_out * stride_h - pad_h;\n\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n   
                       kernel_h * kernel_w * height_col * width_col;\n    const T *data_mask_ptr =\n        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *\n                        kernel_w * height_col * width_col;\n    const int data_offset_h_ptr =\n        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;\n    const int data_offset_w_ptr =\n        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;\n    const int data_mask_hw_ptr =\n        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;\n    const T offset_h = data_offset_ptr[data_offset_h_ptr];\n    const T offset_w = data_offset_ptr[data_offset_w_ptr];\n    const T mask = data_mask_ptr[data_mask_hw_ptr];\n    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;\n    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;\n\n    const T cur_top_grad = data_col[index] * mask;\n    const int cur_h = (int)cur_inv_h_data;\n    const int cur_w = (int)cur_inv_w_data;\n    for (int dy = -2; dy <= 2; dy++) {\n      for (int dx = -2; dx <= 2; dx++) {\n        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&\n            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&\n            abs(cur_inv_w_data - (cur_w + dx)) < 1) {\n          int cur_bottom_grad_pos =\n              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;\n          T weight =\n              dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,\n                                       cur_h + dy, cur_w + dx, height, width);\n          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void modulated_deformable_col2im_coord_gpu_kernel(\n    const int n, const T *data_col, const T *data_im, const T *data_offset,\n    const T *data_mask, const int channels, const int height, const int width,\n    const int kernel_h, const int kernel_w, 
const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int channel_per_deformable_group,\n    const int batch_size, const int offset_channels, const int deformable_group,\n    const int height_col, const int width_col, T *grad_offset, T *grad_mask) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    T val = 0, mval = 0;\n    int w = index % width_col;\n    int h = (index / width_col) % height_col;\n    int c = (index / width_col / height_col) % offset_channels;\n    int b = (index / width_col / height_col) / offset_channels;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / (2 * kernel_h * kernel_w);\n    const int col_step = kernel_h * kernel_w;\n    int cnt = 0;\n    const T *data_col_ptr = data_col + deformable_group_index *\n                                           channel_per_deformable_group *\n                                           batch_size * width_col * height_col;\n    const T *data_im_ptr =\n        data_im + (b * deformable_group + deformable_group_index) *\n                      channel_per_deformable_group / kernel_h / kernel_w *\n                      height * width;\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const T *data_mask_ptr =\n        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *\n                        kernel_w * height_col * width_col;\n\n    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;\n\n    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;\n         col_c += col_step) {\n      const int col_pos =\n          (((col_c * batch_size + b) * height_col) + h) * width_col + w;\n      const int bp_dir = offset_c % 2;\n\n      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;\n      int i =\n  
        (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;\n      int w_out = col_pos % width_col;\n      int h_out = (col_pos / width_col) % height_col;\n      int w_in = w_out * stride_w - pad_w;\n      int h_in = h_out * stride_h - pad_h;\n      const int data_offset_h_ptr =\n          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);\n      const int data_offset_w_ptr =\n          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +\n           w_out);\n      const int data_mask_hw_ptr =\n          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);\n      const T offset_h = data_offset_ptr[data_offset_h_ptr];\n      const T offset_w = data_offset_ptr[data_offset_w_ptr];\n      const T mask = data_mask_ptr[data_mask_hw_ptr];\n      T inv_h = h_in + i * dilation_h + offset_h;\n      T inv_w = w_in + j * dilation_w + offset_w;\n      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)\n        inv_h = inv_w = -2;\n      else\n        mval += data_col_ptr[col_pos] *\n                dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,\n                                     height, width, inv_h, inv_w);\n      const T weight = dmcn_get_coordinate_weight(\n          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,\n          width, bp_dir);\n      val += weight * data_col_ptr[col_pos] * mask;\n      cnt += 1;\n    }\n    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);\n    grad_offset[index] = val;\n    if (offset_c % 2 == 0)\n      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +\n      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *\n      // height_col + h) * width_col + w], mask_req, mval);\n      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *\n                      kernel_w +\n                  offset_c / 2) *\n                     height_col +\n                 h) *\n                    width_col 
+\n                w] = mval;\n  }\n}\n\n#endif  // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from\n*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n#ifndef DEFORM_ATTN_CUDA_KERNEL\n#define DEFORM_ATTN_CUDA_KERNEL\n\n#include \"common_cuda_helper.hpp\"\n#include \"pytorch_cuda_helper.hpp\"\n\ntemplate <typename scalar_t>\n__device__ scalar_t ms_deform_attn_im2col_bilinear(\n    const scalar_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const scalar_t &h,\n    const scalar_t &w, const int &m, const int &c) {\n  const int h_low = floorf(h);\n  const int w_low = floorf(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int 
ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n  }\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename scalar_t>\n__device__ void ms_deform_attn_col2im_bilinear(\n    const scalar_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const scalar_t &h,\n    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,\n    const scalar_t &attn_weight, scalar_t *&grad_value,\n    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {\n  const int h_low = floorf(h);\n  const int w_low = floorf(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const scalar_t top_grad_value = top_grad * attn_weight;\n  scalar_t grad_h_weight = 0, grad_w_weight = 0;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value + ptr1, w1 * top_grad_value);\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = 
h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value + ptr2, w2 * top_grad_value);\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value + ptr3, w3 * top_grad_value);\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value + ptr4, w4 * top_grad_value);\n  }\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  *grad_attn_weight = top_grad * val;\n  *grad_sampling_loc = width * grad_w_weight * top_grad_value;\n  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;\n}\n\ntemplate <typename scalar_t>\n__device__ void ms_deform_attn_col2im_bilinear_gm(\n    const scalar_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const scalar_t &h,\n    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,\n    const scalar_t &attn_weight, scalar_t *&grad_value,\n    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {\n  const int h_low = floorf(h);\n  const int w_low = floorf(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + 
w_stride;\n  const int base_ptr = m * channels + c;\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const scalar_t top_grad_value = top_grad * attn_weight;\n  scalar_t grad_h_weight = 0, grad_w_weight = 0;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value + ptr1, w1 * top_grad_value);\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value + ptr2, w2 * top_grad_value);\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value + ptr3, w3 * top_grad_value);\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value + ptr4, w4 * top_grad_value);\n  }\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  atomicAdd(grad_attn_weight, top_grad * val);\n  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);\n  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_im2col_gpu_kernel(\n    const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes,\n    const int64_t *data_level_start_index, const scalar_t *data_sampling_loc,\n    const scalar_t *data_attn_weight, const int batch_size,\n    const int spatial_size, const 
int num_heads, const int channels,\n    const int num_levels, const int num_query, const int num_point,\n    scalar_t *data_col) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    scalar_t *data_col_ptr = data_col + index;\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n    scalar_t col = 0;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const scalar_t *data_value_ptr =\n          data_value +\n          (data_value_ptr_init_offset + level_start_id * qid_stride);\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h,\n                                                spatial_w, num_heads, channels,\n                                                h_im, w_im, m_col, c_col) *\n                 weight;\n        }\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n      }\n    }\n    *data_col_ptr = 
col;\n  }\n}\n\ntemplate <typename scalar_t, unsigned int blockSize>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];\n  __shared__ scalar_t cache_grad_attn_weight[blockSize];\n  unsigned int tid = threadIdx.x;\n  const int qid_stride = num_heads * channels;\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n 
         data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n        if (tid == 0) {\n          scalar_t _grad_w = cache_grad_sampling_loc[0],\n                   _grad_h = cache_grad_sampling_loc[1],\n                   _grad_a = cache_grad_attn_weight[0];\n          int sid = 2;\n          for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[_tid];\n            sid += 2;\n          }\n\n          *grad_sampling_loc_out = _grad_w;\n          *(grad_sampling_loc_out + 1) = _grad_h;\n          *grad_attn_weight_out = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        
grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, unsigned int blockSize>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];\n  __shared__ scalar_t cache_grad_attn_weight[blockSize];\n  unsigned int tid = threadIdx.x;\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = 
data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          *grad_sampling_loc_out = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight_out = cache_grad_attn_weight[0];\n   
     }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  extern __shared__ int _s[];\n  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);\n  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n  unsigned int tid = threadIdx.x;\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = 
data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n        if (tid == 0) {\n          scalar_t _grad_w = cache_grad_sampling_loc[0],\n                   _grad_h = cache_grad_sampling_loc[1],\n                   _grad_a = cache_grad_attn_weight[0];\n          int sid = 2;\n          for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[_tid];\n            sid += 2;\n          }\n\n          *grad_sampling_loc_out = _grad_w;\n          
*(grad_sampling_loc_out + 1) = _grad_h;\n          *grad_attn_weight_out = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  extern __shared__ int _s[];\n  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);\n  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n  unsigned int tid = threadIdx.x;\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int 
l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;\n             s >>= 1, spre >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                
cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre) {\n              cache_grad_attn_weight[tid] +=\n                  cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] +=\n                  cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] +=\n                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          *grad_sampling_loc_out = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight_out = cache_grad_attn_weight[0];\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  extern __shared__ int _s[];\n  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);\n  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n  unsigned int tid = threadIdx.x;\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const 
int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n         
     cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;\n             s >>= 1, spre >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre) {\n              cache_grad_attn_weight[tid] +=\n                  cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] +=\n                  cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] +=\n                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);\n          atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);\n          atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_gm(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int 
num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  CUDA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < 
spatial_w) {\n          ms_deform_attn_col2im_bilinear_gm(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              grad_sampling_loc_out, grad_attn_weight_out);\n        }\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n#endif  // DEFORM_ATTN_CUDA_KERNEL\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef NMS_CUDA_KERNEL_CUH\n#define NMS_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\nint const threadsPerBlock = sizeof(unsigned long long int) * 8;\n\n__device__ inline bool devIoU(float const *const a, float const *const b,\n                              const int offset, const float threshold) {\n  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);\n  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);\n  float width = fmaxf(right - left + offset, 0.f),\n        height = fmaxf(bottom - top + offset, 0.f);\n  float interS = width * height;\n  float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);\n  float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);\n  return interS > threshold * (Sa + Sb - interS);\n}\n\n__global__ static void nms_cuda(const int n_boxes, const float iou_threshold,\n                                const int offset, const float *dev_boxes,\n                                unsigned long long *dev_mask) {\n  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;\n  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {\n    const int tid = threadIdx.x;\n\n    if (row_start > col_start) return;\n\n    const int row_size =\n        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    __shared__ float block_boxes[threadsPerBlock * 4];\n    if (tid < col_size) {\n      block_boxes[tid * 4 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];\n      block_boxes[tid * 4 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];\n      
block_boxes[tid * 4 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];\n      block_boxes[tid * 4 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];\n    }\n    __syncthreads();\n\n    if (tid < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + tid;\n      const float *cur_box = dev_boxes + cur_box_idx * 4;\n      int i = 0;\n      unsigned long long int t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = tid + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {\n          t |= 1ULL << i;\n        }\n      }\n      dev_mask[cur_box_idx * gridDim.y + col_start] = t;\n    }\n  }\n}\n\n__global__ static void gather_keep_from_mask(bool *keep,\n                                             const unsigned long long *dev_mask,\n                                             const int n_boxes) {\n  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;\n  const int tid = threadIdx.x;\n\n  // mark the bboxes which have been removed.\n  extern __shared__ unsigned long long removed[];\n\n  // initialize removed.\n  for (int i = tid; i < col_blocks; i += blockDim.x) {\n    removed[i] = 0;\n  }\n  __syncthreads();\n\n  for (int nblock = 0; nblock < col_blocks; ++nblock) {\n    auto removed_val = removed[nblock];\n    __syncthreads();\n    const int i_offset = nblock * threadsPerBlock;\n#pragma unroll\n    for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {\n      const int i = i_offset + inblock;\n      if (i >= n_boxes) break;\n      // select a candidate, check if it should be kept.\n      if (!(removed_val & (1ULL << inblock))) {\n        if (tid == 0) {\n          // mark the output.\n          keep[i] = true;\n        }\n        auto p = dev_mask + i * col_blocks;\n        // remove all bboxes which overlap the candidate.\n        for (int j = tid; j < col_blocks; j += 
blockDim.x) {\n          if (j >= nblock) removed[j] |= p[j];\n        }\n        __syncthreads();\n        removed_val = removed[nblock];\n      }\n    }\n  }\n}\n\n#endif  // NMS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#ifndef NMS_QUADRI_CUDA_CUH\n#define NMS_QUADRI_CUDA_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n#include \"box_iou_rotated_utils.hpp\"\n\n__host__ __device__ inline int divideUP(const int x, const int y) {\n  return (((x) + (y)-1) / (y));\n}\n\nnamespace {\nint const threadsPerBlock = sizeof(unsigned long long) * 8;\n}\n\ntemplate <typename T>\n__global__ void nms_quadri_cuda_kernel(const int n_boxes,\n                                       const float iou_threshold,\n                                       const T* dev_boxes,\n                                       unsigned long long* dev_mask,\n                                       const int multi_label) {\n  if (multi_label == 1) {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 8 values\n    // (x1, y1, ..., x4, y4) here.\n    __shared__ T block_boxes[threadsPerBlock * 8];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 8 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];\n      block_boxes[threadIdx.x * 8 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];\n      block_boxes[threadIdx.x * 8 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];\n      block_boxes[threadIdx.x * 8 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];\n      block_boxes[threadIdx.x * 8 + 4] =\n          
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];\n      block_boxes[threadIdx.x * 8 + 5] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];\n      block_boxes[threadIdx.x * 8 + 6] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];\n      block_boxes[threadIdx.x * 8 + 7] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 9;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_quadri function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_quadri<T>(cur_box, block_boxes + i * 8, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  } else {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 8 values\n    // (x1, y1, , ..., x4, y4) here.\n    __shared__ T block_boxes[threadsPerBlock * 8];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 8 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0];\n      
block_boxes[threadIdx.x * 8 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1];\n      block_boxes[threadIdx.x * 8 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2];\n      block_boxes[threadIdx.x * 8 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3];\n      block_boxes[threadIdx.x * 8 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4];\n      block_boxes[threadIdx.x * 8 + 5] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5];\n      block_boxes[threadIdx.x * 8 + 6] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6];\n      block_boxes[threadIdx.x * 8 + 7] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 8;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_quadri function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_quadri<T>(cur_box, block_boxes + i * 8, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu\n#ifndef NMS_ROTATED_CUDA_CUH\n#define NMS_ROTATED_CUDA_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n#include \"box_iou_rotated_utils.hpp\"\n\n__host__ __device__ inline int divideUP(const int x, const int y) {\n  return (((x) + (y)-1) / (y));\n}\n\nnamespace {\nint const threadsPerBlock = sizeof(unsigned long long) * 8;\n}\n\ntemplate <typename T>\n__global__ void nms_rotated_cuda_kernel(const int n_boxes,\n                                        const float iou_threshold,\n                                        const T* dev_boxes,\n                                        unsigned long long* dev_mask,\n                                        const int multi_label) {\n  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel\n\n  if (multi_label == 1) {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 5 values\n    // (x_center, y_center, width, height, angle_degrees) here.\n    __shared__ T block_boxes[threadsPerBlock * 5];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 5 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];\n      block_boxes[threadIdx.x * 5 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];\n      block_boxes[threadIdx.x * 5 + 2] =\n          
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];\n      block_boxes[threadIdx.x * 5 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];\n      block_boxes[threadIdx.x * 5 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 6;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_rotated function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  } else {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 5 values\n    // (x_center, y_center, width, height, angle_degrees) here.\n    __shared__ T block_boxes[threadsPerBlock * 5];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 5 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];\n      block_boxes[threadIdx.x * 5 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 
5 + 1];\n      block_boxes[threadIdx.x * 5 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];\n      block_boxes[threadIdx.x * 5 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];\n      block_boxes[threadIdx.x * 5 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 5;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_rotated function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh",
    "content": "/*\n * Copyright (c) 2019, SenseTime.\n */\n\n#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_\n#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_\n\n#ifndef __CUDACC__\n#error cudawarpfunction.cuh should only be included by .cu files\n#endif\n#include <cuda.h>\n\n#include <parrots/foundation/common.hpp>\n\n#ifdef PARROTS_USE_HALF\n#include <cuda_fp16.h>\n#endif\n#ifdef __CUDA_ARCH__\n#define CUDA_INTRINSIC_FUNC(Expr) Expr\n#else\n#define CUDA_INTRINSIC_FUNC(Expr)\n#endif\n\n#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300\n\n#ifdef PARROTS_USE_HALF\n\n#if CUDA_VERSION < 9000\n\n__device__ inline float16 __shfl(float16 var, int srcLane, int width) {\n  CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width););\n}\n\n__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) {\n  CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width););\n}\n\n__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) {\n  CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width););\n}\n\n__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) {\n  CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width););\n}\n\n#else  // CUDA_VERSION >= 9000\n\n__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane,\n                                      int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width);\n                      return r;);\n}\n\n__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var,\n                                         unsigned delta, int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(\n      float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;);\n}\n\n__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var,\n                                           unsigned delta,\n                                           int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(\n      float16 
r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;);\n}\n\n__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var,\n                                          int laneMask, int width) {\n  CUDA_INTRINSIC_FUNC(float16 r;\n                      r.y = __shfl_xor_sync(mask, var.y, laneMask, width);\n                      return r;);\n}\n\n#endif  // CUDA_VERSION < 9000\n\n#endif  // PARROTS_USE_HALF\n\n// warp shuffle interface with a dummy mask\n#if CUDA_VERSION < 9000\n\ntemplate <typename T>\n__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane,\n                                int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width););\n}\n\ntemplate <typename T>\n__device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta,\n                                   int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width););\n}\n\ntemplate <typename T>\n__device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta,\n                                     int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width););\n}\n\ntemplate <typename T>\n__device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask,\n                                    int width = warpSize) {\n  CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width););\n}\n\n#endif  // CUDA_VERSION < 9000\n\n#endif  // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300\n\n#endif  // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH\n#define POINT_IN_BOXES_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,\n                                             T &local_x, T &local_y) {\n  T cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ntemplate <typename T>\n__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,\n                                        T &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,\n  // cz in the bottom center\n  T x = pt[0], y = pt[1], z = pt[2];\n  T cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];\n  cz += z_size /\n        2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > z_size / 2.0) return 0;\n  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);\n  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &\n                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);\n  return in_flag;\n}\n\ntemplate <typename T>\n__global__ void points_in_boxes_part_forward_cuda_kernel(\n    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,\n    int *box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:\n  // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points:\n  // (B, npoints), default -1\n\n  int bs_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (bs_idx >= batch_size) return;\n\n    boxes += 
bs_idx * boxes_num * 7;\n    pts += bs_idx * pts_num * 3 + pt_idx * 3;\n    box_idx_of_points += bs_idx * pts_num + pt_idx;\n\n    T local_x = 0, local_y = 0;\n    int cur_in_flag = 0;\n    for (int k = 0; k < boxes_num; k++) {\n      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);\n      if (cur_in_flag) {\n        box_idx_of_points[0] = k;\n        break;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void points_in_boxes_all_forward_cuda_kernel(\n    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,\n    int *box_idx_of_points) {\n  // param boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center\n  // param pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate\n  // param box_idx_of_points: (B, npoints, N), entry k is set to 1 for every\n  // box k that contains the point\n\n  int bs_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (bs_idx >= batch_size) return;\n\n    boxes += bs_idx * boxes_num * 7;\n    pts += bs_idx * pts_num * 3 + pt_idx * 3;\n    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;\n\n    T local_x = 0, local_y = 0;\n    for (int k = 0; k < boxes_num; k++) {\n      const int cur_in_flag =\n          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);\n      if (cur_in_flag) {\n        box_idx_of_points[k] = 1;\n      }\n    }\n  }\n}\n\n#endif  // POINT_IN_BOXES_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH\n#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\nstruct point {\n  float x, y;\n};\n\ntemplate <typename scalar_t>\n__global__ void points_in_polygons_forward_cuda_kernel(\n    const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,\n    const int rows, const int cols, scalar_t *inside_flag) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int row = index / cols;\n    int col = index % cols;\n\n    const scalar_t *offset_vertex1 = vertex1 + row * 2;\n    const scalar_t *offset_vertex2 = vertex2 + col * 8;\n\n    point point_[1];\n    point polygon[4];\n\n    point_[0].x = offset_vertex1[0];\n    point_[0].y = offset_vertex1[1];\n\n    polygon[0].x = offset_vertex2[0];\n    polygon[0].y = offset_vertex2[1];\n    polygon[1].x = offset_vertex2[2];\n    polygon[1].y = offset_vertex2[3];\n    polygon[2].x = offset_vertex2[4];\n    polygon[2].y = offset_vertex2[5];\n    polygon[3].x = offset_vertex2[6];\n    polygon[3].y = offset_vertex2[7];\n\n    int nCross = 0;\n    int i, j;\n    float sx, sy, tx, ty, px, py, x;\n    for (i = 0, j = 3; i < 4; j = i, i++) {\n      sx = polygon[i].x;\n      sy = polygon[i].y;\n      tx = polygon[j].x;\n      ty = polygon[j].y;\n\n      px = point_[0].x;\n      py = point_[0].y;\n\n      if (py < min(sy, ty)) continue;\n      if (py > max(sy, ty)) continue;\n\n      if ((sx == px && sy == py) || (tx == px && ty == py)) {\n        break;\n      } else {\n        if ((sy < py && ty >= py) || (sy >= py && ty < py)) {\n          x = sx + (py - sy) * (tx - sx) / (ty - sy);\n          if (x == px) {\n            break;\n          }\n          if (x > px) {\n            nCross++;\n          }\n        }\n      }\n    }\n    if (nCross % 2 == 1) {\n      inside_flag[index] = 1.0;\n    } else {\n      
inside_flag[index] = 0.0;\n    }\n    return;\n  }\n}\n\n#endif  // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu\n// Distributed under terms of the MIT license.\n#ifndef PRROI_POOL_CUDA_KERNEL_CUH\n#define PRROI_POOL_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,\n                                                        const int h,\n                                                        const int w,\n                                                        const int height,\n                                                        const int width) {\n  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);\n  T retVal = overflow ? 0.0f : data[h * width + w];\n  return retVal;\n}\n\ntemplate <typename T>\n__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {\n  return (1.0f - abs(dh)) * (1.0f - abs(dw));\n}\n\ntemplate <typename T>\n__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,\n                                                                   T c1, T c2) {\n  return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;\n}\n\ntemplate <typename T>\n__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,\n                                              const T w, const int height,\n                                              const int width) {\n  T retVal = 0.0f;\n  int h1 = floorf(h);\n  int w1 = floorf(w);\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  h1 = floorf(h) + 1;\n  w1 = floorf(w);\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  h1 = floorf(h);\n  w1 = floorf(w) + 1;\n  retVal += 
PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  h1 = floorf(h) + 1;\n  w1 = floorf(w) + 1;\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  return retVal;\n}\n\ntemplate <typename T>\n__device__ static T PrRoIPoolingMatCalculation(const T *this_data,\n                                               const int s_h, const int s_w,\n                                               const int e_h, const int e_w,\n                                               const T y0, const T x0,\n                                               const T y1, const T x1,\n                                               const int h0, const int w0) {\n  T alpha, beta, lim_alpha, lim_beta, tmp;\n  T sum_out = 0;\n\n  alpha = x0 - T(s_w);\n  beta = y0 - T(s_h);\n  lim_alpha = x1 - T(s_w);\n  lim_beta = y1 - T(s_h);\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;\n\n  alpha = x0 - T(s_w);\n  beta = T(e_h) - y1;\n  lim_alpha = x1 - T(s_w);\n  lim_beta = T(e_h) - y0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * 
lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;\n\n  return sum_out;\n}\n\ntemplate <typename T>\n__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,\n                                                  const int h, const int w,\n                                                  const int height,\n                                                  const int width,\n                                                  const T coeff) {\n  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);\n  if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);\n}\n\ntemplate <typename T>\n__device__ static void PrRoIPoolingMatDistributeDiff(\n    T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,\n    const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,\n    const int w0) {\n  T alpha, beta, lim_alpha, lim_beta, tmp;\n\n  alpha = x0 - T(s_w);\n  beta = y0 - T(s_h);\n  lim_alpha = x1 - T(s_w);\n  lim_beta = y1 - T(s_h);\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);\n\n  alpha = x0 - T(s_w);\n  beta = T(e_h) - y1;\n  lim_alpha = x1 - T(s_w);\n  lim_beta = T(e_h) - y0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);\n\n  alpha = T(e_w) - x1;\n  
lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);\n}\n\ntemplate <typename T>\n__global__ void prroi_pool_forward_cuda_kernel(\n    const int nthreads, const T *input, const T *rois, T *output,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T *offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    T roi_x1 = offset_rois[1] * spatial_scale;\n    T roi_y1 = offset_rois[2] * spatial_scale;\n    T roi_x2 = offset_rois[3] * spatial_scale;\n    T roi_y2 = offset_rois[4] * spatial_scale;\n\n    T roi_width = max(roi_x2 - roi_x1, ((T)0.0));\n    T roi_height = max(roi_y2 - roi_y1, ((T)0.0));\n    T bin_size_h = roi_height / static_cast<T>(pooled_height);\n    T bin_size_w = roi_width / static_cast<T>(pooled_width);\n\n    const T *this_data =\n        input + (roi_batch_ind * channels + c) * height * width;\n    T *this_out = output + index;\n\n    T bin_x1 = roi_x1 + bin_size_w * pw;\n    T bin_y1 = roi_y1 + bin_size_h * ph;\n    T bin_x2 = bin_x1 + bin_size_w;\n    T bin_y2 = bin_y1 + bin_size_h;\n\n    T bin_size = max(T(0.0), bin_size_w * bin_size_h);\n    if (bin_size == 0) {\n      *this_out = 0;\n      continue;\n    }\n\n    T sum_out = 0;\n\n    int start_x, start_y, end_x, end_y;\n\n    start_x = floorf(bin_x1);\n    end_x = ceilf(bin_x2);\n    start_y = floorf(bin_y1);\n    end_y = ceilf(bin_y2);\n\n    for (int 
bin_x = start_x; bin_x < end_x; ++bin_x)\n      for (int bin_y = start_y; bin_y < end_y; ++bin_y)\n        sum_out += PrRoIPoolingMatCalculation(\n            this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,\n            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),\n            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,\n            width);\n    *this_out = sum_out / bin_size;\n  }\n}\n\ntemplate <typename T>\n__global__ void prroi_pool_backward_cuda_kernel(\n    const int nthreads, const T *grad_output, const T *rois, T *grad_input,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    auto rois_cur = rois + n * 5;\n\n    int roi_batch_ind = rois_cur[0];\n    T roi_x1 = rois_cur[1] * spatial_scale;\n    T roi_y1 = rois_cur[2] * spatial_scale;\n    T roi_x2 = rois_cur[3] * spatial_scale;\n    T roi_y2 = rois_cur[4] * spatial_scale;\n\n    T roi_width = max(roi_x2 - roi_x1, (T)0);\n    T roi_height = max(roi_y2 - roi_y1, (T)0);\n    T bin_size_h = roi_height / static_cast<T>(pooled_height);\n    T bin_size_w = roi_width / static_cast<T>(pooled_width);\n\n    const T *this_out_grad = grad_output + index;\n    T *this_data_grad =\n        grad_input + (roi_batch_ind * channels + c) * height * width;\n\n    T bin_x1 = roi_x1 + bin_size_w * pw;\n    T bin_y1 = roi_y1 + bin_size_h * ph;\n    T bin_x2 = bin_x1 + bin_size_w;\n    T bin_y2 = bin_y1 + bin_size_h;\n\n    T bin_size = max(T(0.0), bin_size_w * bin_size_h);\n\n    T sum_out = bin_size == T(0) ? 
T(0) : *this_out_grad / bin_size;\n\n    int start_x, start_y, end_x, end_y;\n\n    start_x = floorf(bin_x1);\n    end_x = ceilf(bin_x2);\n    start_y = floorf(bin_y1);\n    end_y = ceilf(bin_y2);\n\n    for (int bin_x = start_x; bin_x < end_x; ++bin_x)\n      for (int bin_y = start_y; bin_y < end_y; ++bin_y)\n        PrRoIPoolingMatDistributeDiff(\n            this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,\n            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),\n            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,\n            width);\n  }\n}\n\ntemplate <typename T>\n__global__ void prroi_pool_coor_backward_cuda_kernel(\n    const int nthreads, const T *output, const T *grad_output, const T *input,\n    const T *rois, T *grad_rois, const int pooled_height,\n    const int pooled_width, const T spatial_scale, const int channels,\n    const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    auto rois_cur = rois + n * 5;\n\n    int roi_batch_ind = rois_cur[0];\n    T roi_x1 = rois_cur[1] * spatial_scale;\n    T roi_y1 = rois_cur[2] * spatial_scale;\n    T roi_x2 = rois_cur[3] * spatial_scale;\n    T roi_y2 = rois_cur[4] * spatial_scale;\n\n    T roi_width = max(roi_x2 - roi_x1, (T)0);\n    T roi_height = max(roi_y2 - roi_y1, (T)0);\n    T bin_size_h = roi_height / static_cast<T>(pooled_height);\n    T bin_size_w = roi_width / static_cast<T>(pooled_width);\n\n    const T output_grad_val = grad_output[index];\n    const T *this_input_data =\n        input + (roi_batch_ind * channels + c) * height * width;\n    const T output_val = output[index];\n    T *this_rois_grad = grad_rois + n * 5;\n\n    T bin_x1 = roi_x1 + bin_size_w * 
pw;\n    T bin_y1 = roi_y1 + bin_size_h * ph;\n    T bin_x2 = bin_x1 + bin_size_w;\n    T bin_y2 = bin_y1 + bin_size_h;\n\n    T bin_size = max(T(0.0), bin_size_w * bin_size_h);\n\n    T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;\n\n    // WARNING: to be discussed\n    if (sum_out == 0) continue;\n\n    int start_x, start_y, end_x, end_y;\n\n    start_x = floorf(bin_x1);\n    end_x = ceilf(bin_x2);\n    start_y = floorf(bin_y1);\n    end_y = ceilf(bin_y2);\n\n    T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;\n    for (int bin_y = start_y; bin_y < end_y; ++bin_y) {\n      grad_x1_y += PrRoIPoolingSingleCoorIntegral(\n          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,\n                                    height, width));\n\n      grad_x2_y += PrRoIPoolingSingleCoorIntegral(\n          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,\n                                    height, width));\n    }\n\n    for (int bin_x = start_x; bin_x < end_x; ++bin_x) {\n      grad_x_y1 += PrRoIPoolingSingleCoorIntegral(\n          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,\n          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),\n                                    height, width));\n\n      grad_x_y2 += PrRoIPoolingSingleCoorIntegral(\n          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,\n          
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),\n                                    height, width));\n    }\n\n    T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;\n    T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;\n    T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;\n    T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;\n\n    partial_x1 = partial_x1 / bin_size * spatial_scale;\n    partial_x2 = partial_x2 / bin_size * spatial_scale;\n    partial_y1 = partial_y1 / bin_size * spatial_scale;\n    partial_y2 = partial_y2 / bin_size * spatial_scale;\n\n    // (index, x1, y1, x2, y2)\n    this_rois_grad[0] = 0;\n    atomicAdd(this_rois_grad + 1,\n              (partial_x1 * (1.0f - T(pw) / pooled_width) +\n               partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *\n                  output_grad_val);\n    atomicAdd(this_rois_grad + 2,\n              (partial_y1 * (1.0f - T(ph) / pooled_height) +\n               partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *\n                  output_grad_val);\n    atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +\n                                   partial_x1 * T(pw) / pooled_width) *\n                                      output_grad_val);\n    atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +\n                                   partial_y1 * T(ph) / pooled_height) *\n                                      output_grad_val);\n  }\n}\n\n#endif  // PRROI_POOL_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef PSAMASK_CUDA_KERNEL_CUH\n#define PSAMASK_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\n// CUDA: grid stride looping\n#ifndef CUDA_KERNEL_LOOP\n#define CUDA_KERNEL_LOOP(i, n)                                 \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \\\n       i += blockDim.x * gridDim.x)\n#endif\n\ntemplate <typename T>\n__global__ void psamask_collect_forward_cuda(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* mask_data, T* buffer_data) {\n  CUDA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        buffer_data[(n * h_feature * w_feature +\n                     (hidx + h - half_h_mask) * w_feature +\n                     (widx + w - half_w_mask)) *\n                        h_feature * w_feature +\n                    h * w_feature + w] = mask_data\n            [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *\n                 w_feature +\n             w];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void psamask_distribute_forward_cuda(\n    
const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* mask_data, T* buffer_data) {\n  CUDA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        buffer_data[(n * h_feature * w_feature + h * w_feature + w) *\n                        h_feature * w_feature +\n                    (hidx + h - half_h_mask) * w_feature +\n                    (widx + w - half_w_mask)] = mask_data\n            [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *\n                 w_feature +\n             w];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void psamask_collect_backward_cuda(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* buffer_diff, T* mask_diff) {\n  CUDA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = 
max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +\n                   h) *\n                      w_feature +\n                  w] = buffer_diff[(n * h_feature * w_feature +\n                                    (hidx + h - half_h_mask) * w_feature +\n                                    (widx + w - half_w_mask)) *\n                                       h_feature * w_feature +\n                                   h * w_feature + w];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void psamask_distribute_backward_cuda(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* buffer_diff, T* mask_diff) {\n  CUDA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +\n                   h) *\n                      w_feature 
+\n                  w] =\n            buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *\n                            h_feature * w_feature +\n                        (hidx + h - half_h_mask) * w_feature +\n                        (widx + w - half_w_mask)];\n      }\n    }\n  }\n}\n\n#endif  // PSAMASK_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh",
    "content": "// Modified from\n// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu\n#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH\n#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n\n/*** Forward ***/\ntemplate <typename scalar_t>\n__global__ void riroi_align_rotated_forward_cuda_kernel(\n    const int nthreads, const scalar_t *bottom_data,\n    const scalar_t *bottom_rois, const scalar_t spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int num_orientations, scalar_t *top_data) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int o = (index / pooled_width / pooled_height) % num_orientations;\n    int c =\n        (index / pooled_width / pooled_height / num_orientations) % channels;\n    int n = index / pooled_width / pooled_height / num_orientations / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    // Force malformed ROIs to be 1x1\n    roi_width = max(roi_width, (scalar_t)1.);\n    roi_height = max(roi_height, (scalar_t)1.);\n    scalar_t bin_size_h = 
static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    // find aligned index\n    scalar_t ind_float = theta * num_orientations / (2 * M_PI);\n    int ind = floorf(ind_float);\n    scalar_t l_var = ind_float - (scalar_t)ind;\n    scalar_t r_var = 1.0 - l_var;\n    // correct start channel\n    ind = (ind + num_orientations) % num_orientations;\n    // rotated channel\n    int ind_rot = (o - ind + num_orientations) % num_orientations;\n    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;\n    const scalar_t *offset_bottom_data =\n        bottom_data + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot) *\n                          height * width;\n\n    const scalar_t *offset_bottom_data_plus =\n        bottom_data + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot_plus) *\n                          height * width;\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (num_samples > 0)\n                             ? num_samples\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (num_samples > 0) ? 
num_samples : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosscalar_theta = cos(theta);\n    scalar_t sinscalar_theta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4\n\n    scalar_t output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta (counterclockwise) around the center and translate\n        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;\n        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;\n\n        scalar_t val = bilinear_interpolate<scalar_t>(\n            offset_bottom_data, height, width, y, x, index);\n        scalar_t val_plus = bilinear_interpolate<scalar_t>(\n            offset_bottom_data_plus, height, width, y, x, index);\n        output_val += r_var * val + l_var * val_plus;\n      }\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\n/*** Backward ***/\ntemplate <typename scalar_t>\n__global__ void riroi_align_rotated_backward_cuda_kernel(\n    const int nthreads, const scalar_t *top_diff, const scalar_t 
*bottom_rois,\n    const scalar_t spatial_scale, const int num_samples, const bool clockwise,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    scalar_t *bottom_diff) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, o, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int o = (index / pooled_width / pooled_height) % num_orientations;\n    int c =\n        (index / pooled_width / pooled_height / num_orientations) % channels;\n    int n = index / pooled_width / pooled_height / num_orientations / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not round\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    // Force malformed ROIs to be 1x1\n    roi_width = max(roi_width, (scalar_t)1.);\n    roi_height = max(roi_height, (scalar_t)1.);\n\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    // find aligned index\n    scalar_t ind_float = theta * num_orientations / (2 * M_PI);\n    int ind = floorf(ind_float);\n    scalar_t l_var = ind_float - (scalar_t)ind;\n    scalar_t r_var = 1.0 - l_var;\n    // correct start channel\n    ind = (ind + num_orientations) % num_orientations;\n    // rotated channel\n    int ind_rot = (o - ind + num_orientations) % num_orientations;\n    int ind_rot_plus = 
(ind_rot + 1 + num_orientations) % num_orientations;\n    scalar_t *offset_bottom_diff =\n        bottom_diff + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot) *\n                          height * width;\n    scalar_t *offset_bottom_diff_plus =\n        bottom_diff + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot_plus) *\n                          height * width;\n    int top_offset =\n        (n * channels * num_orientations + c * num_orientations + o) *\n        pooled_height * pooled_width;\n    const scalar_t *offset_top_diff = top_diff + top_offset;\n    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (num_samples > 0)\n                             ? num_samples\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosTheta = cos(theta);\n    scalar_t sinTheta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta around the center and translate\n        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;\n        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;\n\n        scalar_t w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,\n                                                w4, x_low, x_high, y_low,\n                                                y_high, index);\n\n        scalar_t g1 = top_diff_this_bin * w1 / count;\n        scalar_t g2 = top_diff_this_bin * w2 / count;\n        scalar_t g3 = top_diff_this_bin * w3 / count;\n        scalar_t g4 = top_diff_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);\n          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);\n          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);\n          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);\n\n          atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,\n                    g1 * l_var);\n          atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,\n                    g2 * l_var);\n          atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,\n                    g3 * l_var);\n          
atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,\n                    g4 * l_var);\n\n        }  // if\n      }  // ix\n    }  // iy\n  }  // CUDA_1D_KERNEL_LOOP\n}  // RiRoIAlignBackward\n\n#endif  // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_ALIGN_CUDA_KERNEL_CUH\n#define ROI_ALIGN_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\n/*** Forward ***/\ntemplate <typename T>\n__global__ void roi_align_forward_cuda_kernel(\n    const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,\n    T* argmax_x, const int pooled_height, const int pooled_width,\n    const T spatial_scale, const int sampling_ratio,\n    const int pool_mode,  // 0 - max pool, 1 - avg pool\n    const bool aligned, const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    T roi_start_w = offset_rois[1] * spatial_scale - offset;\n    T roi_start_h = offset_rois[2] * spatial_scale - offset;\n    T roi_end_w = offset_rois[3] * spatial_scale - offset;\n    T roi_end_h = offset_rois[4] * spatial_scale - offset;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n    if (!aligned) {  // for backward-compatibility only\n      roi_width = max(roi_width, (T)1.);\n      roi_height = max(roi_height, (T)1.);\n    }\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    const T* offset_input =\n        input + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_height / pooled_height));\n    int roi_bin_grid_w =\n        (sampling_ratio > 0)\n            ? 
sampling_ratio\n            : static_cast<int>(ceilf(roi_width / pooled_width));\n\n    if (pool_mode == 0) {\n      // We do max pooling inside a bin\n      T maxval = -FLT_MAX;\n      T maxidx_y = -1.f, maxidx_x = -1.f;\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n          T val =\n              bilinear_interpolate(offset_input, height, width, y, x, index);\n          if (val > maxval) {\n            maxval = val;\n            maxidx_y = y;\n            maxidx_x = x;\n          }\n        }\n      }\n      output[index] = maxval;\n      argmax_y[index] = maxidx_y;\n      argmax_x[index] = maxidx_x;\n    } else if (pool_mode == 1) {\n      // We do average pooling inside a bin\n      const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);\n      T output_val = 0.;\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n          T val =\n              bilinear_interpolate(offset_input, height, width, y, x, index);\n          output_val += val;\n        }\n      }\n      output[index] = output_val / count;\n    }\n  }\n}\n\n/*** Backward ***/\ntemplate <typename T>\n__global__ void roi_align_backward_cuda_kernel(\n    const int nthreads, const T* grad_output, const 
T* rois, const T* argmax_y,\n    const T* argmax_x, T* grad_input, const int pooled_height,\n    const int pooled_width, const T spatial_scale, const int sampling_ratio,\n    const int pool_mode,  // 0 - max pool, 1 - avg pool\n    const bool aligned, const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T grad_output_this_bin = grad_output[index];\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n    T* offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    if (pool_mode == 0) {\n      T y = argmax_y[index], x = argmax_x[index];\n      if (y != -1.f) {\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_grad_input + y_low * width + x_low,\n                    grad_output_this_bin * w1);\n          atomicAdd(offset_grad_input + y_low * width + x_high,\n                    grad_output_this_bin * w2);\n          atomicAdd(offset_grad_input + y_high * width + x_low,\n                    grad_output_this_bin * w3);\n          atomicAdd(offset_grad_input + y_high * width + x_high,\n                    grad_output_this_bin * w4);\n        }\n      }\n    } else if (pool_mode == 1) {\n      // Do not using rounding; this implementation detail is critical\n      T offset = aligned ? 
(T)0.5 : (T)0.0;\n      T roi_start_w = offset_rois[1] * spatial_scale - offset;\n      T roi_start_h = offset_rois[2] * spatial_scale - offset;\n      T roi_end_w = offset_rois[3] * spatial_scale - offset;\n      T roi_end_h = offset_rois[4] * spatial_scale - offset;\n\n      T roi_width = roi_end_w - roi_start_w;\n      T roi_height = roi_end_h - roi_start_h;\n      if (!aligned) {  // for backward-compatibility only\n        roi_width = max(roi_width, (T)1.);\n        roi_height = max(roi_height, (T)1.);\n      }\n\n      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n      // We use roi_bin_grid to sample the grid and mimic integral\n      int roi_bin_grid_h =\n          (sampling_ratio > 0)\n              ? sampling_ratio\n              : static_cast<int>(ceilf(roi_height / pooled_height));\n      int roi_bin_grid_w =\n          (sampling_ratio > 0)\n              ? sampling_ratio\n              : static_cast<int>(ceilf(roi_width / pooled_width));\n\n      // We do average (integral) pooling inside a bin\n      const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n\n          T w1, w2, w3, w4;\n          int x_low, x_high, y_low, y_high;\n          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                        x_low, x_high, y_low, y_high, index);\n\n          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n            atomicAdd(offset_grad_input + y_low * width + x_low,\n                      grad_output_this_bin * w1 / count);\n            atomicAdd(offset_grad_input + y_low * width + x_high,\n                      grad_output_this_bin * w2 / count);\n            atomicAdd(offset_grad_input + y_high * width + x_low,\n                      grad_output_this_bin * w3 / count);\n            atomicAdd(offset_grad_input + y_high * width + x_high,\n                      grad_output_this_bin * w4 / count);\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif  // ROI_ALIGN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh",
    "content": "// Modified from\n// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated\n// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH\n#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH\n\n#include <float.h>\n#ifdef MMCV_WITH_TRT\n#include \"common_cuda_helper.hpp\"\n#else  // MMCV_WITH_TRT\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else  // MMCV_USE_PARROTS\n#include \"pytorch_cuda_helper.hpp\"\n#endif  // MMCV_USE_PARROTS\n#endif  // MMCV_WITH_TRT\n\n/*** Forward ***/\ntemplate <typename scalar_t>\n__global__ void roi_align_rotated_forward_cuda_kernel(\n    const int nthreads, const scalar_t *bottom_data,\n    const scalar_t *bottom_rois, const scalar_t spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, scalar_t *top_data) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    scalar_t offset = aligned ? 
(scalar_t)0.5 : (scalar_t)0.0;\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    if (!aligned) {  // for backward-compatibility only\n      // Force malformed ROIs to be 1x1\n      roi_width = max(roi_width, (scalar_t)1.);\n      roi_height = max(roi_height, (scalar_t)1.);\n    }\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    const scalar_t *offset_bottom_data =\n        bottom_data + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosscalar_theta = cos(theta);\n    scalar_t sinscalar_theta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    scalar_t output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta (counterclockwise) around the center and translate\n        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;\n        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;\n\n        scalar_t val = bilinear_interpolate<scalar_t>(\n            offset_bottom_data, height, width, y, x, index);\n        output_val += val;\n      }\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\n/*** Backward ***/\ntemplate <typename scalar_t>\n__global__ void roi_align_rotated_backward_cuda_kernel(\n    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,\n    const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,\n    const bool clockwise, const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not round\n    scalar_t offset = aligned ? 
(scalar_t)0.5 : (scalar_t)0.0;\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    if (!aligned) {  // for backward-compatibility only\n      // Force malformed ROIs to be 1x1\n      roi_width = max(roi_width, (scalar_t)1.);\n      roi_height = max(roi_height, (scalar_t)1.);\n    }\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    scalar_t *offset_bottom_diff =\n        bottom_diff + (roi_batch_ind * channels + c) * height * width;\n\n    int top_offset = (n * channels + c) * pooled_height * pooled_width;\n    const scalar_t *offset_top_diff = top_diff + top_offset;\n    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? 
sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosTheta = cos(theta);\n    scalar_t sinTheta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta around the center and translate\n        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;\n        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;\n\n        scalar_t w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,\n                                                w4, x_low, x_high, y_low,\n                                                y_high, index);\n\n        scalar_t g1 = top_diff_this_bin * w1 / count;\n        scalar_t g2 = top_diff_this_bin * w2 / count;\n        scalar_t g3 = top_diff_this_bin * w3 / count;\n        scalar_t g4 = top_diff_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);\n          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);\n          atomicAdd(offset_bottom_diff 
+ y_high * width + x_low, g3);\n          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);\n        }  // if\n      }  // ix\n    }  // iy\n  }  // CUDA_1D_KERNEL_LOOP\n}  // RoIAlignBackward\n\n#endif  // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_POOL_CUDA_KERNEL_CUH\n#define ROI_POOL_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void roi_pool_forward_cuda_kernel(\n    const int nthreads, const T* input, const T* rois, T* output, int* argmax,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n    // calculate the roi region on feature maps\n    T roi_x1 = offset_rois[1] * spatial_scale;\n    T roi_y1 = offset_rois[2] * spatial_scale;\n    T roi_x2 = (offset_rois[3] + 1) * spatial_scale;\n    T roi_y2 = (offset_rois[4] + 1) * spatial_scale;\n\n    // force malformed rois to be 1x1\n    T roi_w = roi_x2 - roi_x1;\n    T roi_h = roi_y2 - roi_y1;\n    if (roi_w <= 0 || roi_h <= 0) continue;\n\n    T bin_size_w = roi_w / static_cast<T>(pooled_width);\n    T bin_size_h = roi_h / static_cast<T>(pooled_height);\n\n    // the corresponding bin region\n    int bin_x1 = floorf(static_cast<T>(pw) * bin_size_w + roi_x1);\n    int bin_y1 = floorf(static_cast<T>(ph) * bin_size_h + roi_y1);\n    int bin_x2 = ceilf(static_cast<T>(pw + 1) * bin_size_w + roi_x1);\n    int bin_y2 = ceilf(static_cast<T>(ph + 1) * bin_size_h + roi_y1);\n\n    // add roi offsets and clip to input boundaries\n    bin_x1 = min(max(bin_x1, 0), width);\n    bin_y1 = min(max(bin_y1, 0), height);\n    bin_x2 = min(max(bin_x2, 0), width);\n    bin_y2 = min(max(bin_y2, 0), 
height);\n    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);\n\n    const T* offset_input =\n        input + (roi_batch_ind * channels + c) * height * width;\n    // Define an empty pooling region to be zero\n    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd\n    T max_val = is_empty ? 0 : -FLT_MAX;\n    int max_idx = -1;\n    for (int h = bin_y1; h < bin_y2; ++h) {\n      for (int w = bin_x1; w < bin_x2; ++w) {\n        int offset = h * width + w;\n        if (offset_input[offset] > max_val) {\n          max_val = offset_input[offset];\n          max_idx = offset;\n        }\n      }\n    }\n    output[index] = max_val;\n    if (argmax != NULL) argmax[index] = max_idx;\n  }\n}\n\ntemplate <typename T>\n__global__ void roi_pool_backward_cuda_kernel(\n    const int nthreads, const T* grad_output, const T* rois, const int* argmax,\n    T* grad_input, const int pooled_height, const int pooled_width,\n    const int channels, const int height, const int width) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c) is an element in the pooled output\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    int roi_batch_ind = rois[n * 5];\n    T* grad_input_offset =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n    int argmax_index = argmax[index];\n\n    if (argmax_index != -1) {\n      atomicAdd(grad_input_offset + argmax_index, grad_output[index]);\n    }\n  }\n}\n\n#endif  // ROI_POOL_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROIAWARE_POOL3D_CUDA_KERNEL_CUH\n#define ROIAWARE_POOL3D_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,\n                                             T &local_x, T &local_y) {\n  T cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ntemplate <typename T>\n__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,\n                                        T &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,\n  // cz in the bottom center\n  T x = pt[0], y = pt[1], z = pt[2];\n  T cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];\n  cz += z_size /\n        2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > z_size / 2.0) return 0;\n  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);\n  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &\n                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);\n  return in_flag;\n}\n\ntemplate <typename T>\n__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,\n                                            int out_x, int out_y, int out_z,\n                                            const T *rois, const T *pts,\n                                            int *pts_mask) {\n  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,\n  // npoints): -1 means point does not in this box, otherwise: encode (x_idxs,\n  // y_idxs, z_idxs) by binary bit\n  int 
box_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (box_idx >= boxes_num) return;\n\n    pts += pt_idx * 3;\n    rois += box_idx * 7;\n    pts_mask += box_idx * pts_num + pt_idx;\n\n    T local_x = 0, local_y = 0;\n    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);\n\n    pts_mask[0] = -1;\n    if (cur_in_flag > 0) {\n      T local_z = pts[2] - rois[2];\n      T x_size = rois[3], y_size = rois[4], z_size = rois[5];\n\n      T x_res = x_size / out_x;\n      T y_res = y_size / out_y;\n      T z_res = z_size / out_z;\n\n      unsigned int x_idx = int((local_x + x_size / 2) / x_res);\n      unsigned int y_idx = int((local_y + y_size / 2) / y_res);\n      unsigned int z_idx = int(local_z / z_res);\n\n      x_idx = min(max(x_idx, 0), out_x - 1);\n      y_idx = min(max(y_idx, 0), out_y - 1);\n      z_idx = min(max(z_idx, 0), out_z - 1);\n\n      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;\n\n      pts_mask[0] = idx_encoding;\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,\n                                             int max_pts_each_voxel, int out_x,\n                                             int out_y, int out_z,\n                                             const int *pts_mask,\n                                             T *pts_idx_of_voxels) {\n  // params pts_mask: (N, npoints)  0 or 1\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {\n    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;\n\n    for (int k = 0; k < pts_num; k++) {\n      if (pts_mask[box_idx * pts_num + k] != -1) {\n        unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];\n        unsigned int x_idx = (idx_encoding >> 16) & 0xFF;\n        unsigned int y_idx = (idx_encoding >> 8) & 0xFF;\n   
     unsigned int z_idx = idx_encoding & 0xFF;\n        unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +\n                                   y_idx * out_z * max_pts_each_voxel +\n                                   z_idx * max_pts_each_voxel;\n        unsigned int cnt = pts_idx_of_voxels[base_offset];\n        if (cnt < max_num_pts) {\n          pts_idx_of_voxels[base_offset + cnt + 1] = k;\n          pts_idx_of_voxels[base_offset]++;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,\n                                   int max_pts_each_voxel, int out_x, int out_y,\n                                   int out_z, const T *pts_feature,\n                                   const int *pts_idx_of_voxels,\n                                   T *pooled_features, int *argmax) {\n  // params pts_feature: (npoints, C)\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),\n  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)\n  // params argmax: (N, out_x, out_y, out_z, C)\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +\n                         offset_base * max_pts_each_voxel;\n    pooled_features += box_idx * out_x * out_y * out_z * channels +\n                       offset_base * channels + channel_idx;\n    argmax += box_idx * out_x * out_y * out_z * channels +\n              offset_base * channels + channel_idx;\n\n    int argmax_idx = -1;\n    float max_val = 
-1e50;\n\n    int total_pts = pts_idx_of_voxels[0];\n\n    for (int k = 1; k <= total_pts; k++) {\n      if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >\n          max_val) {\n        max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];\n        argmax_idx = pts_idx_of_voxels[k];\n      }\n    }\n\n    if (argmax_idx != -1) {\n      pooled_features[0] = max_val;\n    }\n    argmax[0] = argmax_idx;\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,\n                                   int max_pts_each_voxel, int out_x, int out_y,\n                                   int out_z, const T *pts_feature,\n                                   const int *pts_idx_of_voxels,\n                                   T *pooled_features) {\n  // params pts_feature: (npoints, C)\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),\n  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)\n  // params argmax: (N, out_x, out_y, out_z, C)\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +\n                         offset_base * max_pts_each_voxel;\n    pooled_features += box_idx * out_x * out_y * out_z * channels +\n                       offset_base * channels + channel_idx;\n\n    float sum_val = 0;\n    int total_pts = pts_idx_of_voxels[0];\n\n    for (int k = 1; k <= total_pts; k++) {\n      sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];\n    }\n\n    if (total_pts > 0) {\n    
  pooled_features[0] = sum_val / total_pts;\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,\n                                            int out_x, int out_y, int out_z,\n                                            const int *argmax,\n                                            const T *grad_out, T *grad_in) {\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    argmax += box_idx * out_x * out_y * out_z * channels +\n              offset_base * channels + channel_idx;\n    grad_out += box_idx * out_x * out_y * out_z * channels +\n                offset_base * channels + channel_idx;\n\n    if (argmax[0] == -1) return;\n\n    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,\n                                            int out_x, int out_y, int out_z,\n                                            int max_pts_each_voxel,\n                                            const int *pts_idx_of_voxels,\n                                            const T *grad_out, T *grad_in) {\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * 
out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +\n                         offset_base * max_pts_each_voxel;\n    grad_out += box_idx * out_x * out_y * out_z * channels +\n                offset_base * channels + channel_idx;\n\n    int total_pts = pts_idx_of_voxels[0];\n    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);\n    for (int k = 1; k <= total_pts; k++) {\n      atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,\n                grad_out[0] * cur_grad);\n    }\n  }\n}\n\n#endif  // ROIAWARE_POOL3D_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROIPOINT_POOL3D_CUDA_KERNEL_CUH\n#define ROIPOINT_POOL3D_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,\n                                             T &local_x, T &local_y) {\n  T cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ntemplate <typename T>\n__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,\n                                        T &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the\n  // bottom center\n  T x = pt[0], y = pt[1], z = pt[2];\n  T cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];\n  cz += dz / 2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > dz / 2.0) return 0;\n  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);\n  T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &\n              (local_y > -dy / 2.0) & (local_y < dy / 2.0);\n  return in_flag;\n}\n\ntemplate <typename T>\n__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,\n                                    const T *xyz, const T *boxes3d,\n                                    int *pts_assign) {\n  // params xyz: (B, N, 3)\n  // params boxes3d: (B, M, 7)\n  // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means\n  // background points\n  int box_idx = blockIdx.y;\n  int bs_idx = blockIdx.z;\n  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (box_idx >= boxes_num || bs_idx >= batch_size) return;\n\n    int assign_idx =\n        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;\n   
 pts_assign[assign_idx] = 0;\n\n    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;\n    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;\n\n    T local_x = 0, local_y = 0;\n    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,\n                                        local_x, local_y);\n    pts_assign[assign_idx] = cur_in_flag;\n  }\n}\n\n__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,\n                               int sampled_pts_num, const int *pts_assign,\n                               int *pts_idx, int *pooled_empty_flag) {\n  // params xyz: (B, N, 3)\n  // params pts_feature: (B, N, C)\n  // params pts_assign: (B, N, M)\n  // params pts_idx: (B, M, 512)\n  // params pooled_empty_flag: (B, M)\n  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {\n    int bs_idx = blockIdx.y;\n\n    int cnt = 0;\n    for (int k = 0; k < pts_num; k++) {\n      if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +\n                     boxes_idx]) {\n        if (cnt < sampled_pts_num) {\n          pts_idx[bs_idx * boxes_num * sampled_pts_num +\n                  boxes_idx * sampled_pts_num + cnt] = k;\n          cnt++;\n        } else\n          break;\n      }\n    }\n\n    if (cnt == 0) {\n      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;\n    } else if (cnt < sampled_pts_num) {\n      // duplicate same points for sampling\n      for (int k = cnt; k < sampled_pts_num; k++) {\n        int duplicate_idx = k % cnt;\n        int base_offset =\n            bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;\n        pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void roipoint_pool3d_forward(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature,\n    T *pooled_features, int *pooled_empty_flag) {\n  // params xyz: (B, N, 
3)\n  // params pts_idx: (B, M, 512)\n  // params pts_feature: (B, N, C)\n  // params pooled_features: (B, M, 512, 3+C)\n  // params pooled_empty_flag: (B, M)\n  int box_idx = blockIdx.y;\n  int bs_idx = blockIdx.z;\n  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {\n    if (box_idx >= boxes_num || bs_idx >= batch_size) return;\n    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;\n\n    int temp_idx = bs_idx * boxes_num * sampled_pts_num +\n                   box_idx * sampled_pts_num + sample_pt_idx;\n    int src_pt_idx = pts_idx[temp_idx];\n    int dst_feature_offset = temp_idx * (3 + feature_in_len);\n\n    for (int j = 0; j < 3; j++)\n      pooled_features[dst_feature_offset + j] =\n          xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];\n\n    int src_feature_offset =\n        bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;\n    memcpy(pooled_features + dst_feature_offset + 3,\n           pts_feature + src_feature_offset, feature_in_len * sizeof(T));\n  }\n}\n\n#endif  // ROIPOINT_POOL3D_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu\n#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH\n#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename scalar_t>\n__global__ void rotated_feature_align_forward_kernel(\n    const int nthreads, const int points, const scalar_t* bottom_data,\n    const scalar_t* best_bboxes, const scalar_t spatial_scale,\n    const int channels, const int height, const int width, scalar_t* top_data) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    const scalar_t* bbox_offset =\n        best_bboxes + ((n * height + h) * width + w) * 5;\n    scalar_t roi_y = bbox_offset[0] * spatial_scale;\n    scalar_t roi_x = bbox_offset[1] * spatial_scale;\n\n    scalar_t px[5] = {roi_x, 0, 0, 0, 0};\n    scalar_t py[5] = {roi_y, 0, 0, 0, 0};\n\n    if (points > 1) {\n      scalar_t roi_w = bbox_offset[2] * spatial_scale;\n      scalar_t roi_h = bbox_offset[3] * spatial_scale;\n      scalar_t roi_a = bbox_offset[4];\n\n      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;\n      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);\n      scalar_t wx = cosa * w_2, wy = sina * w_2;\n      scalar_t hx = -sina * h_2, hy = cosa * h_2;\n\n      px[1] = roi_x + wx + hx;\n      py[1] = roi_y + wy + hy;\n      px[2] = roi_x - wx + hx;\n      py[2] = roi_y - wy + hy;\n      px[3] = roi_x - wx - hx;\n      py[3] = roi_y - wy - hy;\n      px[4] = roi_x + wx - hx;\n      py[4] = roi_y + wy - hy;\n    }\n\n    const scalar_t* offset_bottom_data =\n        bottom_data + (n * channels + c) * height * width;\n\n    scalar_t output_val = 
bottom_data[index];\n    for (int i = 0; i < points; i++) {\n      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,\n                                                   width, py[i], px[i], i);\n    }\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void rotated_feature_align_backward_kernel(\n    const int nthreads, const int points, const scalar_t* top_diff,\n    const scalar_t* best_bboxes, const scalar_t spatial_scale,\n    const int channels, const int height, const int width,\n    scalar_t* bottom_diff) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    const scalar_t* bbox_offset =\n        best_bboxes + ((n * height + h) * width + w) * 5;\n    scalar_t roi_y = bbox_offset[0] * spatial_scale;\n    scalar_t roi_x = bbox_offset[1] * spatial_scale;\n\n    scalar_t px[5] = {roi_x, 0, 0, 0, 0};\n    scalar_t py[5] = {roi_y, 0, 0, 0, 0};\n\n    if (points > 1) {\n      scalar_t roi_w = bbox_offset[2] * spatial_scale;\n      scalar_t roi_h = bbox_offset[3] * spatial_scale;\n      scalar_t roi_a = bbox_offset[4];\n\n      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;\n      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);\n      scalar_t wx = cosa * w_2, wy = sina * w_2;\n      scalar_t hx = -sina * h_2, hy = cosa * h_2;\n\n      px[1] = roi_x + wx + hx;\n      py[1] = roi_y + wy + hy;\n      px[2] = roi_x - wx + hx;\n      py[2] = roi_y - wy + hy;\n      px[3] = roi_x - wx - hx;\n      py[3] = roi_y - wy - hy;\n      px[4] = roi_x + wx - hx;\n      py[4] = roi_y + wy - hy;\n    }\n\n    scalar_t* offset_bottom_diff =\n        bottom_diff + (n * channels + c) * height * width;\n    scalar_t value_top_diff = top_diff[index];\n\n    atomicAdd(bottom_diff + index, value_top_diff);\n    for (int i = 0; i < points; i++) {\n      scalar_t w1, w2, w3, 
w4;\n      int x_low, x_high, y_low, y_high;\n\n      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,\n                                              w2, w3, w4, x_low, x_high, y_low,\n                                              y_high, i);\n      scalar_t g1 = value_top_diff * w1;\n      scalar_t g2 = value_top_diff * w2;\n      scalar_t g3 = value_top_diff * w3;\n      scalar_t g4 = value_top_diff * w4;\n      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n        atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);\n        atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);\n        atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);\n        atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);\n      }\n    }\n  }\n}\n#endif  // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SCATTER_POINTS_CUDA_KERNEL_CUH\n#define SCATTER_POINTS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\nint const maxGridDim = 50000;\n\n__device__ __forceinline__ static void reduceMax(float *address, float val) {\n  int *address_as_i = reinterpret_cast<int *>(address);\n  int old = *address_as_i, assumed;\n  do {\n    assumed = old;\n    old = atomicCAS(address_as_i, assumed,\n                    __float_as_int(fmaxf(val, __int_as_float(assumed))));\n  } while (assumed != old || __int_as_float(old) < val);\n}\n\n__device__ __forceinline__ static void reduceMax(double *address, double val) {\n  unsigned long long *address_as_ull =\n      reinterpret_cast<unsigned long long *>(address);\n  unsigned long long old = *address_as_ull, assumed;\n  do {\n    assumed = old;\n    old = atomicCAS(\n        address_as_ull, assumed,\n        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));\n  } while (assumed != old || __longlong_as_double(old) < val);\n}\n\n// get rid of meaningless warnings when compiling host code\n#ifdef MMCV_WITH_HIP\n__device__ __forceinline__ static void reduceAdd(float *address, float val) {\n  atomicAdd(address, val);\n}\n__device__ __forceinline__ static void reduceAdd(double *address, double val) {\n  atomicAdd(address, val);\n}\n#else\n#ifdef __CUDA_ARCH__\n__device__ __forceinline__ static void reduceAdd(float *address, float val) {\n#if (__CUDA_ARCH__ < 200)\n#ifdef _MSC_VER\n#pragma message( \\\n    \"compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32\")\n#else\n#warning \\\n    \"compute capability lower than 2.x. 
fall back to use CAS version of atomicAdd for float32\"\n#endif\n  int *address_as_i = reinterpret_cast<int *>(address);\n  int old = *address_as_i, assumed;\n  do {\n    assumed = old;\n    old = atomicCAS(address_as_i, assumed,\n                    __float_as_int(val + __int_as_float(assumed)));\n  } while (assumed != old);\n#else\n  atomicAdd(address, val);\n#endif\n}\n\n__device__ __forceinline__ static void reduceAdd(double *address, double val) {\n#if (__CUDA_ARCH__ < 600)\n#ifdef _MSC_VER\n#pragma message( \\\n    \"compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64\")\n#else\n#warning \\\n    \"compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64\"\n#endif\n  unsigned long long *address_as_ull =\n      reinterpret_cast<unsigned long long *>(address);\n  unsigned long long old = *address_as_ull, assumed;\n  do {\n    assumed = old;\n    old = atomicCAS(address_as_ull, assumed,\n                    __double_as_longlong(val + __longlong_as_double(assumed)));\n  } while (assumed != old);\n#else\n  atomicAdd(address, val);\n#endif\n}\n#endif  // __CUDA_ARCH__\n#endif  // MMCV_WITH_HIP\n\ntemplate <typename T>\n__global__ void feats_reduce_kernel(\n    const T *feats, const int32_t *coors_map,\n    T *reduced_feats,  // shall be 0 at initialization\n    const int num_input, const int num_feats, const reduce_t reduce_type) {\n  CUDA_1D_KERNEL_LOOP(x, num_input) {\n    int32_t reduce_to = coors_map[x];\n    if (reduce_to == -1) continue;\n\n    const T *feats_offset = feats + x * num_feats;\n    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;\n    if (reduce_type == reduce_t::MAX) {\n      for (int i = 0; i < num_feats; i++) {\n        reduceMax(&reduced_feats_offset[i], feats_offset[i]);\n      }\n    } else {\n      for (int i = 0; i < num_feats; i++) {\n        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);\n      }\n    }\n  }\n}\n\ntemplate <typename 
T>\n__global__ void add_reduce_traceback_grad_kernel(\n    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,\n    const int32_t *reduce_count, const int num_input, const int num_feats,\n    const reduce_t reduce_type) {\n  CUDA_1D_KERNEL_LOOP(x, num_input) {\n    int32_t reduce_to = coors_map[x];\n    if (reduce_to == -1) {\n      continue;\n    }\n\n    const int input_offset = x * num_feats;\n    T *grad_feats_offset = grad_feats + input_offset;\n    const int reduced_offset = reduce_to * num_feats;\n    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;\n\n    if (reduce_type == reduce_t::SUM) {\n      for (int i = 0; i < num_feats; i++) {\n        grad_feats_offset[i] = grad_reduced_feats_offset[i];\n      }\n    } else if (reduce_type == reduce_t::MEAN) {\n      for (int i = 0; i < num_feats; i++) {\n        grad_feats_offset[i] = grad_reduced_feats_offset[i] /\n                               static_cast<T>(reduce_count[reduce_to]);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void max_reduce_traceback_scatter_idx_kernel(\n    const T *feats, const T *reduced_feats, int32_t *reduce_from,\n    const int32_t *coors_map, const int num_input, const int num_feats) {\n  CUDA_1D_KERNEL_LOOP(x, num_input) {\n    int32_t reduce_to = coors_map[x];\n\n    const int input_offset = x * num_feats;\n    const T *feats_offset = feats + input_offset;\n\n    if (reduce_to == -1) {\n      continue;\n    }\n\n    const int reduced_offset = reduce_to * num_feats;\n    const T *reduced_feats_offset = reduced_feats + reduced_offset;\n    int32_t *reduce_from_offset = reduce_from + reduced_offset;\n\n    for (int i = 0; i < num_feats; i++) {\n      if (feats_offset[i] == reduced_feats_offset[i]) {\n        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,\n                                               
const T *grad_reduced_feats,\n                                               const int32_t *reduce_from,\n                                               const int num_reduced,\n                                               const int num_feats) {\n  CUDA_1D_KERNEL_LOOP(x, num_reduced) {\n    const int reduced_offset = x * num_feats;\n    const int32_t *scatter_to_offset = reduce_from + reduced_offset;\n    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;\n\n    for (int i = 0; i < num_feats; i++) {\n      grad_feats[scatter_to_offset[i] * num_feats + i] =\n          grad_reduced_feats_offset[i];\n    }\n  }\n}\n\n#endif  // SCATTER_POINTS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH\n#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void sigmoid_focal_loss_forward_cuda_kernel(\n    const int nthreads, const T* input, const int64_t* target, const T* weight,\n    T* output, const T gamma, const T alpha, const int num_classes) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int n = index / num_classes;\n    int c = index % num_classes;\n\n    int64_t t = target[n];\n    T flag_p = (t == c);\n    T flag_n = (t != c);\n\n    // p = sigmoid(x) = 1. / 1. + expf(-x)\n    T p = (T)1. / ((T)1. + expf(-input[index]));\n\n    // (1 - p)**gamma * log(p)\n    T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN));\n    // p**gamma * log(1 - p)\n    T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN));\n\n    output[index] = (T)0.;\n    output[index] += -flag_p * alpha * term_p;\n    output[index] += -flag_n * ((T)1. - alpha) * term_n;\n    if (weight != NULL) {\n      output[index] *= weight[t];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void sigmoid_focal_loss_backward_cuda_kernel(\n    const int nthreads, const T* input, const int64_t* target, const T* weight,\n    T* grad_input, const T gamma, const T alpha, const int num_classes) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int n = index / num_classes;\n    int c = index % num_classes;\n\n    int64_t t = target[n];\n    T flag_p = (t == c);\n    T flag_n = (t != c);\n\n    // p = sigmoid(x) = 1. / 1. + expf(-x)\n    T p = (T)1. / ((T)1. + exp(-input[index]));\n\n    // (1 - p)**gamma * (1 - p - gamma*p*log(p))\n    T term_p = pow(((T)1. - p), gamma) *\n               ((T)1. 
- p - (gamma * p * log(max(p, (T)FLT_MIN))));\n    // p**gamma * (gamma * (1 - p) * log(1 - p) - p)\n    T term_n = pow(p, gamma) *\n               (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p);\n\n    grad_input[index] = (T)0.;\n    grad_input[index] += -flag_p * alpha * term_p;\n    grad_input[index] += -flag_n * ((T)1. - alpha) * term_n;\n    if (weight != NULL) {\n      grad_input[index] *= weight[t];\n    }\n  }\n}\n\n#endif  // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH\n#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void softmax_focal_loss_forward_cuda_kernel(\n    const int nthreads, const T* softmax, const int64_t* target,\n    const T* weight, T* output, const T gamma, const T alpha,\n    const int num_classes) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int64_t label = target[index];\n    T pred = softmax[index * num_classes + label];\n\n    if (label >= 0) {\n      output[index] =\n          -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN));\n    } else {\n      output[index] = 0;\n    }\n    if (weight != NULL) {\n      output[index] *= weight[label];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void softmax_focal_loss_backward_cuda1_kernel(\n    const int nthreads, const T* softmax, const int64_t* target,\n    const T* weight, T* buff, const T gamma, const T alpha,\n    const int num_classes) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int64_t label = target[index];\n    T pred = softmax[index * num_classes + label];\n\n    if (label >= 0) {\n      buff[index] = alpha * (-pow((T)1. - pred, gamma) +\n                             gamma * pow((T)1. 
- pred, gamma - 1) * pred *\n                                 log(max(pred, (T)FLT_MIN)));\n    } else {\n      buff[index] = 0;\n    }\n    if (weight != NULL) {\n      buff[index] *= weight[label];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void softmax_focal_loss_backward_cuda2_kernel(\n    const int nthreads, const T* softmax, const int64_t* target, const T* buff,\n    T* grad_input, const int num_classes) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    int n = index / num_classes;\n    int c = index % num_classes;\n    int64_t label = target[n];\n\n    if (label >= 0) {\n      T flag = (label == c ? (T)1. : (T)0.);\n      grad_input[index] = buff[n] * (flag - softmax[index]);\n    } else {\n      grad_input[index] = 0;\n    }\n  }\n}\n\n#endif  // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/spconv/indice.cuh",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef INDICE_CU_H_\n#define INDICE_CU_H_\n#include <utils/spconv/spconv/geometry.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <utils/spconv/tensorview/helper_kernel.cuh>\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim,\n          int KernelMaxVolume = 256>\n__global__ void prepareIndicePairsKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,\n    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> kernelSize,\n    const tv::SimpleVector<Index, NDim> stride,\n    const tv::SimpleVector<Index, NDim> padding,\n    const tv::SimpleVector<Index, NDim> dilation,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  Index validPoints[KernelMaxVolume * (NDim + 1)];\n  Index *pointPtr = nullptr;\n  auto indicePairsDim2 = indicePairs.dim(2);\n  Index index;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    numValidPoints = 
getValidOutPos<Index, NDim>(\n        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),\n        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),\n        validPoints);\n    for (Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));\n      indicePairs(offset, 0, oldNum) = ix;\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +\n              spatialVolume * indicesIn(ix, 0);\n      indicePairs(offset, 1, oldNum) = index;\n      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim,\n          int KernelMaxVolume = 256>\n__global__ void prepareDeConvIndicePairsKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,\n    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> kernelSize,\n    const tv::SimpleVector<Index, NDim> stride,\n    const tv::SimpleVector<Index, NDim> padding,\n    const tv::SimpleVector<Index, NDim> dilation,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  Index validPoints[KernelMaxVolume * (NDim + 1)];\n  Index *pointPtr = nullptr;\n  auto indicePairsDim2 = indicePairs.dim(2);\n  Index index;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    numValidPoints = getValidOutPosTranspose<Index, NDim>(\n        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),\n        
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),\n        validPoints);\n    for (Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));\n      indicePairs(offset, 0, oldNum) = ix;\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +\n              spatialVolume * indicesIn(ix, 0);\n      indicePairs(offset, 1, oldNum) = index;\n      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void assignGridAndIndiceOutKernel(\n    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,\n    int numAct, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {\n  Index index;\n  auto indicesOutPtr = indicesOut.data();\n  for (int ix : tv::KernelLoopX<int>(numAct)) {\n    index = indicePairUnique[ix];\n    gridsOut[index] = ix;\n    index = tv::rowArrayIdxInv<Index, NDim>(\n        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());\n    indicesOut[ix * (NDim + 1)] = index % batchSize;\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void assignIndicePairsKernel(\n    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,\n    int numActIn, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  Index index;\n  int kernelVolume = indicePairs.dim(0);\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    for (int i = 0; i < kernelVolume; ++i) {\n      index = indicePairs(i, 1, ix);\n      if (index > -1) {\n        indicePairs(i, 1, ix) = gridsOut[index];\n      }\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, 
unsigned NDim>\n__global__ void prepareSubMGridKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index index = 0;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,\n                                         outSpatialShape.data()) +\n            spatialVolume * indicesIn(ix, 0);\n    gridsOut[index] = ix;\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim,\n          int KernelMaxVolume = 256>\n__global__ void getSubMIndicePairsKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,\n    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,\n    const tv::SimpleVector<Index, NDim> kernelSize,\n    const tv::SimpleVector<Index, NDim> stride,\n    const tv::SimpleVector<Index, NDim> padding,\n    const tv::SimpleVector<Index, NDim> dilation,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index numValidPoints = 0;\n  Index validPoints[KernelMaxVolume * (NDim + 1)];\n  Index *pointPtr = nullptr;\n  Index index = 0;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    numValidPoints = getValidOutPos<Index, NDim>(\n        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),\n        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),\n        validPoints);\n    for (int i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, 
outSpatialShape.data()) +\n              spatialVolume * indicesIn(ix, 0);\n      if (gridsOut[index] > -1) {\n        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));\n        indicePairs(offset, 1, oldNum) = gridsOut[index];\n        indicePairs(offset, 0, oldNum) = ix;\n      }\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void resetGridKernel(const Index *indicePairUnique,\n                                tv::TensorView<IndexGrid> gridsOut,\n                                int numAct) {\n  for (int ix : tv::KernelLoopX<int>(numAct)) {\n    gridsOut[indicePairUnique[ix]] = -1;\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void resetGridSubMKernel(\n    const Index *indices, tv::TensorView<IndexGrid> gridsOut,\n    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {\n  int outSpatialShapeReg[NDim];\n  for (int i = 0; i < NDim; ++i) {\n    outSpatialShapeReg[i] = outSpatialShape[i];\n  }\n  Index spatialVolume = 1;\n  auto indsPtr = indices;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index index;\n  for (int ix : tv::KernelLoopX<int>(numAct)) {\n    indsPtr = indices + ix * (NDim + 1);\n    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);\n    gridsOut[index + spatialVolume * indsPtr[0]] = -1;\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/spconv/reordering.cuh",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef REORDERING_CU_H_\n#define REORDERING_CU_H_\n#include <utils/spconv/tensorview/helper_kernel.cuh>\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,\n                                    const Index *indices, int size,\n                                    int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index inds[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n\n  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < size)\n        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < size)\n          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =\n              features[inds[ilp] + iy];\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType>\n__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,\n                                const Index *indices, int size, int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index inds[NumILP];\n#pragma 
unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n\n  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < size)\n        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < size)\n          reinterpret_cast<VecType *>(\n              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =\n              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType = int4>\n__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features,\n                                     const Index *indices, int size,\n                                     int numPlanes) {\n  int ILPStrideY[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;\n  features += blockIdx.x * NumTLP;\n  buffer += blockIdx.x * NumTLP;\n\n  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      reinterpret_cast<VecType *>(\n          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =\n          reinterpret_cast<const VecType *>(\n              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +\n                        threadIdx.x];\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void scatterAddGenericKernel(scalar_t *outFeatures,\n                                        const scalar_t *buffer,\n                                        const Index *indices, int size,\n                                        int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index 
inds[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < size)\n        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < size) {\n          outFeatures[inds[ilp] + iy] +=\n              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType = int4>\n__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,\n                                         const scalar_t *buffer,\n                                         const Index *indices, int size,\n                                         int numPlanes) {\n  int ILPStrideY[NumILP];\n  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;\n  outFeatures += blockIdx.x * NumTLP;\n  buffer += blockIdx.x * NumTLP;\n  scalar_t buf[vecloadFactor];\n  scalar_t buf2[vecloadFactor];\n  Index idx;\n  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      reinterpret_cast<VecType *>(buf)[0] =\n          reinterpret_cast<VecType *>(outFeatures)[idx];\n      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(\n          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];\n#pragma unroll\n      for (int i = 0; i < vecloadFactor; i++) {\n        buf[i] += buf2[i];\n      }\n      reinterpret_cast<VecType *>(outFeatures)[idx] =\n          
reinterpret_cast<VecType *>(buf)[0];\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH\n#define STACK_BALL_QUERY_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void stack_ball_query_forward_cuda_kernel(\n    int B, int M, float radius, int nsample, const T *new_xyz,\n    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,\n    int *idx) {\n  // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features\n  // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]\n  // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query\n  // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]\n  // output:\n  //      idx: (M, nsample)\n  const T *cur_xyz = xyz;\n  int *cur_idx = idx;\n  CUDA_1D_KERNEL_LOOP(pt_idx, M) {\n    int bs_idx = 0;\n    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {\n      pt_cnt += new_xyz_batch_cnt[bs_idx];\n      if (pt_idx < pt_cnt) break;\n    }\n\n    int xyz_batch_start_idx = 0;\n    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];\n\n    const T *new_xyz_p = new_xyz + pt_idx * 3;\n    cur_xyz += xyz_batch_start_idx * 3;\n    cur_idx += pt_idx * nsample;\n\n    float radius2 = radius * radius;\n    T new_x = new_xyz_p[0];\n    T new_y = new_xyz_p[1];\n    T new_z = new_xyz_p[2];\n    int n = xyz_batch_cnt[bs_idx];\n\n    int cnt = 0;\n    for (int k = 0; k < n; ++k) {\n      T x = cur_xyz[k * 3 + 0];\n      T y = cur_xyz[k * 3 + 1];\n      T z = cur_xyz[k * 3 + 2];\n      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +\n             (new_z - z) * (new_z - z);\n      if (d2 < radius2) {\n        if (cnt == 0) {\n          for (int l = 0; l < nsample; ++l) {\n            cur_idx[l] = k;\n          }\n        }\n        cur_idx[cnt] = k;\n  
      ++cnt;\n        if (cnt >= nsample) break;\n      }\n    }\n    if (cnt == 0) cur_idx[0] = -1;\n  }\n}\n\n#endif  // STACK_BALL_QUERY_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH\n#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n#include <stdio.h>\ntemplate <typename T>\n__global__ void stack_group_points_forward_cuda_kernel(\n    int b, int c, int m, int nsample, const T *features,\n    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,\n    T *out) {\n  // :param features: (N1 + N2 ..., C) tensor of features to group\n  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the\n  // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor\n  // containing the indices of features to group with :param idx_batch_cnt:\n  // (batch_size) [M1 + M2 ...] tensor containing the indices of features to\n  // group with :return:\n  //     output: (M1 + M2, C, nsample) tensor\n  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {\n    const T *cur_features = features;\n    const int *cur_idx = idx;\n    int sample_idx = index % nsample;\n    int c_idx = (index / nsample) % c;\n    int pt_idx = (index / nsample / c);\n\n    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;\n    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];\n    for (int k = 1; k < b; k++) {\n      if (pt_idx < pt_cnt) break;\n      pt_cnt += idx_batch_cnt[k];\n      bs_idx = k;\n    }\n\n    int features_batch_start_idx = 0;\n    int features_batch_end_idx = features_batch_cnt[0];\n    for (int k = 0; k < bs_idx; k++) {\n      features_batch_start_idx += features_batch_cnt[k];\n      features_batch_end_idx =\n          features_batch_start_idx + features_batch_cnt[k + 1];\n    }\n    cur_features += features_batch_start_idx * c;\n\n    cur_idx += pt_idx * nsample + sample_idx;\n    int 
in_idx = cur_idx[0] * c + c_idx;\n    int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx;\n    if (in_idx < features_batch_end_idx * c) {\n      out[out_idx] = cur_features[in_idx];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void stack_group_points_backward_cuda_kernel(\n    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,\n    const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) {\n  // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the\n  //     output from forward\n  // :param idx: (M1 + M2 ..., nsample) tensor containing the indices of\n  //     features to group with\n  // :param idx_batch_cnt: (batch_size) [M1, M2, ...] tensor containing the\n  //     number of idx rows in each batch\n  // :param features_batch_cnt: (batch_size) [N1, N2, ...] tensor containing\n  //     the number of features in each batch\n  // :return:\n  //     grad_features: (N1 + N2 ..., C) gradient of the features\n  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {\n    const T *cur_grad_out = grad_out;\n    const int *cur_idx = idx;\n    T *cur_grad_features = grad_features;\n    int sample_idx = index % nsample;\n    int c_idx = (index / nsample) % c;\n    int pt_idx = (index / nsample / c);\n\n    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;\n\n    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];\n    for (int k = 1; k < b; k++) {\n      if (pt_idx < pt_cnt) break;\n      pt_cnt += idx_batch_cnt[k];\n      bs_idx = k;\n    }\n\n    int features_batch_start_idx = 0;\n    for (int k = 0; k < bs_idx; k++)\n      features_batch_start_idx += features_batch_cnt[k];\n\n    cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx;\n    cur_idx += pt_idx * nsample + sample_idx;\n    cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx;\n\n    atomicAdd(cur_grad_features, cur_grad_out[0]);\n  }\n}\n\n#endif  // STACK_GROUP_POINTS_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SYNCBN_CUDA_KERNEL_CUH\n#define SYNCBN_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean,\n                                                 int num, int channels,\n                                                 int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer[tid] += input[index];\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    mean[c] = buffer[0] / total;\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input,\n                                                 float *mean, int num,\n                                                 int channels, int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer[tid] += static_cast<float>(input[index]);\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    mean[c] = buffer[0] / total;\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_forward_var_cuda_kernel(const T *input,\n                                                const float 
*mean, float *var,\n                                                int num, int channels,\n                                                int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    float td = input[index] - mean[c];\n    buffer[tid] += td * td;\n  }\n  __syncthreads();\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    var[c] = buffer[0] / total;\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_forward_var_cuda_kernel(const phalf *input,\n                                                const float *mean, float *var,\n                                                int num, int channels,\n                                                int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    float td = static_cast<float>(input[index]) - mean[c];\n    buffer[tid] += td * td;\n  }\n  __syncthreads();\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    var[c] = buffer[0] / total;\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_forward_output_cuda_kernel(\n    const T *input, const float *mean, const float *var, float *running_mean,\n    float *running_var, const float *weight, const float *bias, float *norm,\n    float *std, T *output, int num, int channels, int spatial, float eps,\n    float momentum, int group_size) {\n  int 
tid = threadIdx.x;\n  int c = blockIdx.x;\n  float mean_value = mean[c];\n  float std_value = sqrt(var[c] + eps);\n\n  if (weight != nullptr) {\n    float weight_value = weight[c];\n    float bias_value = bias[c];\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        norm[index] = (input[index] - mean_value) / std_value;\n        output[index] = norm[index] * weight_value + bias_value;\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] =\n            (input[index] - mean_value) / std_value * weight_value + bias_value;\n      }\n    }\n  } else {\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] = norm[index] = (input[index] - mean_value) / std_value;\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] = (input[index] - mean_value) / std_value;\n      }\n    }\n  }\n  if (tid == 0) {\n    if (std != nullptr) std[c] = std_value;\n    if (running_mean != nullptr) {\n      running_mean[c] =\n          momentum * mean_value + (1 - momentum) * running_mean[c];\n      int count = num * spatial * group_size;\n      float var_unbias = count > 1 ? 
var[c] * count / (count - 1) : var[c];\n      running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];\n    }\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_forward_output_cuda_kernel(\n    const phalf *input, const float *mean, const float *var,\n    float *running_mean, float *running_var, const float *weight,\n    const float *bias, float *norm, float *std, phalf *output, int num,\n    int channels, int spatial, float eps, float momentum, int group_size) {\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  float mean_value = mean[c];\n  float std_value = sqrt(var[c] + eps);\n  if (weight != nullptr) {\n    float weight_value = weight[c];\n    float bias_value = bias[c];\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        norm[index] =\n            (static_cast<float>(input[index]) - mean_value) / std_value;\n        output[index] =\n            static_cast<phalf>(norm[index] * weight_value + bias_value);\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] =\n            static_cast<phalf>((static_cast<float>(input[index]) - mean_value) /\n                                   std_value * weight_value +\n                               bias_value);\n      }\n    }\n  } else {\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        norm[index] =\n            (static_cast<float>(input[index]) - mean_value) / std_value;\n        output[index] = static_cast<phalf>(norm[index]);\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * 
channels * spatial + c * spatial + i % spatial;\n        output[index] = static_cast<phalf>(\n            (static_cast<float>(input[index]) - mean_value) / std_value);\n      }\n    }\n  }\n  if (tid == 0) {\n    if (std != nullptr) std[c] = std_value;\n    if (running_mean != nullptr) {\n      running_mean[c] =\n          momentum * mean_value + (1 - momentum) * running_mean[c];\n      int count = num * spatial * group_size;\n      float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];\n      running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output,\n                                                   const float *norm,\n                                                   float *grad_weight,\n                                                   float *grad_bias, int num,\n                                                   int channels, int spatial) {\n  __shared__ float buffer1[THREADS_PER_BLOCK];\n  __shared__ float buffer2[THREADS_PER_BLOCK];\n\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer1[tid] = buffer2[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer1[tid] += grad_output[index] * norm[index];\n    buffer2[tid] += grad_output[index];\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer1[tid] += buffer1[tid + s];\n      buffer2[tid] += buffer2[tid + s];\n    }\n    __syncthreads();\n  }\n  if (tid == 0) {\n    grad_weight[c] = buffer1[0];\n    grad_bias[c] = buffer2[0];\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output,\n                                                   const float *norm,\n                                                   float *grad_weight,\n                                 
                  float *grad_bias, int num,\n                                                   int channels, int spatial) {\n  __shared__ float buffer1[THREADS_PER_BLOCK];\n  __shared__ float buffer2[THREADS_PER_BLOCK];\n\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer1[tid] = buffer2[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer1[tid] += static_cast<float>(grad_output[index]) * norm[index];\n    buffer2[tid] += static_cast<float>(grad_output[index]);\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer1[tid] += buffer1[tid + s];\n      buffer2[tid] += buffer2[tid + s];\n    }\n    __syncthreads();\n  }\n  if (tid == 0) {\n    grad_weight[c] = buffer1[0];\n    grad_bias[c] = buffer2[0];\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_backward_data_cuda_kernel(\n    int output_size, const T *grad_output, const float *weight,\n    const float *grad_weight, const float *grad_bias, const float *norm,\n    const float *std, T *grad_input, int num, int channels, int spatial) {\n  int factor = num * spatial;\n  CUDA_1D_KERNEL_LOOP(index, output_size) {\n    int c = (index / spatial) % channels;\n    grad_input[index] =\n        weight[c] *\n        (grad_output[index] -\n         (grad_weight[c] * norm[index] + grad_bias[c]) / factor) /\n        std[c];\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_backward_data_cuda_kernel(\n    int output_size, const phalf *grad_output, const float *weight,\n    const float *grad_weight, const float *grad_bias, const float *norm,\n    const float *std, phalf *grad_input, int num, int channels, int spatial) {\n  int factor = num * spatial;\n  CUDA_1D_KERNEL_LOOP(index, output_size) {\n    int c = (index / spatial) % channels;\n    grad_input[index] = static_cast<phalf>(\n        weight[c] *\n        (static_cast<float>(grad_output[index]) -\n 
        (grad_weight[c] * norm[index] + grad_bias[c]) / factor) /\n        std[c]);\n  }\n}\n\n#endif  // SYNCBN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef THREE_INTERPOLATE_CUDA_KERNEL_CUH\n#define THREE_INTERPOLATE_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void three_interpolate_forward_cuda_kernel(\n    int b, int c, int m, int n, const T *points, const int *__restrict__ idx,\n    const T *weight, T *out) {\n  // points: (B, C, M)\n  // idx: (B, N, 3)\n  // weight: (B, N, 3)\n  // output:\n  //      out: (B, C, N)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, n) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    weight += bs_idx * n * 3 + pt_idx * 3;\n    points += bs_idx * c * m + c_idx * m;\n    idx += bs_idx * n * 3 + pt_idx * 3;\n    out += bs_idx * c * n + c_idx * n;\n\n    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +\n                  weight[2] * points[idx[2]];\n  }\n}\n\ntemplate <typename T>\n__global__ void three_interpolate_backward_cuda_kernel(\n    int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx,\n    const T *weight, T *grad_points) {\n  // grad_out: (B, C, N)\n  // weight: (B, N, 3)\n  // output:\n  //      grad_points: (B, C, M)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, n) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    grad_out += bs_idx * c * n + c_idx * n + pt_idx;\n    weight += bs_idx * n * 3 + pt_idx * 3;\n    grad_points += bs_idx * c * m + c_idx * m;\n    idx += bs_idx * n * 3 + pt_idx * 3;\n\n    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);\n    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);\n    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);\n  }\n}\n\n#endif  // THREE_INTERPOLATE_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef THREE_NN_CUDA_KERNEL_CUH\n#define THREE_NN_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void three_nn_forward_cuda_kernel(int b, int n, int m,\n                                             const T *unknown, const T *known,\n                                             T *dist2, int *__restrict__ idx) {\n  // unknown: (B, N, 3)\n  // known: (B, M, 3)\n  // output:\n  //      dist2: (B, N, 3)\n  //      idx: (B, N, 3)\n\n  int bs_idx = blockIdx.y;\n  CUDA_1D_KERNEL_LOOP(pt_idx, n) {\n    if (bs_idx >= b) return;\n\n    unknown += bs_idx * n * 3 + pt_idx * 3;\n    known += bs_idx * m * 3;\n    dist2 += bs_idx * n * 3 + pt_idx * 3;\n    idx += bs_idx * n * 3 + pt_idx * 3;\n\n    T ux = unknown[0];\n    T uy = unknown[1];\n    T uz = unknown[2];\n\n    double best1 = 1e40, best2 = 1e40, best3 = 1e40;\n    int besti1 = 0, besti2 = 0, besti3 = 0;\n    for (int k = 0; k < m; ++k) {\n      T x = known[k * 3 + 0];\n      T y = known[k * 3 + 1];\n      T z = known[k * 3 + 2];\n      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);\n      if (d < best1) {\n        best3 = best2;\n        besti3 = besti2;\n        best2 = best1;\n        besti2 = besti1;\n        best1 = d;\n        besti1 = k;\n      } else if (d < best2) {\n        best3 = best2;\n        besti3 = besti2;\n        best2 = d;\n        besti2 = k;\n      } else if (d < best3) {\n        best3 = d;\n        besti3 = k;\n      }\n    }\n    dist2[0] = best1;\n    dist2[1] = best2;\n    dist2[2] = best3;\n    idx[0] = besti1;\n    idx[1] = besti2;\n    idx[2] = besti3;\n  }\n}\n\n#endif  // THREE_NN_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef TIN_SHIFT_CUDA_KERNEL_CUH\n#define TIN_SHIFT_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntemplate <typename T>\n__global__ void tin_shift_forward_cuda_kernel(\n    const int nthreads, const T* input, const int* shift, T* output,\n    const int batch_size, const int channels, const int t_size,\n    const int hw_size, const int group_size, const int group_channel) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    const int hw_index = index % hw_size;\n    const int j = (index / hw_size) % channels;\n\n    const int n_index = (index / hw_size / channels) % batch_size;\n    int group_id = j / group_channel;\n    int t_shift = shift[n_index * group_size + group_id];\n    int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index;\n    for (int i = 0; i < t_size; i++) {\n      int now_t = i + t_shift;\n      int data_id = i * hw_size * channels + offset;\n      if (now_t < 0 || now_t >= t_size) {\n        continue;\n      }\n      int out_id = now_t * hw_size * channels + offset;\n      output[out_id] = input[data_id];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void tin_shift_backward_cuda_kernel(\n    const int nthreads, const T* input, const int* shift, T* output,\n    const int batch_size, const int channels, const int t_size,\n    const int hw_size, const int group_size, const int group_channel) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    const int hw_index = index % hw_size;\n    const int j = (index / hw_size) % channels;\n\n    const int n_index = (index / hw_size / channels) % batch_size;\n    int group_id = j / group_channel;\n    int t_shift = shift[n_index * group_size + group_id];\n    int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index;\n    for (int i = 0; i < t_size; i++) {\n      int now_t = i + t_shift;\n      int data_id = i * hw_size * 
channels + offset;\n      if (now_t < 0 || now_t >= t_size) {\n        continue;\n      }\n      int out_id = now_t * hw_size * channels + offset;\n      output[out_id] = input[data_id];\n    }\n  }\n}\n\n#endif  // TIN_SHIFT_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#ifndef VOXELIZATION_CUDA_KERNEL_CUH\n#define VOXELIZATION_CUDA_KERNEL_CUH\n\n#ifdef MMCV_USE_PARROTS\n#include \"parrots_cuda_helper.hpp\"\n#else\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\n\ntemplate <typename T, typename T_int>\n__global__ void dynamic_voxelize_kernel(\n    const T* points, T_int* coors, const float voxel_x, const float voxel_y,\n    const float voxel_z, const float coors_x_min, const float coors_y_min,\n    const float coors_z_min, const float coors_x_max, const float coors_y_max,\n    const float coors_z_max, const int grid_x, const int grid_y,\n    const int grid_z, const int num_points, const int num_features,\n    const int NDim) {\n  //   const int index = blockIdx.x * threadsPerBlock + threadIdx.x;\n  CUDA_1D_KERNEL_LOOP(index, num_points) {\n    // To save some computation\n    auto points_offset = points + index * num_features;\n    auto coors_offset = coors + index * NDim;\n    int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);\n    if (c_x < 0 || c_x >= grid_x) {\n      coors_offset[0] = -1;\n      continue;\n    }\n\n    int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);\n    if (c_y < 0 || c_y >= grid_y) {\n      coors_offset[0] = -1;\n      coors_offset[1] = -1;\n      continue;\n    }\n\n    int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);\n    if (c_z < 0 || c_z >= grid_z) {\n      coors_offset[0] = -1;\n      coors_offset[1] = -1;\n      coors_offset[2] = -1;\n    } else {\n      coors_offset[0] = c_z;\n      coors_offset[1] = c_y;\n      coors_offset[2] = c_x;\n    }\n  }\n}\n\ntemplate <typename T, typename T_int>\n__global__ void assign_point_to_voxel(const int nthreads, const T* points,\n                                      T_int* point_to_voxelidx,\n                                      T_int* coor_to_voxelidx, T* voxels,\n                                      
const int max_points,\n                                      const int num_features,\n                                      const int num_points, const int NDim) {\n  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;\n    int index = thread_idx / num_features;\n\n    int num = point_to_voxelidx[index];\n    int voxelidx = coor_to_voxelidx[index];\n    if (num > -1 && voxelidx > -1) {\n      auto voxels_offset =\n          voxels + voxelidx * max_points * num_features + num * num_features;\n\n      int k = thread_idx % num_features;\n      voxels_offset[k] = points[thread_idx];\n    }\n  }\n}\n\ntemplate <typename T, typename T_int>\n__global__ void assign_voxel_coors(const int nthreads, T_int* coor,\n                                   T_int* point_to_voxelidx,\n                                   T_int* coor_to_voxelidx, T_int* voxel_coors,\n                                   const int num_points, const int NDim) {\n  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;\n    // if (index >= num_points) return;\n    int index = thread_idx / NDim;\n    int num = point_to_voxelidx[index];\n    int voxelidx = coor_to_voxelidx[index];\n    if (num == 0 && voxelidx > -1) {\n      auto coors_offset = voxel_coors + voxelidx * NDim;\n      int k = thread_idx % NDim;\n      coors_offset[k] = coor[thread_idx];\n    }\n  }\n}\n\ntemplate <typename T_int>\n__global__ void point_to_voxelidx_kernel(const T_int* coor,\n                                         T_int* point_to_voxelidx,\n                                         T_int* point_to_pointidx,\n                                         const int max_points,\n                                         const int max_voxels,\n                                         const int num_points, const int NDim) {\n  CUDA_1D_KERNEL_LOOP(index, num_points) {\n    auto coor_offset = coor + index * NDim;\n    // skip 
invalid points\n    if (coor_offset[0] == -1) continue;\n\n    int num = 0;\n    int coor_x = coor_offset[0];\n    int coor_y = coor_offset[1];\n    int coor_z = coor_offset[2];\n    // only calculate the coors before this coor[index]\n    for (int i = 0; i < index; ++i) {\n      auto prev_coor = coor + i * NDim;\n      if (prev_coor[0] == -1) continue;\n\n      // Find all previous points that have the same coors\n      // if find the same coor, record it\n      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&\n          (prev_coor[2] == coor_z)) {\n        num++;\n        if (num == 1) {\n          // point to the same coor that first show up\n          point_to_pointidx[index] = i;\n        } else if (num >= max_points) {\n          // out of boundary\n          break;\n        }\n      }\n    }\n    if (num == 0) {\n      point_to_pointidx[index] = index;\n    }\n    if (num < max_points) {\n      point_to_voxelidx[index] = num;\n    }\n  }\n}\n\ntemplate <typename T_int>\n__global__ void determin_voxel_num(\n    // const T_int* coor,\n    T_int* num_points_per_voxel, T_int* point_to_voxelidx,\n    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,\n    const int max_points, const int max_voxels, const int num_points) {\n  // only calculate the coors before this coor[index]\n  for (int i = 0; i < num_points; ++i) {\n    int point_pos_in_voxel = point_to_voxelidx[i];\n    // record voxel\n    if (point_pos_in_voxel == -1) {\n      // out of max_points or invalid point\n      continue;\n    } else if (point_pos_in_voxel == 0) {\n      // record new voxel\n      int voxelidx = voxel_num[0];\n      if (voxel_num[0] >= max_voxels) continue;\n      voxel_num[0] += 1;\n      coor_to_voxelidx[i] = voxelidx;\n      num_points_per_voxel[voxelidx] = 1;\n    } else {\n      int point_idx = point_to_pointidx[i];\n      int voxelidx = coor_to_voxelidx[point_idx];\n      if (voxelidx != -1) {\n        coor_to_voxelidx[i] = voxelidx;\n        
num_points_per_voxel[voxelidx] += 1;\n      }\n    }\n  }\n}\n\n__global__ void nondeterministic_get_assign_pos(\n    const int nthreads, const int32_t* coors_map, int32_t* pts_id,\n    int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) {\n  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    int coors_idx = coors_map[thread_idx];\n    if (coors_idx > -1) {\n      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);\n      pts_id[thread_idx] = coors_pts_pos;\n      if (coors_pts_pos == 0) {\n        coors_order[coors_idx] = atomicAdd(coors_count, 1);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void nondeterministic_assign_point_voxel(\n    const int nthreads, const T* points, const int32_t* coors_map,\n    const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count,\n    const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count,\n    const int max_voxels, const int max_points, const int num_features,\n    const int NDim) {\n  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    int coors_idx = coors_map[thread_idx];\n    int coors_pts_pos = pts_id[thread_idx];\n    if (coors_idx > -1 && coors_pts_pos < max_points) {\n      int coors_pos = coors_order[coors_idx];\n      if (coors_pos < max_voxels) {\n        auto voxels_offset =\n            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;\n        auto points_offset = points + thread_idx * num_features;\n        for (int k = 0; k < num_features; k++) {\n          voxels_offset[k] = points_offset[k];\n        }\n        if (coors_pts_pos == 0) {\n          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);\n          auto coors_offset = coors + coors_pos * NDim;\n          auto coors_in_offset = coors_in + coors_idx * NDim;\n          for (int k = 0; k < NDim; k++) {\n            coors_offset[k] = coors_in_offset[k];\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif  // VOXELIZATION_CUDA_KERNEL_CUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#ifndef COMMON_MLU_HELPER_HPP_\n#define COMMON_MLU_HELPER_HPP_\n\n#define NFU_ALIGN_SIZE 128          // Byte\n#define REM_FOR_STACK (128 * 1024)  // 128KB reserved for cncc\n\n#ifdef __BANG_ARCH__\n#define MAX_NRAM_SIZE \\\n  (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK)  // 128KB reserved for cncc\n#define MAX_SRAM_SIZE \\\n  (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK)  // 128KB reserved for cncc\n#else\n#define MAX_NRAM_SIZE (384 * 1024)   // 384KB,  initialization value\n#define MAX_SRAM_SIZE (1920 * 1024)  // 1920KB, initialization value\n#endif\n\n#ifndef PAD_UP\n#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))\n#endif\n\n#ifndef PAD_DOWN\n#define PAD_DOWN(x, y) (((x) / (y)) * (y))\n#endif\n\n#define CEIL_ALIGN(x, y) (((x) + (y) - 1) / (y) * (y))\n\ntemplate <typename scalar_t>\n__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) {\n  return a < b ? a : b;\n}\n\ntemplate <typename scalar_t>\n__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {\n  return a > b ? 
a : b;\n}\n\n/*!\n * @brief Converts int32 to float32 data type.\n *\n * @param[out] dst\n *   Pointer to NRAM that stores float32 type data.\n * @param[in,out] dst_addition\n *   Pointer to NRAM as the workspace of dst, which has the same size as dst.\n *   It allows empty pointer on MLU300 series.\n * @param[in] src\n *   Pointer to NRAM that stores int32 type data.\n * @param[in,out] src_addition\n *   Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.\n *   It allows empty pointer on MLU300 series.\n * @param[in] src_count\n *   The count of elements in src.\n */\n__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,\n                                   float *src_addition, const int src_count) {\n#if __BANG_ARCH__ >= 300\n  __bang_int2float((float *)dst, (int32_t *)src, src_count, 0);\n#else\n  // get sign bit\n  const float move_23bit = 8388608.0;\n  // 0x80000000 = 1,000000000,0000000000000000000000000000\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x80000000);\n  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,\n                    src_count * sizeof(float), NFU_ALIGN_SIZE);\n  // get 1 or 0 from sign bit\n  // judge whether it is odd\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x00000001);\n  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,\n                   (char *)src_addition, src_count * sizeof(float),\n                   NFU_ALIGN_SIZE);\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x80000001);\n  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,\n                  NFU_ALIGN_SIZE / sizeof(float));\n  // minus xor, positive num invariant\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0xffffffff);\n  __bang_cycle_mul(dst, dst_addition, 
src_addition, src_count,\n                   NFU_ALIGN_SIZE / sizeof(float));\n  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));\n  // convert int32 to float32\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x7fffff);\n  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,\n                    src_count * sizeof(float), NFU_ALIGN_SIZE);\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x4b000000);\n  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,\n                   src_count * sizeof(float), NFU_ALIGN_SIZE);\n  __bang_sub_scalar(dst, dst, move_23bit, src_count);\n  // add one\n  __bang_add(dst, dst, dst_addition, src_count);\n  // set sign for float32\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0xffffffff);\n  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,\n                   NFU_ALIGN_SIZE / sizeof(float));\n\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x00000001);\n  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,\n                   NFU_ALIGN_SIZE / sizeof(float));\n\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0x80000000);\n  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,\n                    (char *)src_addition, src_count * 4, 128);\n  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);\n#endif  // __BANG_ARCH__ >= 300\n}\n\n/*!\n * @brief Converts float32 to int32 data type with to_zero round mode.\n *\n * @param[out] dst\n *   Pointer to NRAM that stores int32 type data.\n * @param[in,out] dst_addition\n *   Pointer to NRAM as the workspace of dst, which has the same size as dst.\n *   It allows empty pointer on MLU300 series.\n * 
@param[in] src\n *   Pointer to NRAM that stores float32 type data.\n * @param[in,out] src_addition\n *   Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.\n *   It allows empty pointer on MLU300 series.\n * @param[in] src_count\n *   The count of elements in src.\n */\n__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,\n                                   float *src_addition, const int src_count) {\n#if __BANG_ARCH__ >= 300\n  __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0);\n#else\n  // sign ===> src_addition\n  // dst=-1.0 : when src[i] is a negative number\n  // dst=+1.0 : when src[i] is a positive number\n  const int floatDchar = sizeof(float) / sizeof(char);\n  __bang_active_sign((float *)dst, src, src_count);\n  // dst_addition = abs(src)\n  __bang_mul(dst_addition, src, (float *)dst, src_count);\n  // if dst_addition < 1.0 , then src_addition + 1, to fix add error.\n  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     1.0f);\n  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,\n                  NFU_ALIGN_SIZE / sizeof(float));\n  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     0xbf800000);\n  // set negative flag -1.0 = 0xbf800000\n  __bang_cycle_eq(\n      (float *)dst, (float *)dst, (float *)src_addition, src_count,\n      NFU_ALIGN_SIZE / sizeof(float));  //  to mark all src in [x<-1.0]\n  __bang_active_abs(dst_addition, src, src_count);\n  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     8388608.0f);\n  // mask shift move 23\n  __bang_cycle_add_tz(\n      dst_addition, dst_addition, src_addition, src_count,\n      NFU_ALIGN_SIZE / sizeof(float));  // right shift move 23bit\n  // two's complement for negative\n  // dst=1.0 , when src <-1.0\n  // 
dst=0.0 , when src >=-1.0\n  __bang_sub(dst_addition, dst_addition, (float *)dst, src_count);\n  // to fix max value\n  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0x4b7fffff <=> 16777215.0,\n  // means max value.\n  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);\n  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,\n              src_count * floatDchar);\n  // get low 23bit\n  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),\n                     (unsigned)0x007fffff);\n  // mask low 23bit is 1\n  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,\n                    (char *)src_addition, src_count * floatDchar,\n                    NFU_ALIGN_SIZE / sizeof(char));\n  // set 9 high bit ===> dst\n  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000\n  //  1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000\n  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);\n  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,\n                   NFU_ALIGN_SIZE / sizeof(float));\n  // src or dst_addition\n  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,\n             src_count * floatDchar);\n  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);\n  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,\n             src_count * floatDchar);\n#endif  // __BANG_ARCH__ >= 300\n}\n\n/*!\n * @brief Converts float32 to half data type,\n * the rounding mode on MLU200 is rd, on MLU300 is rn.\n *\n * @param[out] dst\n *   Pointer to NRAM that stores half type data.\n * @param[in] src\n *   Pointer to NRAM that stores float32 type data.\n * @param[in] src_count\n *   The count of elements in src.\n */\n__mlu_func__ inline void convertFloat2half(half *dst, float *src,\n                                           int src_count) {\n#if __BANG_ARCH__ >= 300\n  __bang_float2half_rn(dst, src, 
src_count);\n#else\n  __bang_float2half_rd(dst, src, src_count);\n#endif\n}\n\n/*!\n * @brief recursiveSumPool.\n * @param[in,out] dst\n *     Pointer to NRAM that stores the input and output data.\n * @param[in] low_dim\n *     Which is the number of low dim.\n * @param[in] high_dim\n *     Which is the number of high dim.\n * @param[in] kernel_limit\n *     Which is the high_dim of sumpool per time.\n ******************************************************************************/\ntemplate <typename T>\n__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim,\n                                   int kernel_limit) {\n  for (; high_dim > 1;) {\n    int repeat_s = high_dim / kernel_limit;\n    int remain_s = high_dim % kernel_limit;\n\n    if (remain_s) {\n      __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1,\n                     1);\n    }\n    if (repeat_s) {\n      __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0),\n                     (T *)dst + remain_s * low_dim, low_dim,\n                     kernel_limit * repeat_s, 1, kernel_limit, 1, 1,\n                     kernel_limit);\n    }\n    high_dim = repeat_s + (bool)remain_s;\n  }\n  return;\n}\n\n#endif  // COMMON_MLU_HELPER_HPP_\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"common_mlu_helper.hpp\"\n\n__nram__ char nram_buffer[MAX_NRAM_SIZE];\n\ntemplate <typename T>\n__mlu_func__ void MLUUnion1MaskedIm2colForward(\n    const T *feature, const int height, const int width, const int channels,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt,\n    T *data_col) {\n  for (int index = taskId; index < mask_cnt; index += taskDim) {\n    const int h_col = mask_h_idx[index];\n    const int w_col = mask_w_idx[index];\n    const int h_offset = h_col - pad_h;\n    const int w_offset = w_col - pad_w;\n    int h_start = h_offset;\n    int h_end = h_offset + kernel_h - 1;\n    int w_start = w_offset;\n    int w_end = w_start + kernel_w - 1;\n    if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) {\n      continue;\n    } else {\n      int h_start_valid = max(0, h_start);\n      int h_end_valid = min(height - 1, h_end);\n      int w_start_valid = max(0, w_start);\n      int w_end_valid = min(width - 1, w_end);\n      __memcpy(\n          data_col + index * kernel_h * kernel_w * channels +\n              ((h_start_valid - h_start) * kernel_w +\n               (w_start_valid - w_start)) *\n                  channels,\n          
feature + h_start_valid * width * channels + w_start_valid * channels,\n          (w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM,\n          kernel_w * channels * sizeof(T), width * channels * sizeof(T),\n          h_end_valid - h_start_valid);\n    }\n  }\n}\n\ntemplate <typename T>\n__mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height,\n                                               const int width,\n                                               const int channels,\n                                               const int32_t *mask_h_idx,\n                                               const int32_t *mask_w_idx,\n                                               const int mask_cnt, T *im) {\n  const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);\n  if (channels <= channels_max_num_nram) {\n    const int deal_num = channels_max_num_nram / channels;\n    int mask_per_core = mask_cnt / taskDim;\n    const int mask_remain = mask_cnt % taskDim;\n    mask_per_core += taskId < mask_remain ? 1 : 0;\n    int index_start = taskId < mask_remain\n                          ? 
taskId * mask_per_core\n                          : taskId * mask_per_core + mask_remain;\n    int loop = mask_per_core / deal_num;\n    int remain_num = mask_per_core % deal_num;\n    T *nram_col = (T *)nram_buffer;\n    for (int index = 0; index < loop; ++index) {\n      int cur_index = index_start + index * deal_num;\n      __memcpy(nram_col, col + cur_index * channels,\n               deal_num * channels * sizeof(T), GDRAM2NRAM);\n      for (int i = 0; i < deal_num; ++i) {\n        int mask_index = cur_index + i;\n        const int h_im = mask_h_idx[mask_index];\n        const int w_im = mask_w_idx[mask_index];\n        // if(h_im>=height || w_im>=width) continue;\n        __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,\n                 channels * sizeof(T), NRAM2GDRAM);\n      }\n    }\n    if (remain_num > 0) {\n      int cur_index = index_start + loop * deal_num;\n      __memcpy(nram_col, col + cur_index * channels,\n               remain_num * channels * sizeof(T), GDRAM2NRAM);\n      for (int i = 0; i < remain_num; ++i) {\n        int mask_index = cur_index + i;\n        const int h_im = mask_h_idx[mask_index];\n        const int w_im = mask_w_idx[mask_index];\n        // if(h_im>=height || w_im>=width) continue;\n        __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,\n                 channels * sizeof(T), NRAM2GDRAM);\n      }\n    }\n  } else {\n    for (int index = taskId; index < mask_cnt; index += taskDim) {\n      const int m_index = index % mask_cnt;\n      const int h_im = mask_h_idx[m_index];\n      const int w_im = mask_w_idx[m_index];\n      // if(h_im>=height || w_im>=width) continue;\n      __memcpy(im + (h_im * width + w_im) * channels, col + index * channels,\n               channels * sizeof(T), GDRAM2GDRAM);\n    }\n  }\n}\n\n__mlu_global__ void MLUKernelMaskedIm2colForward(\n    const void *feature, const int height, const int width, const int channels,\n    const int kernel_h, const 
int kernel_w, const int pad_h, const int pad_w,\n    const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,\n    void *data_col, const cnrtDataType_t data_dtype) {\n  if (coreId == 0x80) {\n    return;\n  }\n\n  switch (data_dtype) {\n    case CNRT_FLOAT16: {\n      MLUUnion1MaskedIm2colForward((half *)feature, height, width, channels,\n                                   kernel_h, kernel_w, pad_h, pad_w,\n                                   (int32_t *)mask_h_idx, (int32_t *)mask_w_idx,\n                                   mask_cnt, (half *)data_col);\n    }; break;\n    case CNRT_FLOAT32: {\n      MLUUnion1MaskedIm2colForward((float *)feature, height, width, channels,\n                                   kernel_h, kernel_w, pad_h, pad_w,\n                                   (int32_t *)mask_h_idx, (int32_t *)mask_w_idx,\n                                   mask_cnt, (float *)data_col);\n    }; break;\n    default: {\n      break;\n    }\n  }\n}\n\n__mlu_global__ void MLUKernelMaskedCol2imForward(\n    const void *col, const int height, const int width, const int channels,\n    const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,\n    void *im, const cnrtDataType_t data_dtype) {\n  if (coreId == 0x80) {\n    return;\n  }\n  switch (data_dtype) {\n    case CNRT_FLOAT16: {\n      MLUUnion1MaskedCol2imForward((half *)col, height, width, channels,\n                                   (int32_t *)mask_h_idx, (int32_t *)mask_w_idx,\n                                   mask_cnt, (half *)im);\n    }; break;\n    case CNRT_FLOAT32: {\n      MLUUnion1MaskedCol2imForward((float *)col, height, width, channels,\n                                   (int32_t *)mask_h_idx, (int32_t *)mask_w_idx,\n                                   mask_cnt, (float *)im);\n    }; break;\n    default: {\n      break;\n    }\n  }\n}\n\nvoid KernelMaskedIm2colForward(\n    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,\n    cnrtDataType_t k_dtype, const void *im_ptr, 
const int height,\n    const int width, const int channels, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const void *mask_h_idx_ptr,\n    const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) {\n  MLUKernelMaskedIm2colForward<<<k_dim, k_type, queue>>>(\n      im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w,\n      mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype);\n}\n\nvoid KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,\n                               cnrtQueue_t queue, cnrtDataType_t k_dtype,\n                               const void *col_ptr, const int height,\n                               const int width, const int channels,\n                               const void *mask_h_idx_ptr,\n                               const void *mask_w_idx_ptr, const int mask_cnt,\n                               void *im_ptr) {\n  MLUKernelMaskedCol2imForward<<<k_dim, k_type, queue>>>(\n      col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr,\n      mask_cnt, im_ptr, k_dtype);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"common_mlu_helper.hpp\"\n\n#define ALIGN_SIZE 64\n#define PIPELINE_COMMON_NUM 2\n#define PIPELINE_PINGPONG_NUM 10\n\n__nram__ char nram_buffer[MAX_NRAM_SIZE];\n\nnamespace forward {\ntemplate <typename T>\n__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height,\n                                int width, int channels, int p_height,\n                                int p_width, T spatial_scale, int *bin_x1,\n                                int *bin_y1, int *bin_x2, int *bin_y2,\n                                int *bin_wdim, int *bin_hdim, int *bin_dims,\n                                T **input_base, bool *is_empty) {\n  int pw = bin_i % p_width;\n  int ph = (bin_i / p_width) % p_height;\n  int roi_n = bin_i / p_width / p_height;\n\n  /*roi*/\n  const T *roi_info = rois_v + roi_n * 5;  // {{batch, x1, y1, x2, y2},,,}\n  int batch_index = (int)roi_info[0];\n  int roi_x1 = round(roi_info[1] * spatial_scale);\n  int roi_y1 = round(roi_info[2] * spatial_scale);\n  int roi_x2 = round(roi_info[3] * spatial_scale);\n  int roi_y2 = round(roi_info[4] * spatial_scale);\n  int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1;\n  int roi_h = roi_y2 - roi_y1 + 1 > 1 ? 
roi_y2 - roi_y1 + 1 : 1;\n\n  /*bin*/\n  T bin_w = (T)roi_w / (T)p_width;\n  T bin_h = (T)roi_h / (T)p_height;\n\n  *bin_x1 = (int)floor((T)pw * bin_w) + roi_x1;\n  *bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0;\n  *bin_x1 = *bin_x1 < width ? *bin_x1 : width;\n\n  *bin_y1 = (int)floor((T)ph * bin_h) + roi_y1;\n  *bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0;\n  *bin_y1 = *bin_y1 < height ? *bin_y1 : height;\n\n  *bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1;\n  *bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0;\n  *bin_x2 = *bin_x2 < width ? *bin_x2 : width;\n\n  *bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1;\n  *bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0;\n  *bin_y2 = *bin_y2 < height ? *bin_y2 : height;\n\n  *input_base = input_v + batch_index * height * width * channels;\n  *bin_wdim = *bin_x2 - *bin_x1;\n  *bin_hdim = *bin_y2 - *bin_y1;\n  *bin_dims = (*bin_hdim) * (*bin_wdim);\n  *is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1);\n}\n\ntemplate <typename T>\n__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,\n                                   int channels, int height, int width,\n                                   int p_height, int p_width, int rois_num,\n                                   T spatial_scale, T *output_v, int *argmax) {\n  /*\n   * NRAM partition\n   *  |---------------------------------------------------|\n   *  |                        ping                       |\n   *  |---------------------------------------------------|\n   *  |                        pong                       |\n   *  |---------------------------------------------------|\n   *  |                        out                        |\n   *  |---------------------------------------------------|\n   *  |                        argmax                     |\n   *  |---------------------------------------------------|\n   *  |                        a                          |\n   *  |---------------------------------------------------|\n   *  |                        b                  
        |\n   *  |---------------------------------------------------|\n   */\n  uint32_t is_half = sizeof(T) == sizeof(half) ? true : false;\n  uint32_t t_size = sizeof(T);\n  uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float);\n  uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half);\n\n  uint32_t channels_align = PAD_UP(channels, float_div);\n  uint32_t nram_limit = PAD_DOWN(\n      (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div);\n\n  // nram PING/PONG, output, argamx, a, b\n  float *nram_ping = (float *)nram_buffer;\n  float *nram_pong = (float *)nram_buffer + nram_limit;\n  float *nram_out = (float *)nram_buffer + 2 * nram_limit;\n  float *nram_argmax = nram_out + channels_align;\n  float *nram_a = nram_out + 2 * channels_align;\n  float *nram_b = nram_out + 3 * channels_align;\n\n  uint32_t c_bins_num = rois_num * p_height * p_width;\n  uint32_t task_bins = c_bins_num / taskDim;\n  uint32_t rem_bins = c_bins_num % taskDim;\n  if (taskId < rem_bins) {\n    task_bins += 1;\n  }\n  int bin_first =\n      (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId);\n  int bins_loop = bin_first + task_bins;\n\n  T *input_base = NULL;\n  T *output_base = output_v + bin_first * channels;\n  int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL;\n  int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims;\n  int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims;\n  bool is_empty = false;\n  bool pong_is_empty = false;\n  bool is_first_bin = true;\n  uint32_t src_offset = 0;\n  uint32_t dst_offset = 0;\n  uint32_t nram_offset = 0;\n  uint32_t half_offset =\n      is_half ? 
(nram_limit / 2 / half_div * half_div) * 2 : 0;\n  float *nram_tmp = NULL;\n\n  uint32_t c_slice = 0;\n  uint32_t c_slice_align = 0;\n  uint32_t pongc_slice = 0;\n  uint32_t pongc_slice_align = 0;\n  for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) {\n    getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels,\n                  p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1,\n                  &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims,\n                  &input_base, &is_empty);\n    uint32_t c_rem = channels;\n    c_slice = nram_limit / bin_dims / float_div * float_div;\n\n    if (is_first_bin && !is_empty) {\n      c_slice = c_slice > c_rem ? c_rem : c_slice;\n      c_slice_align = PAD_UP(c_slice, float_div);\n      for (int h = bin_y1; h < bin_y2; h++) {\n        src_offset = (h * width + bin_x1) * channels;\n        nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset;\n        if (c_slice_align == channels) {\n          __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,\n                   bin_wdim * c_slice * t_size, GDRAM2NRAM);\n        } else {\n          __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,\n                   c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size,\n                   channels * t_size, bin_wdim - 1);\n        }\n      }\n    }\n    uint32_t c_offset = 0;\n    while (c_rem > 0) {\n      c_slice = c_slice > c_rem ? c_rem : c_slice;\n      c_slice_align = PAD_UP(c_slice, float_div);\n\n      /*__memcpy_async*/\n      if (c_rem - c_slice > 0 && !is_empty) {\n        pongc_slice = c_rem - c_slice > c_slice ? 
c_slice : c_rem - c_slice;\n        pongc_slice_align = PAD_UP(pongc_slice, float_div);\n        for (int h = bin_y1; h < bin_y2; h++) {\n          src_offset = (h * width + bin_x1) * channels + c_offset;\n          nram_offset =\n              (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset;\n          __memcpy_async((T *)nram_pong + nram_offset,\n                         (T *)input_base + src_offset + c_slice,\n                         pongc_slice * t_size, GDRAM2NRAM,\n                         pongc_slice_align * t_size, channels * t_size,\n                         bin_wdim - 1);\n        }\n      } else if (bin_i + 1 < bins_loop) {\n        getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width,\n                      channels, p_height, p_width, (T)spatial_scale, &pbin_x1,\n                      &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim,\n                      &pbin_dims, &input_base, &pong_is_empty);\n        pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div);\n        pongc_slice = pongc_slice > channels ? 
channels : pongc_slice;\n        pongc_slice_align = PAD_UP(pongc_slice, float_div);\n        if (!pong_is_empty) {\n          for (int h = pbin_y1; h < pbin_y2; h++) {\n            src_offset = (h * width + pbin_x1) * channels;\n            nram_offset =\n                (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset;\n            if (pongc_slice_align == channels) {\n              __memcpy_async((T *)nram_pong + nram_offset,\n                             (T *)input_base + src_offset,\n                             pbin_wdim * pongc_slice * t_size, GDRAM2NRAM);\n            } else {\n              __memcpy_async((T *)nram_pong + nram_offset,\n                             (T *)input_base + src_offset, pongc_slice * t_size,\n                             GDRAM2NRAM, pongc_slice_align * t_size,\n                             channels * t_size, pbin_wdim - 1);\n            }\n          }\n        }\n      }\n\n      if (is_empty) {\n        __bang_write_value((T *)nram_out, c_slice_align, (T)0);\n        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,\n                 c_slice * t_size, NRAM2GDRAM);\n        if (NULL != argmax) {\n          __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));\n          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,\n                   (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);\n        }\n      } else {\n        if (is_half) {\n          uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div);\n          __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset,\n                            bin_align64);\n        }\n        __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align,\n                       bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1);\n        if (is_half) {\n          uint32_t c_align64 = PAD_UP(c_slice_align, half_div);\n          __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64);\n        
}\n        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,\n                 c_slice * t_size, NRAM2GDRAM);\n        if (NULL != argmax) {\n          /*compute max_index*/\n          __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping,\n                               c_slice_align, bin_hdim, bin_wdim, bin_hdim,\n                               bin_wdim, 1, 1);\n          convertInt2Float((float *)nram_argmax, (float *)nram_a,\n                           (int32_t *)nram_out, (float *)nram_b, c_slice_align);\n\n          /*compute input_h*/\n          for (int i = 0; i < c_slice; i++) {\n            nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);\n          }\n          __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,\n                            c_slice_align);\n          __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,\n                            c_slice_align);\n\n          /*compute input_w*/\n          __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,\n                            c_slice_align);\n          __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,\n                     c_slice_align);\n          __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,\n                            c_slice_align);\n          __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,\n                     c_slice_align);\n          convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,\n                           (float *)nram_out, (float *)nram_b, c_slice_align);\n          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,\n                   (int32_t *)nram_argmax, c_slice * sizeof(int32_t),\n                   NRAM2GDRAM);\n        }\n      }\n      nram_tmp = nram_ping;\n      nram_ping = nram_pong;\n      nram_pong = nram_tmp;\n      c_offset += c_slice;\n      c_rem -= c_slice;\n      __asm__ volatile(\"sync;\");\n    }\n   
 dst_offset += channels;\n    is_first_bin = false;\n  }\n}\n\n__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,\n                                     const void *input_data,\n                                     const void *input_rois, int batch,\n                                     int channels, int height, int width,\n                                     int pooled_height, int pooled_width,\n                                     int rois_num, float spatial_scale,\n                                     void *output_data, int *argmax) {\n  switch (data_type) {\n    case CNRT_FLOAT16: {\n      MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels,\n                       height, width, pooled_height, pooled_width, rois_num,\n                       (half)spatial_scale, (half *)output_data, argmax);\n    }; break;\n    case CNRT_FLOAT32: {\n      MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch,\n                       channels, height, width, pooled_height, pooled_width,\n                       rois_num, (float)spatial_scale, (float *)output_data,\n                       argmax);\n    }; break;\n    default: { break; }\n  }\n}\n}  // namespace forward\n\nnamespace backward {\n// Convert index of argmax from global grads_image to local bin in RoI. 
Vector\n// operations do not support int type, so conversion from int to float is\n// performed here.\n__mlu_func__ void convertIndex(\n    int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1,\n    int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int,\n    int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w,\n    int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w,\n    float *nram_atomic_add, float *nram_grads_image, int width, int height,\n    int wstart, int hstart, int w_compute, int h_compute, int align_c,\n    int channels, int loop_flag, int loop_id, int true_limit) {\n  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,\n                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);\n\n  // This step uses scalar division, because the above vector division causes\n  // rounding accuracy problem.\n  for (int i = 0; i < channels; ++i) {\n    *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width;\n  }\n\n  // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width'\n  // operation.\n  convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1,\n                   (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2,\n                   align_c);\n  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,\n                   (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2,\n                   align_c);\n\n  // Perform 'temp_result - hstart' operation\n  __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,\n                    align_c);\n\n  // Perform 'temp_result1 - temp_result2 * width' operation\n  __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,\n                    align_c);\n  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,\n                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);\n  __bang_sub((float *)nram_argmax_fp_w, (float 
*)nram_argmax_fp,\n             (float *)nram_argmax_fp_w, align_c);\n\n  // Perform 'temp_result - wstart' operation\n  __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,\n                    wstart, align_c);\n\n  // Perform 'temp_result = h * w_compute + w' operation\n  __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,\n                    w_compute, align_c);\n  __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,\n             (float *)nram_argmax_fp_w, align_c);\n\n  if (loop_flag == 1) {\n    __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,\n                      (loop_id * true_limit), align_c);\n  }\n  convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,\n                   (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,\n                   align_c);\n}\n\ntemplate <typename T>\n__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,\n                                   const int32_t *argmax, T *grads_image,\n                                   int channels, int height, int width,\n                                   int pooled_height, int pooled_width,\n                                   int rois_num, const T spatial_scale,\n                                   int high_precision) {\n  // Calculate the number of rois processed by each core\n  int bin_num = rois_num * pooled_height * pooled_width;\n  int loop =\n      (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim);\n  int tid = taskId * loop;\n  if (bin_num % taskDim != 0) {\n    if (tid >= bin_num) {\n      return;\n    } else {\n      // last part is (bin_num - tid).\n      loop = bin_num - tid < loop ? 
bin_num - tid : loop;\n    }\n  }\n  int align_c = PAD_UP(channels, ALIGN_SIZE);\n  // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM.\n  int data_size =\n      PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c -\n                 (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) /\n                2),\n               ALIGN_SIZE);\n  int hw_limit = data_size / align_c;\n  float *nram_grads = (float *)nram_buffer;\n  for (int idx = tid; idx < tid + loop; ++idx) {\n    // (n, ph, pw) is a C in the pooled output\n    int pw = idx % pooled_width;\n    int ph = (idx / pooled_width) % pooled_height;\n    int n = idx / pooled_width / pooled_height;\n\n    const T *offset_rois = (const T *)(rois + n * 5);\n    int roi_batch_ind = int(offset_rois[0]);\n    // Calculate the roi region on feature maps\n    int roi_start_w = round(offset_rois[1] * spatial_scale);\n    int roi_start_h = round(offset_rois[2] * spatial_scale);\n    int roi_end_w = round(offset_rois[3] * spatial_scale);\n    int roi_end_h = round(offset_rois[4] * spatial_scale);\n    // Force malformed rois to 1x1\n    int roi_width =\n        roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1;\n    int roi_height =\n        roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1;\n    T bin_size_h = (T)roi_height / (T)pooled_height;\n    T bin_size_w = (T)roi_width / (T)pooled_width;\n\n    // The corresponding bin region\n    int hstart = int(floor((T)ph * bin_size_h));\n    int wstart = int(floor((T)pw * bin_size_w));\n    int hend = int(ceil((T)(ph + 1) * bin_size_h));\n    int wend = int(ceil((T)(pw + 1) * bin_size_w));\n\n    // Add roi offsets and clip to input boundaries, min(max(A, B), C);\n    hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0;\n    hstart = hstart < height ? hstart : height;\n    hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0;\n    hend = hend < height ? 
hend : height;\n    wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0;\n    wstart = wstart < width ? wstart : width;\n    wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0;\n    wend = wend < width ? wend : width;\n\n    bool is_empty = (hend <= hstart) || (wend <= wstart);\n    if (!is_empty) {\n      int h_compute = hend - hstart;\n      int w_compute = wend - wstart;\n      int true_limit =\n          hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute;\n      int loop_int = (h_compute * w_compute) / true_limit;\n      int rem = (h_compute * w_compute) % true_limit;\n      int32_t *nram_argmax = (int32_t *)nram_grads + align_c;\n      int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c;\n      int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;\n      int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;\n      int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;\n      int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;\n      int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;\n      int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;\n      int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;\n      float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c;\n      float *nram_grads_image = (float *)nram_atomic_add + align_c;\n      if (true_limit == h_compute * w_compute) {\n        /*\n         * NRAM partition\n         *  |---------------------------------------------------|\n         *  |                     grads                         |\n         *  |---------------------------------------------------|\n         *  |                     argmax                        |\n         *  |---------------------------------------------------|\n         *  |                     argmax_temp                   |\n         *  |---------------------------------------------------|\n         *  |                     
atomic_add                    |\n         *  |---------------------------------------------------|\n         *  |                     grads_image                   |\n         *  |---------------------------------------------------|\n         */\n\n        // Load the data from GDRAM to NRAM.\n        __memcpy(\n            (T *)nram_grads + align_c * high_precision,\n            (const T *)grads +\n                (n * pooled_height * pooled_width + ph * pooled_width + pw) *\n                    channels,\n            channels * sizeof(T), GDRAM2NRAM);\n        if (high_precision) {\n          __bang_half2float((float *)nram_grads,\n                            (half *)nram_grads + align_c * high_precision,\n                            align_c);\n        }\n\n        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +\n                                             (n * pooled_height * pooled_width +\n                                              ph * pooled_width + pw) *\n                                                 channels,\n                 channels * sizeof(int32_t), GDRAM2NRAM);\n\n        // Perform pooling operation on NRAM.\n        convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,\n                     nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,\n                     nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,\n                     nram_atomic_add, nram_grads_image, width, height, wstart,\n                     hstart, w_compute, h_compute, align_c, channels, 0, 0, 0);\n        __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,\n                          (int32_t *)nram_argmax_int, align_c, h_compute,\n                          w_compute, h_compute, w_compute, h_compute,\n                          w_compute);\n        if (high_precision) {\n          __bang_float2half_rd((half *)nram_grads_image,\n                               (float *)nram_grads_image,\n                               h_compute * 
w_compute * align_c);\n        }\n\n        // Store the result on NRAM back to GDRAM.\n        for (int hc = 0; hc < h_compute; ++hc) {\n          for (int wc = 0; wc < w_compute; ++wc) {\n            T *dst = (T *)nram_atomic_add;\n            int grad_image_offset = (roi_batch_ind * height * width +\n                                     (hc + hstart) * width + wc + wstart) *\n                                    channels;\n            T *src1 = (T *)grads_image + grad_image_offset;\n            int nram_grads_image_offset = (hc * w_compute + wc) * align_c;\n            T *src2 = (T *)nram_grads_image + nram_grads_image_offset;\n            __bang_atomic_add(dst, src1, src2, channels);\n          }\n        }\n      } else if (true_limit > 0) {\n        /*\n         * NRAM partition\n         *  |---------------------------------------------------|\n         *  |                     grads                         |\n         *  |---------------------------------------------------|\n         *  |                     argmax                        |\n         *  |--------------------ping_pong----------------------|\n         *  |       argmax_temp      |       argmax_temp        |\n         *  |------------------------|--------------------------|\n         *  |       atomic_add       |       atomic_add         |\n         *  |------------------------|--------------------------|\n         *  |       grads_image      |       grads_image        |\n         *  |---------------------------------------------------|\n         */\n\n        // Load the data from GDRAM to NRAM.\n        __memcpy(\n            (T *)nram_grads + align_c * high_precision,\n            (const T *)grads +\n                (n * pooled_height * pooled_width + ph * pooled_width + pw) *\n                    channels,\n            channels * sizeof(T), GDRAM2NRAM);\n        if (high_precision) {\n          __bang_half2float((float *)nram_grads,\n                            (half *)nram_grads + align_c * 
high_precision,\n                            align_c);\n        }\n        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +\n                                             (n * pooled_height * pooled_width +\n                                              ph * pooled_width + pw) *\n                                                 channels,\n                 channels * sizeof(int32_t), GDRAM2NRAM);\n\n        int ping_pong = 0;\n        int ping_pong_offset =\n            (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2;\n        for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {\n          int size = (loop_id == loop_int) ? rem : true_limit;\n          if (size == 0) {\n            break;\n          }\n          // Perform pooling operation on NRAM.\n          nram_argmax_fp =\n              (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset;\n          nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;\n          nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;\n          nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;\n          nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;\n          nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;\n          nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;\n          nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;\n          nram_atomic_add = (float *)nram_argmax_fp_w + align_c;\n          nram_grads_image = (float *)nram_atomic_add + align_c;\n          int loop_id_1 = loop_id;\n          int size_1 = ((loop_id_1) == loop_int) ? 
rem : true_limit;\n          if (size_1 == 0) {\n            break;\n          }\n          convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,\n                       nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,\n                       nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,\n                       nram_atomic_add, nram_grads_image, width, height, wstart,\n                       hstart, w_compute, h_compute, align_c, channels, 1,\n                       loop_id_1, true_limit);\n          __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,\n                            (int32_t *)nram_argmax_int, align_c, size_1, 1,\n                            size_1, 1, size_1, 1);\n          if (high_precision) {\n            __bang_float2half_rd((half *)nram_grads_image,\n                                 (float *)nram_grads_image, size_1 * align_c);\n          }\n\n          // Store the result on NRAM back to GDRAM.\n          for (int index_size = 0; index_size < size; ++index_size) {\n            int h = (loop_id * true_limit + index_size) / w_compute;\n            int w = (loop_id * true_limit + index_size) % w_compute;\n            T *dst = (T *)nram_atomic_add;\n            T *grads_image_n =\n                (T *)grads_image + roi_batch_ind * height * width * channels;\n            T *src1 = (T *)grads_image_n +\n                      ((h + hstart) * width + (w + wstart)) * channels;\n            T *src2 = (T *)nram_grads_image + index_size * align_c;\n            __bang_atomic_add(dst, src1, src2, channels);\n          }\n          ping_pong = 1 - ping_pong;\n        }\n      } else {\n        /*\n         * NRAM partition\n         *  |---------------------------------------------------|\n         *  |                     grads                         |\n         *  |---------------------------------------------------|\n         *  |                     argmax                        |\n         *  
|--------------------ping_pong----------------------|\n         *  |       argmax_temp      |       argmax_temp        |\n         *  |------------------------|--------------------------|\n         *  |       atomic_add       |       atomic_add         |\n         *  |------------------------|--------------------------|\n         *  |       grads_image      |       grads_image        |\n         *  |---------------------------------------------------|\n         */\n\n        int c_limit =\n            PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) /\n                         (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2),\n                     ALIGN_SIZE);\n        int loop_int = channels / c_limit;\n        int rem = channels % c_limit;\n        int ping_pong = 0;\n        int ping_pong_offset =\n            (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2;\n        for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {\n          int size = (loop_id == loop_int) ? rem : c_limit;\n          if (size == 0) {\n            break;\n          }\n          nram_argmax_fp =\n              (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset;\n          nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit;\n          nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit;\n          nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit;\n          nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit;\n          nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit;\n          nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit;\n          nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit;\n          nram_atomic_add = (float *)nram_argmax_fp_w + c_limit;\n          nram_grads_image = (float *)nram_atomic_add + c_limit;\n\n          // This pipeline loads the data from GDRAM to NRAM.\n          __memcpy((T *)nram_grads + c_limit * high_precision,\n                   (const T *)grads +\n                       n * 
pooled_height * pooled_width * channels +\n                       ph * pooled_width * channels + pw * channels +\n                       loop_id * c_limit,\n                   size * sizeof(T), GDRAM2NRAM);\n          if (high_precision) {\n            __bang_half2float((float *)nram_grads,\n                              (half *)nram_grads + c_limit * high_precision,\n                              c_limit);\n          }\n          __memcpy((int32_t *)nram_argmax,\n                   (const int32_t *)argmax +\n                       n * pooled_height * pooled_width * channels +\n                       ph * pooled_width * channels + pw * channels +\n                       loop_id * c_limit,\n                   size * sizeof(int32_t), GDRAM2NRAM);\n\n          for (int hc = 0; hc < h_compute; ++hc) {\n            for (int wc = 0; wc < w_compute; ++wc) {\n              // This pipeline performs pooling operation on NRAM.\n              convertIndex(\n                  nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,\n                  nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,\n                  nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,\n                  nram_atomic_add, nram_grads_image, width, height, wstart + wc,\n                  hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0);\n              __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,\n                                (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1,\n                                1, 1);\n              if (high_precision) {\n                __bang_float2half_rd((half *)nram_grads_image,\n                                     (float *)nram_grads_image, c_limit);\n              }\n              // This pipeline stores the result on NRAM back to GDRAM.\n              T *dst = (T *)nram_atomic_add;\n              T *grads_image_n =\n                  (T *)grads_image + roi_batch_ind * height * width * channels;\n              T *src1 = (T 
*)grads_image_n +\n                        ((hc + hstart) * width + (wc + wstart)) * channels +\n                        loop_id * c_limit;\n              T *src2 = (T *)nram_grads_image;\n              __bang_atomic_add(dst, src1, src2, size);\n            }\n          }\n          ping_pong = 1 - ping_pong;\n        }\n      }\n    }\n  }\n}\n\n__mlu_global__ void MLUKernelRoiPoolBackward(\n    const void *grads, const void *rois, const int *argmax, void *grads_image,\n    int rois_num, int pooled_height, int pooled_width, int channels, int no,\n    int height, int width, const float spatial_scale,\n    const cnrtDataType_t k_dtype) {\n  // make sure that memcore is not used\n  if (coreId == 0x80) {\n    return;\n  }\n  switch (k_dtype) {\n    case CNRT_FLOAT16: {\n      // Using the float type '__bang_max_pool_bp' instruction to increase the\n      // bit width.\n      const int high_precision = 1;\n      MLUUnion1Roipool((const half *)rois, (const half *)grads,\n                       (const int32_t *)argmax, (half *)grads_image, channels,\n                       height, width, pooled_height, pooled_width, rois_num,\n                       (const half)spatial_scale, high_precision);\n    }; break;\n    case CNRT_FLOAT32: {\n      const int high_precision = 0;\n      MLUUnion1Roipool((const float *)rois, (const float *)grads,\n                       (const int32_t *)argmax, (float *)grads_image, channels,\n                       height, width, pooled_height, pooled_width, rois_num,\n                       (const float)spatial_scale, high_precision);\n    }; break;\n    default: { break; }\n  }\n}\n}  // namespace backward\n\nvoid KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,\n                          cnrtQueue_t queue, cnrtDataType_t data_type,\n                          const void *input_data, const void *input_rois,\n                          const int batch, const int channels, const int height,\n                          const int width, 
const int pooled_height,\n                          const int pooled_width, const int rois_num,\n                          const float spatial_scale, void *output_data,\n                          int *argmax) {\n  forward::MLUKernelRoiPool<<<k_dim, k_type, queue>>>(\n      data_type, input_data, input_rois, batch, channels, height, width,\n      pooled_height, pooled_width, rois_num, spatial_scale, output_data,\n      argmax);\n}\n\nvoid KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,\n                           cnrtQueue_t queue, cnrtDataType_t k_dtype,\n                           const void *grad_output_ptr, const void *rois_ptr,\n                           const int *argmax_ptr, void *grad_input_ptr,\n                           const int box_num, const int pooled_height,\n                           const int pooled_width, const int channels,\n                           const int batch, const int height, const int width,\n                           const float spatial_scale) {\n  backward::MLUKernelRoiPoolBackward<<<k_dim, k_type, queue>>>(\n      grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num,\n      pooled_height, pooled_width, channels, batch, height, width,\n      spatial_scale, k_dtype);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mps/MPSDevice.h",
    "content": "//  Copyright © 2022 Apple Inc.\n\n// This file is modify from:\n// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h\n\n#pragma once\n#include <ATen/ATen.h>\n#include <c10/macros/Macros.h>\n#include <c10/util/Exception.h>\n\n#ifdef __OBJC__\n#include <Foundation/Foundation.h>\n#include <Metal/Metal.h>\n#include <MetalPerformanceShaders/MetalPerformanceShaders.h>\ntypedef id<MTLDevice> MTLDevice_t;\n#else\ntypedef void* MTLDevice;\ntypedef void* MTLDevice_t;\n#endif\n\nusing namespace std;\n\nnamespace at {\nnamespace mps {\n\n//-----------------------------------------------------------------\n//  MPSDevice\n//\n// MPSDevice is a singleton class that returns the default device\n//-----------------------------------------------------------------\n\nclass TORCH_API MPSDevice {\n public:\n  /**\n   * MPSDevice should not be cloneable.\n   */\n  MPSDevice(MPSDevice& other) = delete;\n  /**\n   * MPSDevice should not be assignable.\n   */\n  void operator=(const MPSDevice&) = delete;\n  /**\n   * Gets single instance of the Device.\n   */\n  static MPSDevice* getInstance();\n  /**\n   * Returns the single device.\n   */\n  MTLDevice_t device() { return _mtl_device; }\n\n  ~MPSDevice();\n\n private:\n  static MPSDevice* _device;\n  MTLDevice_t _mtl_device;\n  MPSDevice();\n};\n\nTORCH_API bool is_available();\n\nTORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);\n\n}  // namespace mps\n}  // namespace at\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mps/MPSLibrary.h",
    "content": "#ifndef _MPS_LIBRARY_H_\n#define _MPS_LIBRARY_H_\n\n#include <string>\n#include <unordered_map>\n\n#ifdef __OBJC__\n#include <Foundation/Foundation.h>\n#include <Metal/Metal.h>\n#include <MetalPerformanceShaders/MetalPerformanceShaders.h>\n\ntypedef id<MTLComputePipelineState> MTLComputePipelineState_t;\ntypedef id<MTLLibrary> MTLLibrary_t;\n#else\ntypedef void* MTLComputePipelineState;\ntypedef void* MTLComputePipelineState_t;\ntypedef void* MTLLibrary;\ntypedef void* MTLLibrary_t;\n#endif\n\nclass MPSLibrary {\n public:\n  // disable constructor for singleton\n  static MPSLibrary* createFromUrl(const std::string& library_url);\n  static MPSLibrary* createFromSource(const std::string& source);\n  ~MPSLibrary();\n\n  MTLLibrary_t library() { return _library; }\n\n  MTLComputePipelineState_t getComputePipelineState(\n      const std::string& function_name);\n\n private:\n  MTLLibrary_t _library;\n  std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;\n};\n\nclass MPSLibraryManager {\n public:\n  // disable constructor for singleton\n  MPSLibraryManager(const MPSLibraryManager&) = delete;\n  MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;\n  MPSLibraryManager(MPSLibraryManager&&) = delete;\n  MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;\n\n  static MPSLibraryManager* getInstance();\n\n  bool hasLibrary(const std::string& name);\n\n  MPSLibrary* getLibrary(const std::string& library_url);\n\n  MPSLibrary* createLibraryFromSouce(const std::string& name,\n                                     const std::string& sources);\n\n  ~MPSLibraryManager();\n\n private:\n  MPSLibraryManager();\n  std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;\n};\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mps/MPSLibrary.mm",
    "content": "#include \"MPSLibrary.h\"\n#include \"MPSDevice.h\"\n\nstatic std::unique_ptr<MPSLibraryManager> mps_library_manager=nullptr;\n\nMPSLibraryManager* MPSLibraryManager::getInstance() {\n  if(!mps_library_manager)\n    mps_library_manager = std::unique_ptr<MPSLibraryManager>(new MPSLibraryManager());\n  return mps_library_manager.get();\n}\n\nMPSLibraryManager::~MPSLibraryManager() {}\n\nMPSLibraryManager::MPSLibraryManager() {}\n\nbool MPSLibraryManager::hasLibrary(const std::string& name) {\n  return _library_map.find(name) != _library_map.end();\n}\n\nMPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {\n  if (_library_map.find(library_url) != _library_map.end()) {\n    return _library_map[library_url].get();\n  }\n  _library_map.emplace(std::make_pair(\n      library_url, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url))));\n  return _library_map[library_url].get();\n}\n\nMPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,\n                                                      const std::string& source) {\n  NSString* ns_name = [NSString stringWithCString:name.c_str()];\n  if (_library_map.find(name) != _library_map.end()) {\n    NSLog(@\"Library %@ already exist.\", ns_name);\n    return nullptr;\n  }\n\n  _library_map.emplace(\n      std::make_pair(name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));\n  return _library_map[name].get();\n}\n\nMPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {\n  MPSLibrary* library = new MPSLibrary();\n  @autoreleasepool {\n    NSError* error = nil;\n\n    // load library and func\n    NSString* utl_str = [NSString stringWithCString:library_url.c_str()];\n    NSURL* metal_url = [NSURL fileURLWithPath:utl_str];\n    library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url\n                                                                                 error:&error];\n   
 if (library->_library == nil) {\n      NSLog(@\"Failed to find library, error %@.\", error);\n      exit(1);\n    }\n  }\n\n  return library;\n}\n\nMPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {\n  MPSLibrary* library = new MPSLibrary();\n  @autoreleasepool {\n    NSError* error = nil;\n\n    // load library and func\n    NSString* code_str = [NSString stringWithCString:sources.c_str()];\n    library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str\n                                                                                  options:nil\n                                                                                    error:&error];\n    if (library->_library == nil) {\n      NSLog(@\"Failed to find library, error %@.\", error);\n      exit(1);\n    }\n  }\n\n  return library;\n}\n\nMPSLibrary::~MPSLibrary() {\n  [_library release];\n  _library = nil;\n}\n\nMTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {\n  if (_pso_map.find(function_name) != _pso_map.end()) {\n    return _pso_map[function_name];\n  }\n\n  MTLComputePipelineState_t pso;\n  @autoreleasepool {\n    NSError* error = nil;\n\n    // create function\n    NSString* function_name_str = [NSString stringWithCString:function_name.c_str()];\n    id<MTLFunction> func = [_library newFunctionWithName:function_name_str];\n    if (func == nil) {\n      NSLog(@\"Failed to created pipeline state object, error %@.\", error);\n      exit(1);\n    }\n    // create pipeline\n    pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func\n                                                                                     error:&error];\n    _pso_map.emplace(std::make_pair(function_name, pso));\n  }\n  return _pso_map[function_name];\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mps/MPSStream.h",
    "content": "//  Copyright © 2022 Apple Inc.\n\n// This file is modify from:\n// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h\n\n#pragma once\n\n#include <cstdint>\n#include <utility>\n\n#include <c10/core/DeviceGuard.h>\n#include <c10/core/Stream.h>\n#include <c10/util/Exception.h>\n#include \"MPSDevice.h\"\n\n#ifdef __OBJC__\n#include <Foundation/Foundation.h>\n#include <Metal/Metal.h>\n#include <MetalPerformanceShaders/MetalPerformanceShaders.h>\n#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>\ntypedef id<MTLCommandQueue> MTLCommandQueue_t;\ntypedef id<MTLCommandBuffer> MTLCommandBuffer_t;\ntypedef id<MTLSharedEvent> MTLSharedEvent_t;\ntypedef id<MTLDevice> MTLDevice_t;\n#else\ntypedef void* MTLCommandQueue_t;\ntypedef void* MTLCommandQueue;\ntypedef void* MTLCommandBuffer_t;\ntypedef void* MTLCommandBuffer;\ntypedef void* MTLSharedEvent_t;\ntypedef void* dispatch_queue_t;\ntypedef void* MTLDevice_t;\n#define nil NULL;\n#endif\n\nnamespace at {\nnamespace mps {\n\n//-----------------------------------------------------------------\n//  MPSStream\n//-----------------------------------------------------------------\n\nclass TORCH_API MPSStream {\n public:\n  enum Unchecked { UNCHECKED };\n  /// Construct a MPSStream from a Stream.  
This construction is checked,\n  /// and will raise an error if the Stream is not, in fact, a MPS stream.\n  explicit MPSStream(Stream stream);\n\n  ~MPSStream();\n  MTLCommandQueue_t commandQueue() const { return _commandQueue; };\n  dispatch_queue_t queue() const { return _serialQueue; }\n\n  MTLCommandBuffer_t commandBuffer();\n  void commit(bool flush);\n  void commitAndWait();\n  void synchronize();\n\n  void flush();\n\n  /// Get the MPS device index that this stream is associated with.\n  c10::DeviceIndex device_index() const { return _stream.device_index(); }\n\n  MTLCommandQueue_t stream() const { return _commandQueue; };\n\n  MTLDevice_t device() const { return [_commandQueue device]; }\n\n  /// Explicit conversion to Stream.\n  Stream unwrap() const { return _stream; }\n\n private:\n  Stream _stream;\n  MTLCommandQueue_t _commandQueue = nil;\n  MTLCommandBuffer_t _commandBuffer = nil;\n  void _flush(bool commitAndWait) const;\n\n  dispatch_queue_t _serialQueue = nullptr;\n};\n\n/**\n * Get the current MPS stream\n */\nTORCH_API MPSStream* getCurrentMPSStream();\n\n/**\n * Get the default MPS stream\n */\nTORCH_API MPSStream* getDefaultMPSStream();\n\n//-----------------------------------------------------------------\n//  MPSStreamImpl\n//-----------------------------------------------------------------\n\nclass TORCH_API MPSStreamImpl {\n public:\n  /**\n   * Gets single instance of the MPSStream.\n   */\n  static MPSStream* getInstance();\n\n private:\n  static MPSStream* _stream;\n  MPSStreamImpl();\n};\n\n//-----------------------------------------------------------------\n//  MPSEvent\n//-----------------------------------------------------------------\n\nstruct TORCH_API MPSEvent {\n  MPSEvent();\n  // MPSEvent(id<MTLDevice> device);\n\n  ~MPSEvent();\n  MTLSharedEvent_t event() const { return _event; }\n\n  void recordEvent(MPSStream* stream);\n  void waitForEvent(MPSStream* queue);  // waits on the cpu\n  bool queryEvent();\n  uint64_t 
getCurrentValue() { return _currentValue; }\n  void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }\n\n private:\n  bool _isRecorded = false;\n  uint64_t _currentValue = 0;\n  MTLSharedEvent_t _event;\n};\n\ntypedef MPSEvent* mpsEvent_t;\n\n}  // namespace mps\n}  // namespace at\n"
  },
  {
    "path": "mmcv/ops/csrc/common/mps/MPSUtils.h",
    "content": "#ifndef _MPS_UTILS_H_\n#define _MPS_UTILS_H_\n#include <torch/extension.h>\n#ifdef __OBJC__\n#include <Foundation/Foundation.h>\n#include <Metal/Metal.h>\n#include <MetalPerformanceShaders/MetalPerformanceShaders.h>\n\ntypedef id<MTLBuffer> MTLBuffer_t;\ntypedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;\n#else\ntypedef void* MTLBuffer;\ntypedef void* MTLBuffer_t;\ntypedef void* MTLComputeCommandEncoder;\ntypedef void* MTLComputeCommandEncoder_t;\n#endif\n\n// utils\nstatic inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {\n  return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());\n}\n\ntemplate <typename T,\n          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>\nvoid setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);\n\ntemplate <typename T,\n          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>\nvoid setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {\n  [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];\n}\n\ntemplate <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>\nvoid setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {\n  [encoder setBytes:&t length:sizeof(t) atIndex:index];\n}\n\ninline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}\n\ntemplate <typename T, typename... Args>\nvoid setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {\n  setMTLArg(encoder, index, std::forward<T>(t));\n  setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);\n}\n\ntemplate <typename... Args>\nvoid setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {\n  [encoder setComputePipelineState:pso];\n  setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);\n}\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/active_rotated_filter_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu\n#ifndef ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH\n#define ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH\n\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename scalar_t>\n__global__ void active_rotated_filter_forward_musa_kernel(\n    const int nthreads, const scalar_t* weight_data, const int* indices_data,\n    const int num_input_planes, const int num_output_planes,\n    const int num_orientations, const int num_rotations, const int nEntry,\n    scalar_t* output_data) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int l = index % nEntry;\n    int j = (index / nEntry) % num_input_planes;\n    int i = index / nEntry / num_input_planes;\n    int k;\n    scalar_t val = *(weight_data + index);\n    for (k = 0; k < num_rotations; k++) {\n      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;\n      scalar_t* target = output_data +\n                         i * (num_rotations * num_input_planes * nEntry) +\n                         k * (num_input_planes * nEntry) + j * (nEntry) + idx;\n      *target = val;\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void active_rotated_filter_backward_musa_kernel(\n    const int nthreads, const scalar_t* gradWeight_data,\n    const int* indices_data, const int num_input_planes,\n    const int num_output_planes, const int num_orientations,\n    const int num_rotations, const int nEntry, scalar_t* weight_data) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int l = index % nEntry;\n    int j = (index / nEntry) % num_input_planes;\n    int i = index / nEntry / num_input_planes;\n    int k;\n    scalar_t* val = weight_data + index;\n    *val = 0;\n    scalar_t tmp = 0;\n    for (k = 0; k < num_rotations; k++) {\n      int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;\n      scalar_t target =\n          *(gradWeight_data 
+ i * (num_rotations * num_input_planes * nEntry) +\n            k * (num_input_planes * nEntry) + j * (nEntry) + idx);\n      tmp = tmp + target;\n    }\n    *val = tmp;\n  }\n}\n#endif  // ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/assign_score_withk_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH\n#define ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH\n\n\n#include \"pytorch_musa_helper.hpp\"\n\n// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)\n// output: fout(B,O,N)\n// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) =  s(b,i,k,m)*p(b,i(k),m,j)\n//       i(k) = idx(b,i,k)\n//      sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)\n//      avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k\n//      max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))\n\ntemplate <typename T>\n__global__ void assign_score_withk_forward_musa_kernel(\n    const int B, const int N0, const int N1, const int M, const int K,\n    const int O, const int aggregate, const T* points, const T* centers,\n    const T* scores, const int64_t* knn_idx, T* output) {\n  // ----- parallel loop for B, N1, K and O ---------\n  MUSA_1D_KERNEL_LOOP(i, B * O * N1 * K) {\n    // ------- loop for M ----------\n    const int b = (int)(i / (O * N1 * K));\n    const int o = (int)(i % (O * N1 * K) / (N1 * K));\n    const int n = (int)(i % (N1 * K) / K);\n    const int k = (int)(i % K);\n    const int cn = (int)knn_idx[b * K * N1 + n * K +\n                                0];  // The first neighbor is the center point\n    const int kn = (int)knn_idx[b * K * N1 + n * K + k];\n    if (kn >= N0 ||\n        kn < 0) {  // if index overflows, it is out of the neighborhood range\n      return;\n    }\n    assert(b < B);\n    assert(kn < N0);\n    assert(cn < N0);\n    assert(o < O);\n    assert(n < N1);\n    const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;\n    T val = output[out_idx];\n    for (int m = 0; m < M; m++) {\n      val += points[b * N0 * M * O + kn * M * O + m * O + o] *\n                 scores[b * N1 * K * M + n * K * M + k * M + m] -\n             centers[b * N0 * M * O + cn * M * O + m * O + o] *\n                 scores[b * N1 * K * M + n * K 
* M + k * M + m];\n    }\n    output[out_idx] = val;\n  }\n}\n\ntemplate <typename T>\n__global__ void assign_score_withk_points_backward_musa_kernel(\n    const int B, const int N0, const int N, const int M, const int K,\n    const int O, const int aggregate, const T* grad_out, const T* scores,\n    const int64_t* knn_idx, T* grad_points, T* grad_centers) {\n  // ----- parallel loop for B, M, O ---------\n  MUSA_1D_KERNEL_LOOP(i, B * M * O) {\n    int b = (int)(i / (M * O));\n    int m = (int)(i % (M * O) / O);\n    int o = (int)(i % O);\n\n    // ----- loop for N,K ---------\n    for (int n = 0; n < N; n++) {\n      for (int k = 0; k < K; k++) {\n        int kn = knn_idx[b * N * K + n * K + k];\n        int cn = knn_idx[b * N * K + n * K + 0];\n        if (kn >= N0 || kn < 0) {  // if index overflows, it is out of the\n                                   // neighborhood range\n          continue;\n        }\n        atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,\n                  scores[b * N * K * M + n * K * M + k * M + m] *\n                      grad_out[b * O * N * K + o * N * K + n * K + k]);\n        atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,\n                  -scores[b * N * K * M + n * K * M + k * M + m] *\n                      grad_out[b * O * N * K + o * N * K + n * K + k]);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void assign_score_withk_scores_backward_musa_kernel(\n    const int B, const int N0, const int N, const int M, const int K,\n    const int O, const int aggregate, const T* grad_out, const T* points,\n    const T* centers, const int64_t* knn_idx, T* grad_scores) {\n  // ----- parallel loop for B, N, K, M ---------\n  MUSA_1D_KERNEL_LOOP(i, B * N * K * M) {\n    const int b = (int)(i / (N * M * K));\n    const int n = (int)(i % (N * M * K) / M / K);\n    const int k = (int)(i % (M * K) / M);\n    const int m = (int)(i % M);\n    const int cn = knn_idx[b * N * K + n * K + 0];\n    
const int kn = knn_idx[b * N * K + n * K + k];\n    if (kn >= N0 ||\n        kn < 0) {  // if index overflows, it is out of the neighborhood range\n      return;\n    }\n\n    // -------------- loop for O ------------------------\n    const int out_idx = b * N * K * M + n * K * M + k * M + m;\n    T val = grad_scores[out_idx];\n    for (int o = 0; o < O; o++) {\n      val += (points[b * N0 * M * O + kn * M * O + m * O + o] -\n              centers[b * N0 * M * O + cn * M * O + m * O + o]) *\n             grad_out[b * O * N * K + o * N * K + n * K + k];\n    }\n    grad_scores[out_idx] = val;\n  }\n}\n\n#endif  // ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/ball_query_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n#ifndef BALL_QUERY_MUSA_KERNEL_MUH\n#define BALL_QUERY_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void ball_query_forward_musa_kernel(int b, int n, int m,\n                                               float min_radius,\n                                               float max_radius, int nsample,\n                                               const T* new_xyz, const T* xyz,\n                                               int* idx) {\n  // new_xyz: (B, M, 3)\n  // xyz: (B, N, 3)\n  // output:\n  //      idx: (B, M, nsample)\n  int bs_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b) return;\n\n    new_xyz += bs_idx * m * 3 + pt_idx * 3;\n    xyz += bs_idx * n * 3;\n    idx += bs_idx * m * nsample + pt_idx * nsample;\n\n    float max_radius2 = max_radius * max_radius;\n    float min_radius2 = min_radius * min_radius;\n    T new_x = new_xyz[0];\n    T new_y = new_xyz[1];\n    T new_z = new_xyz[2];\n\n    int cnt = 0;\n    for (int k = 0; k < n; ++k) {\n      T x = xyz[k * 3 + 0];\n      T y = xyz[k * 3 + 1];\n      T z = xyz[k * 3 + 2];\n      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +\n             (new_z - z) * (new_z - z);\n      if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {\n        if (cnt == 0) {\n          for (int l = 0; l < nsample; ++l) {\n            idx[l] = k;\n          }\n        }\n        idx[cnt] = k;\n        ++cnt;\n        if (cnt >= nsample) break;\n      }\n    }\n  }\n}\n\n#endif  // BALL_QUERY_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/bbox_overlaps_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef BBOX_OVERLAPS_MUSA_KERNEL_MUH\n#define BBOX_OVERLAPS_MUSA_KERNEL_MUH\n\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,\n                                          T& y1, T& x2, T& y2) {\n  x1 = bbox[base];\n  y1 = bbox[base + 1];\n  x2 = bbox[base + 2];\n  y2 = bbox[base + 3];\n}\n\ntemplate <>\n__device__ __forceinline__ void load_bbox<float>(const float* bbox,\n                                                 const int base, float& x1,\n                                                 float& y1, float& x2,\n                                                 float& y2) {\n  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];\n  x1 = bbox_offset.x;\n  y1 = bbox_offset.y;\n  x2 = bbox_offset.z;\n  y2 = bbox_offset.w;\n}\n\ntemplate <typename T>\n__global__ void bbox_overlaps_musa_kernel(const T* bbox1, const T* bbox2,\n                                          T* ious, const int num_bbox1,\n                                          const int num_bbox2, const int mode,\n                                          const bool aligned,\n                                          const int offset) {\n  if (aligned) {\n    MUSA_1D_KERNEL_LOOP(index, num_bbox1) {\n      const int b1 = index;\n      const int b2 = index;\n\n      const int base1 = b1 << 2;  // b1 * 4\n      T b1_x1, b1_y1, b1_x2, b1_y2;\n      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);\n      const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);\n\n      const int base2 = b2 << 2;  // b2 * 4\n      T b2_x1, b2_y1, b2_x2, b2_y2;\n      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);\n      const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);\n\n      const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);\n      const T top = fmaxf(b1_y1, b2_y1), bottom = 
fminf(b1_y2, b2_y2);\n      const T width = fmaxf(right - left + offset, 0.f);\n      const T height = fmaxf(bottom - top + offset, 0.f);\n      const T interS = width * height;\n\n      const T baseS =\n          fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));\n      ious[index] = interS / baseS;\n    }\n  } else {\n    MUSA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {\n      const int b1 = index / num_bbox2;\n      const int b2 = index % num_bbox2;\n\n      const int base1 = b1 << 2;  // b1 * 4\n      T b1_x1, b1_y1, b1_x2, b1_y2;\n      load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);\n      const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);\n\n      const int base2 = b2 << 2;  // b2 * 4\n      T b2_x1, b2_y1, b2_x2, b2_y2;\n      load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);\n      const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);\n\n      const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);\n      const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);\n      const T width = fmaxf(right - left + offset, 0.f);\n      const T height = fmaxf(bottom - top + offset, 0.f);\n      const T interS = width * height;\n\n      const T baseS =\n          fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));\n      ious[index] = interS / baseS;\n    }\n  }\n}\n\n__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,\n                                              const __half x2, const __half y2,\n                                              const __half offset) {\n  const __half half_w = __hadd(__hsub(x2, x1), offset);\n  const __half half_h = __hadd(__hsub(y2, y1), offset);\n  return __hmul(half_w, half_h);\n}\n\n__device__ __forceinline__ __half __half_max(const __half a, const __half b) {\n  return __hge(a, b) ? a : b;\n}\n\n__device__ __forceinline__ __half __half_min(const __half a, const __half b) {\n  return __hle(a, b) ? 
a : b;\n}\n\n// fp16 won't provide much increase when aligned==true. It is useful when\n// aligned==false, which would give you ~40% bonus.\n__device__ void bbox_overlaps_musa_kernel_half(\n    const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,\n    const int num_bbox2, const int mode, const bool aligned, const int offset) {\n  const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;\n  const __half h_offset = __int2half_rn(offset);\n  MUSA_1D_KERNEL_LOOP(index, num_output) {\n    const int b1 = aligned ? index : index / num_bbox2;\n    const int b2 = aligned ? index : index % num_bbox2;\n\n    const int base1 = b1 << 2;\n    __half b1_x1, b1_y1, b1_x2, b1_y2;\n    load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);\n    const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);\n\n    const int base2 = b2 << 2;\n    __half b2_x1, b2_y1, b2_x2, b2_y2;\n    load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);\n    const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);\n\n    const __half left = __half_max(b1_x1, b2_x1),\n                 right = __half_min(b1_x2, b2_x2);\n    const __half top = __half_max(b1_y1, b2_y1),\n                 bottom = __half_min(b1_y2, b2_y2);\n    const __half width =\n        __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));\n    const __half height =\n        __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));\n    const __half interS = __hmul(width, height);\n\n    const __half baseS = __half_max(\n        mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,\n        h_offset);\n    ious[index] = __hdiv(interS, baseS);\n  }\n}\n\n#endif  // BBOX_OVERLAPS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/bezier_align_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu\n#ifndef BEZIER_ALIGN_MUSA_KERNEL_MUH\n#define BEZIER_ALIGN_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3,\n                          const T u) {\n  return ((1. - u) * (1. - u) * (1. - u) * p0 +\n          3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 +\n          u * u * u * p3);\n}\n\ntemplate <typename T>\n__global__ void bezier_align_forward_musa_kernel(\n    const int nthreads,\n    const T *bottom_data,  // inputs\n    const T *bottom_rois,  // bottom rois contains the bezier curve\n    T *top_data,           // outputs\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int sampling_ratio, bool aligned, const int channels,\n    const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    // beziers have size Nx(1+8*2) = Nx17\n    const T *offset_bottom_rois = bottom_rois + n * 17;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n\n    // TODO: avoid this by using parallel annotation, for good\n    T p0_x = offset_bottom_rois[1] * spatial_scale;\n    T p0_y = offset_bottom_rois[2] * spatial_scale;\n    T p1_x = offset_bottom_rois[3] * spatial_scale;\n    T p1_y = offset_bottom_rois[4] * spatial_scale;\n    T p2_x = offset_bottom_rois[5] * spatial_scale;\n    T p2_y = offset_bottom_rois[6] * spatial_scale;\n    T p3_x = offset_bottom_rois[7] * spatial_scale;\n    T p3_y = offset_bottom_rois[8] * spatial_scale;\n    T p4_x = offset_bottom_rois[15] * spatial_scale;\n    T p4_y = offset_bottom_rois[16] * spatial_scale;\n    T p5_x = offset_bottom_rois[13] * spatial_scale;\n    T p5_y = offset_bottom_rois[14] * spatial_scale;\n    T p6_x = offset_bottom_rois[11] * spatial_scale;\n    T p6_y = offset_bottom_rois[12] * spatial_scale;\n    T p7_x = offset_bottom_rois[9] * spatial_scale;\n    T p7_y = offset_bottom_rois[10] * spatial_scale;\n\n    // compute the coords\n    const T u = pw / static_cast<T>(pooled_width);\n    const T v = ph / static_cast<T>(pooled_height);\n    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);\n    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);\n    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);\n    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);\n    const T x_center = x1 * v + x0 * (1. - v) - offset;\n    const T y_center = y1 * v + y0 * (1. 
- v) - offset;\n\n    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));\n    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));\n    if (!aligned) {  // for backward-compatibility only\n      roi_width = max(roi_width, (T)1.);\n      roi_height = max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    const T *offset_bottom_data =\n        bottom_data + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceil(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    // When the grid is empty, output zeros == 0/1, instead of NaN.\n    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    T output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1\n    {\n      const T y = y_center - (T)0.5 * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = x_center - (T)0.5 * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,\n                                     index);\n        output_val += val;\n      }\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename T>\n__global__ void bezier_align_backward_musa_kernel(\n    const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int sampling_ratio, bool aligned, const int channels,\n    const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    // beziers have size Nx(1+8*2) = Nx17\n    const T *offset_bottom_rois = bottom_rois + n * 17;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    T p0_x = offset_bottom_rois[1] * spatial_scale;\n    T p0_y = offset_bottom_rois[2] * spatial_scale;\n    T p1_x = offset_bottom_rois[3] * spatial_scale;\n    T p1_y = offset_bottom_rois[4] * spatial_scale;\n    T p2_x = offset_bottom_rois[5] * spatial_scale;\n    T p2_y = offset_bottom_rois[6] * spatial_scale;\n    T p3_x = offset_bottom_rois[7] * spatial_scale;\n    T p3_y = offset_bottom_rois[8] * spatial_scale;\n    T p4_x = offset_bottom_rois[15] * spatial_scale;\n    T p4_y = offset_bottom_rois[16] * spatial_scale;\n    T p5_x = offset_bottom_rois[13] * spatial_scale;\n    T p5_y = offset_bottom_rois[14] * spatial_scale;\n    T p6_x = offset_bottom_rois[11] * spatial_scale;\n    T p6_y = offset_bottom_rois[12] * spatial_scale;\n    T p7_x = offset_bottom_rois[9] * spatial_scale;\n    T p7_y = offset_bottom_rois[10] * spatial_scale;\n\n    // compute the coords\n    const T u = pw / static_cast<T>(pooled_width);\n    const T v = ph / static_cast<T>(pooled_height);\n    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);\n    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);\n    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);\n    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);\n    const T x_center = x1 * v + x0 * (1. - v) - offset;\n    const T y_center = y1 * v + y0 * (1. 
- v) - offset;\n\n    T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x));\n    T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y));\n    if (!aligned) {  // for backward-compatibility only\n      roi_width = max(roi_width, (T)1.);\n      roi_height = max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    T *offset_bottom_diff =\n        bottom_diff + (roi_batch_ind * channels + c) * height * width;\n\n    int top_offset = (n * channels + c) * pooled_height * pooled_width;\n    const T *offset_top_diff = top_diff + top_offset;\n    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceil(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++)  // e.g., iy = 0, 1\n    {\n      const T y = y_center - (T)0.5 * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = x_center - (T)0.5 * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        T g1 = top_diff_this_bin * w1 / count;\n        T g2 = top_diff_this_bin * w2 / count;\n        T g3 = top_diff_this_bin * w3 / count;\n        T g4 = top_diff_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_bottom_diff + y_low * width + x_low,\n                    static_cast<T>(g1));\n          atomicAdd(offset_bottom_diff + y_low * width + x_high,\n                    static_cast<T>(g2));\n          atomicAdd(offset_bottom_diff + y_high * width + x_low,\n                    static_cast<T>(g3));\n          atomicAdd(offset_bottom_diff + y_high * width + x_high,\n                    static_cast<T>(g4));\n        }  // if\n      }    // ix\n    }      // iy\n  }        // MUSA_1D_KERNEL_LOOP\n}  // BezierAlignBackward\n\n#endif  // BEZIER_ALIGN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/border_align_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.\n// the main difference: (1) use `argmax_idx` for fast computing of gradient\n// during the backward. (2) `wh` is directly computed by `boxes`, rather than\n// passing it as argument to forward or backward functions.\n\n#ifndef BORDER_ALIGN_MUSA_KERNEL_MUH\n#define BORDER_ALIGN_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\nenum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };\n\n/*** Forward ***/\ntemplate <typename T>\n__global__ void border_align_forward_musa_kernel(\n    const int nthreads, const T* input, const T* boxes, T* output,\n    int* argmax_idx, const int channels, const int box_size, const int height,\n    const int width, const int pool_size) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (batch_idx, c_idx, box_idx) is an element paralleled for computing\n    // output, and `extreme_idx` is in range [0,3]\n    int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx;\n    const T *offset_box, *offset_input, *offset_box_x;\n    T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y,\n        val, maxval;\n\n    extreme_idx = threadIdx.y;\n    // shape (N, C, box_size, 4) for output\n    batch_idx = index / channels / box_size;\n    // shape (N, box_size, 4) for boxes\n    box_idx = index % box_size + batch_idx * box_size;\n    c_idx = (index / box_size) % channels;\n\n    offset_box = boxes + box_idx * 4;\n    box_width = *(offset_box + 2) - *offset_box;\n    box_height = *(offset_box + 3) - *(offset_box + 1);\n    offset_output = output + index * 4 + extreme_idx;\n    offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;\n    // shape (N, 4C, h, w) for input.\n    // [0,C) for top feature, [C,2C) for left feature,\n    // [2C,3C) for bottom feature, [3C,4C) for right 
feature\n    offset_input =\n        input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) *\n                    height * width;\n\n    // extreme_idx in [0,1] -> offset_box_x indexed at x1\n    // extreme_idx in [2,3] -> offset_box_x indexed at x2\n    offset_box_x = offset_box + extreme_idx / 2 * 2;\n\n    // (x1,y1) or (x2,y2) for (x,y)\n    x = *offset_box_x;\n    y = *(offset_box_x + 1);\n\n    switch (extreme_idx) {\n      // top\n      case BorderMode::Top:\n        stride = box_width / pool_size;\n        x_stride = stride;\n        y_stride = 0;\n        break;\n      // left\n      case BorderMode::Left:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = stride;\n        break;\n      // bottom\n      case BorderMode::Bottom:\n        stride = box_width / pool_size;\n        x_stride = -stride;\n        y_stride = 0;\n        break;\n      // right\n      case BorderMode::Right:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = -stride;\n        break;\n    }\n\n    // initialize maxval and maxidx with the start position (e.g. 
(x1,y1) or\n    // (x2,y2))\n    maxval = bilinear_interpolate(offset_input, height, width, y, x, index);\n    maxidx = 0;\n\n    // do max_pool along the border\n    for (int i = 1; i <= pool_size; i++) {\n      x += x_stride;\n      y += y_stride;\n      val = bilinear_interpolate(offset_input, height, width, y, x, index);\n      if (val > maxval) {\n        maxval = val;\n        maxidx = i;\n      }\n    }\n\n    // update output and argmax_idx\n    *offset_output = maxval;\n    *offset_argmax_idx = maxidx;\n  }\n}\n\n/*** Backward ***/\ntemplate <typename T>\n__global__ void border_align_backward_musa_kernel(\n    const int nthreads, const T* grad_output, const T* boxes,\n    const int* argmax_idx, T* grad_input, const int channels,\n    const int box_size, const int height, const int width,\n    const int pool_size) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (batch_idx, c_idx, box_idx) is an element paralleled for computing\n    // output, and `extreme_idx` is in range [0,3]\n    int batch_idx, c_idx, box_idx, extreme_idx;\n    const int* offset_argmax_idx;\n    const T *offset_grad_output, *offset_box, *offset_box_x;\n    T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,\n        y;\n\n    extreme_idx = threadIdx.y;\n    batch_idx = index / channels / box_size;\n    box_idx = index % box_size + batch_idx * box_size;\n    c_idx = (index / box_size) % channels;\n\n    offset_box = boxes + box_idx * 4;\n    box_width = *(offset_box + 2) - *offset_box;\n    box_height = *(offset_box + 3) - *(offset_box + 1);\n    offset_grad_output = grad_output + index * 4 + extreme_idx;\n    offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;\n    // [0,C) for top feature grad, [C,2C) for left feature grad,\n    // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad\n    offset_grad_input = grad_input + (batch_idx * channels * 4 +\n                                      extreme_idx * channels + c_idx) *\n                      
                   height * width;\n\n    // extreme_idx in [0,1] -> offset_box_x indexed at x1\n    // extreme_idx in [2,3] -> offset_box_x indexed at x2\n    offset_box_x = offset_box + extreme_idx / 2 * 2;\n\n    switch (extreme_idx) {\n      // top\n      case BorderMode::Top:\n        stride = box_width / pool_size;\n        x_stride = stride;\n        y_stride = 0;\n        break;\n      // left\n      case BorderMode::Left:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = stride;\n        break;\n      // bottom\n      case BorderMode::Bottom:\n        stride = box_width / pool_size;\n        x_stride = -stride;\n        y_stride = 0;\n        break;\n      // right\n      case BorderMode::Right:\n        stride = box_height / pool_size;\n        x_stride = 0;\n        y_stride = -stride;\n        break;\n    }\n\n    // get position (x,y) which has maximum value during forward\n    x = *offset_box_x;\n    y = *(offset_box_x + 1);\n    x += x_stride * (T)(*offset_argmax_idx);\n    y += y_stride * (T)(*offset_argmax_idx);\n\n    T w1, w2, w3, w4;\n    int x_low, x_high, y_low, y_high;\n    bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,\n                                  x_high, y_low, y_high, index);\n\n    // update grad_output\n    atomicAdd(offset_grad_input + y_low * width + x_low,\n              *offset_grad_output * w1);\n    atomicAdd(offset_grad_input + y_low * width + x_high,\n              *offset_grad_output * w2);\n    atomicAdd(offset_grad_input + y_high * width + x_low,\n              *offset_grad_output * w3);\n    atomicAdd(offset_grad_input + y_high * width + x_high,\n              *offset_grad_output * w4);\n  }\n}\n\n#endif  // BORDER_ALIGN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/box_iou_quadri_musa.muh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#ifndef BOX_IOU_QUADRI_MUSA_MUH\n#define BOX_IOU_QUADRI_MUSA_MUH\n\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"box_iou_rotated_utils.hpp\"\n\n// 2D block with 32 * 16 = 512 threads per block\nconst int BLOCK_DIM_X = 32;\nconst int BLOCK_DIM_Y = 16;\n\ninline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }\n\ntemplate <typename T>\n__global__ void box_iou_quadri_musa_kernel(\n    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,\n    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {\n  if (aligned) {\n    MUSA_1D_KERNEL_LOOP(index, n_boxes1) {\n      int b1 = index;\n      int b2 = index;\n\n      int base1 = b1 * 8;\n\n      float block_boxes1[8];\n      float block_boxes2[8];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n      block_boxes1[5] = dev_boxes1[base1 + 5];\n      block_boxes1[6] = dev_boxes1[base1 + 6];\n      block_boxes1[7] = dev_boxes1[base1 + 7];\n\n      int base2 = b2 * 8;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n      block_boxes2[5] = dev_boxes2[base2 + 5];\n      block_boxes2[6] = dev_boxes2[base2 + 6];\n      block_boxes2[7] = dev_boxes2[base2 + 7];\n\n      dev_ious[index] =\n          single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  } else {\n    MUSA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {\n      int b1 = index / n_boxes2;\n      int b2 = index % n_boxes2;\n\n      int base1 = b1 * 8;\n\n      float block_boxes1[8];\n      float block_boxes2[8];\n\n      
block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n      block_boxes1[5] = dev_boxes1[base1 + 5];\n      block_boxes1[6] = dev_boxes1[base1 + 6];\n      block_boxes1[7] = dev_boxes1[base1 + 7];\n\n      int base2 = b2 * 8;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n      block_boxes2[5] = dev_boxes2[base2 + 5];\n      block_boxes2[6] = dev_boxes2[base2 + 6];\n      block_boxes2[7] = dev_boxes2[base2 + 7];\n\n      dev_ious[index] =\n          single_box_iou_quadri<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/box_iou_rotated_musa.muh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu\n#ifndef BOX_IOU_ROTATED_MUSA_MUH\n#define BOX_IOU_ROTATED_MUSA_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"box_iou_rotated_utils.hpp\"\n\n// 2D block with 32 * 16 = 512 threads per block\nconst int BLOCK_DIM_X = 32;\nconst int BLOCK_DIM_Y = 16;\n\ninline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }\n\ntemplate <typename T>\n__global__ void box_iou_rotated_musa_kernel(\n    const int n_boxes1, const int n_boxes2, const T* dev_boxes1,\n    const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {\n  if (aligned) {\n    MUSA_1D_KERNEL_LOOP(index, n_boxes1) {\n      int b1 = index;\n      int b2 = index;\n\n      int base1 = b1 * 5;\n\n      float block_boxes1[5];\n      float block_boxes2[5];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n\n      int base2 = b2 * 5;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n\n      dev_ious[index] =\n          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  } else {\n    MUSA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {\n      int b1 = index / n_boxes2;\n      int b2 = index % n_boxes2;\n\n      int base1 = b1 * 5;\n\n      float block_boxes1[5];\n      float block_boxes2[5];\n\n      block_boxes1[0] = dev_boxes1[base1 + 0];\n      block_boxes1[1] = dev_boxes1[base1 + 1];\n      block_boxes1[2] = dev_boxes1[base1 + 2];\n      
block_boxes1[3] = dev_boxes1[base1 + 3];\n      block_boxes1[4] = dev_boxes1[base1 + 4];\n\n      int base2 = b2 * 5;\n\n      block_boxes2[0] = dev_boxes2[base2 + 0];\n      block_boxes2[1] = dev_boxes2[base2 + 1];\n      block_boxes2[2] = dev_boxes2[base2 + 2];\n      block_boxes2[3] = dev_boxes2[base2 + 3];\n      block_boxes2[4] = dev_boxes2[base2 + 4];\n\n      dev_ious[index] =\n          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/carafe_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CARAFE_MUSA_KERNEL_MUH\n#define CARAFE_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n#ifdef MMCV_WITH_HIP\n#define WARP_SIZE 64\n#else\n#define WARP_SIZE 32\n#endif\n#define THREADS_PER_PIXEL 32\n#define MAX_SHARED_MEMORY 49152\n#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144\n#define MAXIMIZE_KERNEL_SIZE true\n#define kTileDim 32\n#define kBlockRows 8\n#define FULL_MASK 0xffffffff\n\ninline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }\n\n__device__ inline int Loc2Index(const int n, const int c, const int h,\n                                const int w, const int channel_num,\n                                const int height, const int width) {\n  int index = w + (h + (c + n * channel_num) * height) * width;\n  return index;\n}\n#ifndef MMCV_WITH_HIP\n/* TODO: move this to a common place */\ntemplate <typename scalar_t>\n__device__ inline scalar_t min(scalar_t a, scalar_t b) {\n  return a < b ? a : b;\n}\n\ntemplate <typename scalar_t>\n__device__ inline scalar_t max(scalar_t a, scalar_t b) {\n  return a > b ? 
a : b;\n}\n#endif\ntemplate <typename scalar_t>\n__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {\n  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)\n#ifdef MMCV_WITH_HIP\n    val += __shfl_down(val, offset);\n#else\n    val += __shfl_down_sync(FULL_MASK, val, offset);\n#endif\n  return val;\n}\n\ntemplate <>\n__device__ __forceinline__ phalf warpReduceSum(phalf val) {\n  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)\n#ifdef MMCV_WITH_HIP\n    // Using PyTorch's macro for half support\n    __PHALF(val) += WARP_SHFL_DOWN(val, offset);\n#else\n    __PHALF(val) +=\n        __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset);\n#endif\n  return val;\n}\n\n// Splits the original matrix into submatrices with size 32 * 32.\n// Each block transposes one submatrix by loading it into shared memory.\n// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/\ntemplate <typename scalar_t>\n__global__ void BatchTranspose2DMUSAKernel(const int N, const int H,\n                                           const int W, const int dh,\n                                           const int dw,\n                                           const scalar_t *__restrict__ X,\n                                           scalar_t *__restrict__ Y) {\n  __shared__ scalar_t tile[kTileDim][kTileDim + 1];\n  const int n = blockIdx.x / (dh * dw);\n  const int k = blockIdx.x % (dh * dw);\n  const int r = k / dw;\n  const int c = k % dw;\n  const int offset = n * H * W;\n  int x = c * kTileDim + threadIdx.x;\n  int y = r * kTileDim + threadIdx.y;\n  if (x < W) {\n    for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {\n      tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];\n    }\n  }\n  __syncthreads();\n  x = r * kTileDim + threadIdx.x;\n  y = c * kTileDim + threadIdx.y;\n  if (x < H) {\n    for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {\n      Y[offset + (y + i) 
* H + x] = tile[threadIdx.x][threadIdx.y + i];\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void CARAFEForward(\n    const int num_kernels, const scalar_t *__restrict__ bottom_data,\n    const scalar_t *__restrict__ bottom_masks, const int kernel_size,\n    const int group_size, const int scale_factor, const int channels,\n    const int down_height, const int down_width, const int height,\n    const int width, const int mask_channels, scalar_t *__restrict__ top_data) {\n#if MAXIMIZE_KERNEL_SIZE\n  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];\n#else\n  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];\n#endif\n\n\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;\n  const int split_id = threadIdx.x % THREADS_PER_PIXEL;\n  index = index / THREADS_PER_PIXEL;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n\n  const int down_pw = pw / scale_factor;\n  const int down_ph = ph / scale_factor;\n\n  const int start_w = down_pw - (kernel_size - 1) / 2;\n  const int end_w = down_pw + (kernel_size - 1) / 2 + 1;\n  const int start_h = down_ph - (kernel_size - 1) / 2;\n  const int end_h = down_ph + (kernel_size - 1) / 2 + 1;\n  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {\n    int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);\n    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];\n  }\n  __syncthreads();\n\n\n  const int channels_per_group = ceilf(channels / (float)group_size);\n#pragma unroll\n  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {\n    int mask_group = c / channels_per_group;\n    scalar_t output_val = 0;\n#pragma unroll\n    for (int iy = start_h; iy < end_h; iy++) {\n#pragma unroll\n      for (int ix = start_w; ix < end_w; ix++) {\n        if (iy < 0 || iy > down_height - 1 || ix < 
0 || ix > down_width - 1) {\n          continue;\n        }\n        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;\n        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;\n        int feat_index =\n            Loc2Index(n, iy, ix, c, down_height, down_width, channels);\n\n        output_val += bottom_data[feat_index] *\n                      shared_mask[mask_c * WARP_SIZE + pixel_id];\n      }\n    }\n\n    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);\n    top_data[top_index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void CARAFEBackward_Feature(\n    const int num_kernels, const scalar_t *__restrict__ top_diff,\n    const scalar_t *__restrict__ bottom_masks, const int kernel_size,\n    const int group_size, const int scale_factor, const int channels,\n    const int down_height, const int down_width, const int height,\n    const int width, const int mask_channels,\n    scalar_t *__restrict__ bottom_diff) {\n#if MAXIMIZE_KERNEL_SIZE\n  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];\n#else\n  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];\n#endif\n\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n\n  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;\n  const int split_id = threadIdx.x % THREADS_PER_PIXEL;\n  // (n, c, ph, pw) is an element in the bottom_data\n  index = index / THREADS_PER_PIXEL;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n\n  const int start_w = pw - (kernel_size - 1) * scale_factor / 2;\n  const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;\n  const int start_h = ph - (kernel_size - 1) * scale_factor / 2;\n  const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;\n  for (int c = split_id; c < mask_channels; c += 
THREADS_PER_PIXEL) {\n    const int mask_w = (c % kernel_size) * scale_factor;\n    const int mask_h = (c / kernel_size % kernel_size) * scale_factor;\n    const int mask_x = start_w + mask_w;\n    const int mask_y = start_h + mask_h;\n    if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {\n      shared_mask[c * WARP_SIZE + pixel_id] = 0;\n      continue;\n    }\n    const int mask_group = c / (kernel_size * kernel_size);\n    const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;\n    int mask_index =\n        Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);\n    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];\n  }\n  __syncthreads();\n  const int channels_per_group = ceilf(channels / (float)group_size);\n#pragma unroll\n  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {\n    int mask_group = c / channels_per_group;\n    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);\n    scalar_t output_val = 0;\n#pragma unroll\n    for (int iy = start_h; iy < end_h; iy += scale_factor) {\n#pragma unroll\n      for (int ix = start_w; ix < end_w; ix += scale_factor) {\n        if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {\n          continue;\n        }\n        int mask_iy =\n            (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;\n        int mask_ix =\n            (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;\n        int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);\n        output_val +=\n            shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];\n      }\n    }\n    bottom_diff[top_index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void FeatureSum(const int num_kernels,\n                           const scalar_t *__restrict__ input_data,\n              
             const int scale_factor, const int channels,\n                           const int height, const int width,\n                           scalar_t *__restrict__ output_data) {\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n  const int split_id = threadIdx.x % THREADS_PER_PIXEL;\n  index = index / THREADS_PER_PIXEL;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {\n    scalar_t output_val = 0;\n    for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {\n      for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {\n        int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,\n                                 width * scale_factor, channels);\n        output_val += input_data[input_id];\n      }\n    }\n    const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);\n    output_data[output_id] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void CARAFEBackward_Mask(const int num_kernels,\n                                    const scalar_t *__restrict__ top_diff,\n                                    const scalar_t *__restrict__ bottom_data,\n                                    const int kernel_size, const int group_size,\n                                    const int scale_factor, const int channels,\n                                    const int down_height, const int down_width,\n                                    const int height, const int width,\n                                    const int mask_channels,\n                                    scalar_t *__restrict__ mask_diff) {\n  int index = threadIdx.x + blockIdx.x * blockDim.x;\n  if (index > num_kernels - 1) {\n    return;\n  }\n\n  const int lane_id = index % WARP_SIZE;\n  index = index / WARP_SIZE;\n  const int mask_c = index % 
mask_channels;\n  // (n, c, ph, pw) is an element in the bottom_data\n  index = index / mask_channels;\n  const int pw = index % width;\n  const int ph = (index / width) % height;\n  const int n = index / width / height;\n\n  const int down_pw = pw / scale_factor;\n  const int down_ph = ph / scale_factor;\n\n  const int mask_group = mask_c / (kernel_size * kernel_size);\n  const int mask_loc = mask_c % (kernel_size * kernel_size);\n\n  const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;\n  const int offset_y =\n      mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;\n\n  const int down_x = down_pw + offset_x;\n  const int down_y = down_ph + offset_y;\n\n  scalar_t output_val = 0;\n\n  if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&\n      down_x <= down_width - 1) {\n    const int channels_per_mask = ceilf(channels / (float)group_size);\n    const int start = channels_per_mask * mask_group;\n    const int end = min(channels_per_mask * (mask_group + 1), channels);\n    for (int c = start + lane_id; c < end; c += WARP_SIZE) {\n      int bottom_id =\n          Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);\n      int top_id = Loc2Index(n, ph, pw, c, height, width, channels);\n      output_val += top_diff[top_id] * bottom_data[bottom_id];\n    }\n  }\n#ifdef MMCV_WITH_HIP\n  __syncthreads();\n#else\n  __syncwarp();\n#endif\n  output_val = warpReduceSum(output_val);\n  if (lane_id == 0) {\n    const int mask_id =\n        Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);\n    mask_diff[mask_id] = output_val;\n  }\n}\n\n#endif  // CARAFE_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/carafe_naive_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CARAFE_NAIVE_MUSA_KERNEL_MUH\n#define CARAFE_NAIVE_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n__device__ inline int Loc2Index(const int n, const int c, const int h,\n                                const int w, const int channel_num,\n                                const int height, const int width) {\n  int index = w + (h + (c + n * channel_num) * height) * width;\n  return index;\n}\n\ntemplate <typename scalar_t>\n__global__ void carafe_naive_forward_musa_kernel(\n    const int nthreads, const scalar_t *bottom_data,\n    const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,\n    const int group_size, const int scale_factor, const int channels,\n    const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the bottom_data\n    int pw = index % width;\n    int ph = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int mask_channels = kernel_size * kernel_size * group_size;\n    int mask_group = c / (channels / group_size);\n\n    int down_pw = pw / scale_factor;\n    int down_ph = ph / scale_factor;\n    int down_width = width / scale_factor;\n    int down_height = height / scale_factor;\n    int start_w = down_pw - (kernel_size - 1) / 2;\n    int end_w = down_pw + (kernel_size - 1) / 2 + 1;\n    int start_h = down_ph - (kernel_size - 1) / 2;\n    int end_h = down_ph + (kernel_size - 1) / 2 + 1;\n\n    scalar_t output_val = 0;\n    for (int iy = start_h; iy < end_h; iy++) {\n      for (int ix = start_w; ix < end_w; ix++) {\n        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {\n          continue;\n        }\n        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;\n        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;\n        int mask_c =\n            (mask_group * kernel_size + 
mask_iy) * kernel_size + mask_ix;\n        int feat_index =\n            Loc2Index(n, c, iy, ix, channels, down_height, down_width);\n        int mask_index =\n            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);\n        output_val += bottom_data[feat_index] * bottom_masks[mask_index];\n      }\n    }\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void carafe_naive_backward_musa_kernel(\n    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,\n    const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,\n    const int kernel_size, const int group_size, const int scale_factor,\n    const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the bottom_data\n    int pw = index % width;\n    int ph = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    int mask_channels = kernel_size * kernel_size * group_size;\n    int mask_group = c / (channels / group_size);\n\n    int down_pw = pw / scale_factor;\n    int down_ph = ph / scale_factor;\n    int down_width = width / scale_factor;\n    int down_height = height / scale_factor;\n    int start_w = down_pw - (kernel_size - 1) / 2;\n    int end_w = down_pw + (kernel_size - 1) / 2 + 1;\n    int start_h = down_ph - (kernel_size - 1) / 2;\n    int end_h = down_ph + (kernel_size - 1) / 2 + 1;\n\n    for (int iy = start_h; iy < end_h; iy++) {\n      for (int ix = start_w; ix < end_w; ix++) {\n        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {\n          continue;\n        }\n        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;\n        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;\n        int mask_c =\n            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;\n        int feat_index =\n            Loc2Index(n, c, iy, 
ix, channels, down_height, down_width);\n        int mask_index =\n            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);\n        atomicAdd(bottom_diff + feat_index,\n                  bottom_masks[mask_index] * top_diff[index]);\n        atomicAdd(mask_diff + mask_index,\n                  bottom_data[feat_index] * top_diff[index]);\n      }\n    }\n  }\n}\n\n#endif  // CARAFE_NAIVE_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/chamfer_distance_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu\n#ifndef CHAMFER_DISTANCE_MUSA_KERNEL_MUH\n#define CHAMFER_DISTANCE_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144\n\n#if MUSA_ARCH > 21\ntemplate <typename scalar_t>\n__global__ void chamfer_distance_forward_musa_kernel(int b, int n,\n                                                     const scalar_t* xyz, int m,\n                                                     const scalar_t* xyz2,\n                                                     scalar_t* result,\n                                                     int* result_i) {\n  __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];\n  for (int i = blockIdx.x; i < b; i += gridDim.x) {\n    for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {\n      int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;\n      for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {\n        buf[j] = xyz2[(i * m + k2) * 2 + j];\n      }\n      __syncthreads();\n      for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {\n        scalar_t x1 = xyz[(i * n + j) * 2 + 0];\n        scalar_t y1 = xyz[(i * n + j) * 2 + 1];\n        int best_i = 0;\n        scalar_t best = 1e10;\n        int end_ka = end_k & (~3);\n        if (end_ka == THREADS_PER_BLOCK) {\n          for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {\n#pragma unroll\n            for (int j = 0; j < 4; ++j) {\n              scalar_t x2 = buf[(k + j) * 2] - x1;\n              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;\n              scalar_t d = x2 * x2 + y2 * y2;\n              if (d < best) {\n                best = d;\n                best_i = k + k2 + j;\n              }\n            }\n          }\n        } else {\n          for (int k = 0; k < end_ka; k += 4) {\n#pragma unroll\n            for (int j = 0; j < 4; ++j) {\n    
          scalar_t x2 = buf[(k + j) * 2] - x1;\n              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;\n              scalar_t d = x2 * x2 + y2 * y2;\n              if (d < best) {\n                best = d;\n                best_i = k + k2 + j;\n              }\n            }\n          }\n        }\n        for (int k = end_ka; k < end_k; k++) {\n          scalar_t x2 = buf[k * 2 + 0] - x1;\n          scalar_t y2 = buf[k * 2 + 1] - y1;\n          scalar_t d = x2 * x2 + y2 * y2;\n          if (k == 0 || d < best) {\n            best = d;\n            best_i = k + k2;\n          }\n        }\n        if (k2 == 0 || result[(i * n + j)] > best) {\n          result[(i * n + j)] = best;\n          result_i[(i * n + j)] = best_i;\n        }\n      }\n      __syncthreads();\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void chamfer_distance_backward_musa_kernel(\n    int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,\n    const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,\n    scalar_t* grad_xyz2) {\n  for (int i = blockIdx.x; i < b; i += gridDim.x) {\n    for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {\n      scalar_t x1 = xyz1[(i * n + j) * 2 + 0];\n      scalar_t y1 = xyz1[(i * n + j) * 2 + 1];\n      int j2 = idx1[i * n + j];\n      scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];\n      scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];\n      scalar_t g = grad_dist1[i * n + j] * 2;\n      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));\n      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));\n      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));\n      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));\n    }\n  }\n}\n#else\n#warning \"chamfer_distance is supported when MUSA_ARCH > 21\"\n#endif  //MUSA_ARCH\n\n#endif  // CHAMFER_DISTANCE_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/common_musa_helper.hpp",
    "content": "#ifndef COMMON_MUSA_HELPER\n#define COMMON_MUSA_HELPER\n\n#include <musa.h>\n\n#define MUSA_1D_KERNEL_LOOP(i, n)                              \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \\\n       i += blockDim.x * gridDim.x)\n\n#define MUSA_2D_KERNEL_LOOP(i, n, j, m)                             \\\n  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \\\n       i += blockDim.x * gridDim.x)                                 \\\n    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \\\n         j += blockDim.y * gridDim.y)\n\n#define MUSA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \\\n  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \\\n    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)\n\n#define THREADS_PER_BLOCK 512\n\ninline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {\n  int optimal_block_num = (N + num_threads - 1) / num_threads;\n  int max_block_num = 4096;\n  return min(optimal_block_num, max_block_num);\n}\n\ntemplate <typename T>\n__device__ T bilinear_interpolate(const T* input, const int height,\n                                  const int width, T y, T x,\n                                  const int index /* index for debug only*/) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  int y_low = (int)y;\n  int x_low = (int)x;\n  int y_high;\n  int x_high;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. 
- lx;\n  // do bilinear interpolation\n  T v1 = input[y_low * width + x_low];\n  T v2 = input[y_low * width + x_high];\n  T v3 = input[y_high * width + x_low];\n  T v4 = input[y_high * width + x_high];\n  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  return val;\n}\n\ntemplate <typename T>\n__device__ void bilinear_interpolate_gradient(\n    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,\n    int& x_low, int& x_high, int& y_low, int& y_high,\n    const int index /* index for debug only*/) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) {\n    // empty\n    w1 = w2 = w3 = w4 = 0.;\n    x_low = x_high = y_low = y_high = -1;\n    return;\n  }\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  y_low = (int)y;\n  x_low = (int)x;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. - lx;\n\n  // reference in forward\n  // T v1 = input[y_low * width + x_low];\n  // T v2 = input[y_low * width + x_high];\n  // T v3 = input[y_high * width + x_low];\n  // T v4 = input[y_high * width + x_high];\n  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  return;\n}\n#endif  // COMMON_MUSA_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/convex_iou_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CONVEX_IOU_MUSA_KERNEL_MUH\n#define CONVEX_IOU_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n#define MAXN 100\n#define NMAX 512\n__device__ const double EPS = 1E-8;\n\n__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }\n\nstruct Point {\n  double x, y;\n  __device__ Point() {}\n  __device__ Point(double x, double y) : x(x), y(y) {}\n};\n\n__device__ inline bool point_same(Point& a, Point& b) {\n  return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;\n}\n\n__device__ inline void swap1(Point* a, Point* b) {\n  Point temp;\n  temp.x = a->x;\n  temp.y = a->y;\n\n  a->x = b->x;\n  a->y = b->y;\n\n  b->x = temp.x;\n  b->y = temp.y;\n}\n\n__device__ inline void reverse1(Point* a, const int n) {\n  for (int i = 0; i < (n - 1) / 2.0; i++) {\n    Point* j = &(a[i]);\n    Point* k = &(a[n - 1 - i]);\n    swap1(j, k);\n  }\n}\n\n__device__ inline double cross(Point o, Point a, Point b) {\n  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);\n}\n\n__device__ inline double dis(Point a, Point b) {\n  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);\n}\n__device__ inline double area(Point* ps, int n) {\n  ps[n] = ps[0];\n  double res = 0;\n  for (int i = 0; i < n; i++) {\n    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;\n  }\n  return res / 2.0;\n}\n__device__ inline double polygon_area_grad(Point* ps, int n,\n                                           int* polygon_to_pred_index,\n                                           int n_pred, double* grad_C) {\n  ps[n] = ps[0];\n  double partion_grad[4 * 30 + 2];\n  double res = 0;\n  for (int i = 0; i < n; i++) {\n    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;\n    partion_grad[i * 4 + 2] = ps[i + 1].y;\n    partion_grad[i * 4 + 3] = -ps[i + 1].x;\n    if (i != n - 1) {\n      partion_grad[i * 4 + 4] = -ps[i].y;\n      partion_grad[i * 4 + 5] = ps[i].x;\n    } else {\n      partion_grad[0] = 
-ps[i].y;\n      partion_grad[1] = ps[i].x;\n    }\n  }\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < n_pred; j++) {\n      if (i == polygon_to_pred_index[j]) {\n        grad_C[2 * polygon_to_pred_index[j + n_pred]] =\n            (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;\n        break;\n      }\n    }\n    for (int j = 0; j < n_pred; j++) {\n      if (i == polygon_to_pred_index[j]) {\n        grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =\n            (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;\n        break;\n      }\n    }\n  }\n\n  return res / 2.0;\n}\n\n__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,\n                                double* cut_grad, int m, int n, int i) {\n  double s1, s2;\n  double s2_s1_2;\n  double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;\n  double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;\n  s1 = cross(a, b, c);\n  s2 = cross(a, b, d);\n\n  ds1_dxc = -(b.y - a.y);\n  ds1_dyc = b.x - a.x;\n  ds2_dxd = ds1_dxc;\n  ds2_dyd = ds1_dyc;\n  s2_s1_2 = (s2 - s1) * (s2 - s1);\n\n  if (sig(s1) == 0 && sig(s2) == 0) return 2;\n  if (sig(s2 - s1) == 0) return 0;\n\n  dxp_dxc =\n      ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /\n      (s2_s1_2);\n  dxp_dyc =\n      ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /\n      (s2_s1_2);\n  dxp_dxd =\n      ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /\n      (s2_s1_2);\n  dxp_dyd =\n      ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /\n      (s2_s1_2);\n\n  dyp_dxc =\n      ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /\n      (s2_s1_2);\n  dyp_dyc =\n      ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /\n      (s2_s1_2);\n  dyp_dxd =\n      ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /\n      (s2_s1_2);\n  dyp_dyd =\n      
((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /\n      (s2_s1_2);\n\n  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);\n  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);\n  if (i == n - 1) {\n    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;\n    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;\n    cut_grad[4 * n * m + 0] = dxp_dxd;  // + dyp_dxd;\n    cut_grad[4 * n * m + 1] = dyp_dxd;\n    cut_grad[4 * n * m + 2] = dxp_dyd;  // + dyp_dyd;\n    cut_grad[4 * n * m + 3] = dyp_dyd;\n  } else {\n    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;\n    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;\n    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;\n    cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd;  // + dyp_dxd;\n    cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;\n    cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd;  // + dyp_dyd;\n    cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;\n  }\n\n  return 1;\n}\n__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,\n                                   double* cut_grad) {\n  Point pp[MAXN];\n  double ccur_grad[MAXN] = {};\n  int m = 0;\n  p[n] = p[0];\n  int k = n;\n  for (int i = 0; i < n; i++) {\n    if (sig(cross(a, b, p[i])) > 0) {\n      pp[m] = p[i];\n      ccur_grad[4 * n * m + 4 * i] = 1.0;\n      ccur_grad[4 * n * m + 4 * i + 3] = 1.0;\n      m++;\n    }\n    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {\n      lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);\n      m++;\n    }\n  }\n\n  n = 0;\n  for (int i = 0; i < m; i++) {\n    if (!i || !(point_same(pp[i], pp[i - 1]))) {\n      p[n] = pp[i];\n      for (int j = 0; j < 4 * k; j++) {\n        cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];\n      }\n      n++;\n    }\n  }\n\n  while (n > 1 && point_same(p[n - 1], p[0])) 
n--;\n}\n\n__device__ inline double intersectArea(Point a, Point b, Point c, Point d,\n                                       double* grad_AB, int order,\n                                       int convex_n) {\n  Point o(0, 0);\n  int res_flag = 0;\n  int s1 = sig(cross(o, a, b));\n  int s2 = sig(cross(o, c, d));\n  if (s1 == 0 || s2 == 0) return 0.0;\n  if (s1 == -1) {\n    Point* i = &a;\n    Point* j = &b;\n    swap1(i, j);\n    res_flag = 1;\n  }\n  if (s2 == -1) {\n    Point* i = &c;\n    Point* j = &d;\n    swap1(i, j);\n  }\n  Point p[10] = {o, a, b};\n  int n = 3, n0 = 3, n1, n2, n3;\n  double cut_grad1[MAXN] = {};\n  double cut_grad2[MAXN] = {};\n  double cut_grad3[MAXN] = {};\n  double p1_p_grad[10][10] = {};\n  double p2_p1_grad[10][10] = {};\n  double p3_p2_grad[10][10] = {};\n\n  double p3_p1_grad[10][10] = {};\n  double p3_p_grad[10][10] = {};\n\n  // 1\n  polygon_cut(p, n, o, c, cut_grad1);\n  n1 = n;\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < 4 * n0; j++) {\n      if (!(j % 2)) {\n        p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];\n      } else {\n        p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];\n      }\n    }\n  }\n\n  // 2\n  polygon_cut(p, n, c, d, cut_grad2);\n  n2 = n;\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < 4 * n1; j++) {\n      if (!(j % 2)) {\n        p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];\n      } else {\n        p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];\n      }\n    }\n  }\n  // 3\n  polygon_cut(p, n, d, o, cut_grad3);\n  n3 = n;\n  for (int i = 0; i < n; i++) {\n    for (int j = 0; j < 4 * n2; j++) {\n      if (!(j % 2)) {\n        p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];\n      } else {\n        p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];\n      }\n    }\n  }\n\n  // mul\n  //  p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)\n  for (int i = 0; i < 2 * n3; i++) {\n    for (int j = 0; j < 2 * n1; j++) {\n      double sum 
= 0.0;\n      for (int m = 0; m < 2 * n2; m++) {\n        sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];\n      }\n      p3_p1_grad[i][j] = sum;\n    }\n  }\n\n  // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)\n  for (int i = 0; i < 2 * n3; i++) {\n    for (int j = 0; j < 2 * n0; j++) {\n      double sum = 0.0;\n      for (int m = 0; m < 2 * n1; m++) {\n        sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];\n      }\n      p3_p_grad[i][j] = sum;\n    }\n  }\n\n  // calculate S_grad\n  int polygon_index_box_index[20];\n  double grad_polygon[20];\n  double S_grad[6];\n\n  for (int i = 0; i < n3; i++) {\n    polygon_index_box_index[i] = i;\n    polygon_index_box_index[i + n3] = i;\n  }\n\n  double res =\n      polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);\n\n  if (s1 * s2 == -1) {\n    for (int j = 0; j < 2 * 3; j++) {\n      double sum = 0.0;\n      for (int m = 0; m < 2 * n3; m++) {\n        sum = sum - grad_polygon[m] * p3_p_grad[m][j];\n      }\n      S_grad[j] = sum;\n    }\n\n    if (order != convex_n - 1) {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[2 * order + 2] += S_grad[2];\n        grad_AB[2 * order + 3] += S_grad[3];\n\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[2 * order + 2] += S_grad[4];\n        grad_AB[2 * order + 3] += S_grad[5];\n      }\n    } else {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[0] += S_grad[2];\n        grad_AB[1] += S_grad[3];\n\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[0] += S_grad[4];\n        grad_AB[1] += S_grad[5];\n      }\n    }\n    res = -res;\n  } else {\n    for (int j = 0; j < 2 * 3; j++) {\n      double sum = 0.0;\n      for (int m = 0; m < 2 * n3; m++) {\n        sum 
= sum + grad_polygon[m] * p3_p_grad[m][j];\n      }\n      S_grad[j] = sum;\n    }\n\n    if (order != convex_n - 1) {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[2 * order + 2] += S_grad[2];\n        grad_AB[2 * order + 3] += S_grad[3];\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[2 * order + 2] += S_grad[4];\n        grad_AB[2 * order + 3] += S_grad[5];\n      }\n    } else {\n      if (res_flag) {\n        grad_AB[2 * order] += S_grad[4];\n        grad_AB[2 * order + 1] += S_grad[5];\n        grad_AB[0] += S_grad[2];\n        grad_AB[1] += S_grad[3];\n      } else {\n        grad_AB[2 * order] += S_grad[2];\n        grad_AB[2 * order + 1] += S_grad[3];\n        grad_AB[0] += S_grad[4];\n        grad_AB[1] += S_grad[5];\n      }\n    }\n  }\n  return res;\n}\n\n__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,\n                                        double* grad_AB) {\n  if (area(ps1, n1) < 0) reverse1(ps1, n1);\n  if (area(ps2, n2) < 0) reverse1(ps2, n2);\n  ps1[n1] = ps1[0];\n  ps2[n2] = ps2[0];\n  double res = 0;\n  for (int i = 0; i < n1; i++) {\n    for (int j = 0; j < n2; j++) {\n      res +=\n          intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);\n    }\n  }\n  return res;\n}\n\n__device__ inline void Jarvis(Point* in_poly, int& n_poly) {\n  Point p_max, p_k;\n  int max_index, k_index;\n  int Stack[NMAX] = {}, top1, top2;\n  double sign;\n  Point right_point[10], left_point[10];\n\n  for (int i = 0; i < n_poly; i++) {\n    if (in_poly[i].y < in_poly[0].y ||\n        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {\n      Point* j = &(in_poly[0]);\n      Point* k = &(in_poly[i]);\n      swap1(j, k);\n    }\n    if (i == 0) {\n      p_max = in_poly[0];\n      max_index = 0;\n    }\n    if (in_poly[i].y > p_max.y ||\n        
in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {\n      p_max = in_poly[i];\n      max_index = i;\n    }\n  }\n\n  if (max_index == 0) {\n    max_index = 1;\n    p_max = in_poly[max_index];\n  }\n\n  k_index = 0, Stack[0] = 0, top1 = 0;\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);\n      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >\n                                         dis(in_poly[Stack[top1]], p_k)))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top1++;\n    Stack[top1] = k_index;\n  }\n  for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];\n\n  k_index = 0, Stack[0] = 0, top2 = 0;\n\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);\n      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >\n                                        dis(in_poly[Stack[top2]], p_k))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top2++;\n    Stack[top2] = k_index;\n  }\n  for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];\n\n  for (int i = 0; i < top1 + top2; i++) {\n    if (i <= top1) {\n      in_poly[i] = right_point[i];\n    } else {\n      in_poly[i] = left_point[top2 - (i - top1)];\n    }\n  }\n  n_poly = top1 + top2;\n}\n\n__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,\n                                           int n2, double* grad_C) {\n  Point polygon[MAXN];\n  int n = n1 + n2, n_poly = 0;\n  for (int i = 0; i < n1; i++) {\n    for (int j = 0; j < n - n1; j++) {\n      if (point_same(ps1[i], ps2[j])) {\n        for (int k = j; k < n - n1 - 1; k++) {\n          ps2[k] = ps2[k + 1];\n        }\n        n2--;\n        break;\n      }\n    }\n  }\n  
n_poly = n1 + n2;\n  for (int i = 0; i < n_poly; i++) {\n    if (i < n1) {\n      polygon[i] = ps1[i];\n    } else {\n      polygon[i] = ps2[i - n1];\n    }\n  }\n\n  Jarvis(polygon, n_poly);\n\n  int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,\n                                   -1, -1, -1, -1, -1, -1, -1, -1, -1};\n  int n_pred = 0;\n  for (int i = 0; i < n_poly; i++) {\n    for (int j = 0; j < n1; j++) {\n      if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {\n        polygon_to_pred_index[n_pred] = i;\n        polygon_to_pred_index[n_pred + n1] = j;\n        n_pred += 1;\n        break;\n      }\n    }\n  }\n  if (n_pred == 0) {\n    double polygon_area = fabs(area(polygon, n_poly));\n    for (int i = 0; i < 18; i++) {\n      grad_C[i] = 0.0;\n    }\n    return polygon_area;\n  } else {\n    double polygon_area =\n        polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);\n    if (polygon_area < 0) {\n      for (int i = 0; i < 18; i++) {\n        grad_C[i] = -grad_C[i];\n      }\n    }\n    return fabs(polygon_area);\n  }\n}\n\n// convex_find and get the polygon_index_box_index\n__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,\n                                        int* points_to_convex_ind) {\n  int n_input = n_poly;\n  Point input_poly[20];\n  for (int i = 0; i < n_input; i++) {\n    input_poly[i].x = in_poly[i].x;\n    input_poly[i].y = in_poly[i].y;\n  }\n  Point p_max, p_k;\n  int max_index, k_index;\n  int Stack[20], top1, top2;\n  double sign;\n  Point right_point[10], left_point[10];\n\n  for (int i = 0; i < n_poly; i++) {\n    if (in_poly[i].y < in_poly[0].y ||\n        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {\n      Point* j = &(in_poly[0]);\n      Point* k = &(in_poly[i]);\n      swap1(j, k);\n    }\n    if (i == 0) {\n      p_max = in_poly[0];\n      max_index = 0;\n    }\n    if (in_poly[i].y > p_max.y ||\n        in_poly[i].y == p_max.y && in_poly[i].x 
> p_max.x) {\n      p_max = in_poly[i];\n      max_index = i;\n    }\n  }\n  if (max_index == 0) {\n    max_index = 1;\n    p_max = in_poly[max_index];\n  }\n\n  k_index = 0, Stack[0] = 0, top1 = 0;\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);\n      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >\n                                         dis(in_poly[Stack[top1]], p_k)))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top1++;\n    Stack[top1] = k_index;\n  }\n  for (int i = 0; i <= top1; i++) {\n    right_point[i] = in_poly[Stack[i]];\n  }\n\n  k_index = 0, Stack[0] = 0, top2 = 0;\n\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);\n      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >\n                                        dis(in_poly[Stack[top2]], p_k))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top2++;\n    Stack[top2] = k_index;\n  }\n\n  for (int i = top2 - 1; i >= 0; i--) {\n    left_point[i] = in_poly[Stack[i]];\n  }\n\n  for (int i = 0; i < top1 + top2; i++) {\n    if (i <= top1) {\n      in_poly[i] = right_point[i];\n    } else {\n      in_poly[i] = left_point[top2 - (i - top1)];\n    }\n  }\n  n_poly = top1 + top2;\n  for (int i = 0; i < n_poly; i++) {\n    for (int j = 0; j < n_input; j++) {\n      if (point_same(in_poly[i], input_poly[j])) {\n        points_to_convex_ind[i] = j;\n        break;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__device__ inline float devrIoU(T const* const p, T const* const q,\n                                T* point_grad, const int idx) {\n  Point ps1[MAXN], ps2[MAXN];\n\n  Point convex[MAXN];\n  for (int i = 0; i < 9; i++) {\n    convex[i].x = 
(double)p[i * 2];\n    convex[i].y = (double)p[i * 2 + 1];\n  }\n  int n_convex = 9;\n  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};\n  Jarvis_and_index(convex, n_convex, points_to_convex_ind);\n\n  int n1 = n_convex;\n  int n2 = 4;\n\n  for (int i = 0; i < n1; i++) {\n    ps1[i].x = (double)convex[i].x;\n    ps1[i].y = (double)convex[i].y;\n  }\n\n  for (int i = 0; i < n2; i++) {\n    ps2[i].x = (double)q[i * 2];\n    ps2[i].y = (double)q[i * 2 + 1];\n  }\n\n  int polygon_index_box_index[18];\n  for (int i = 0; i < n1; i++) {\n    polygon_index_box_index[i] = i;\n    polygon_index_box_index[i + n1] = i;\n  }\n\n  double grad_A[18] = {};\n  double grad_AB[18] = {};\n  double grad_C[18] = {};\n\n  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);\n  double S_pred =\n      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);\n  if (S_pred < 0) {\n    for (int i = 0; i < n_convex * 2; i++) {\n      grad_A[i] = -grad_A[i];\n    }\n  }\n  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;\n\n  double iou = inter_area / union_area;\n  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);\n\n  //    printf(\"%d:live\\n\", idx);\n  double rot_giou = iou - (polygon_area - union_area) / polygon_area;\n\n  float grad_point_temp[18] = {};\n\n  for (int i = 0; i < n_convex; i++) {\n    int grad_point = points_to_convex_ind[i];\n    grad_point_temp[2 * grad_point] =\n        (float)((union_area + inter_area) / (union_area * union_area) *\n                    grad_AB[2 * i] -\n                iou / union_area * grad_A[2 * i] -\n                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -\n                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);\n    grad_point_temp[2 * grad_point + 1] =\n        (float)((union_area + inter_area) / (union_area * union_area) *\n                    grad_AB[2 * i + 1] -\n                iou / union_area * grad_A[2 * i + 1] -\n                1 
/ polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -\n                (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);\n  }\n\n  for (int i = 0; i < 9; i++) {\n    point_grad[2 * i] = grad_point_temp[2 * i];\n    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];\n  }\n  return (float)rot_giou;\n}\n\ntemplate <typename T>\n__global__ void convex_giou_musa_kernel(const int ex_n_boxes,\n                                        const int gt_n_boxes, const T* ex_boxes,\n                                        const T* gt_boxes, T* point_grad) {\n  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {\n    const T* cur_box = ex_boxes + index * 18;\n    const T* cur_gt_box = gt_boxes + index * 8;\n    T* cur_grad = point_grad + index * 19;\n    T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);\n    cur_grad[18] = giou;\n  }\n}\n\n__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {\n  double s1, s2;\n  s1 = cross(a, b, c);\n  s2 = cross(a, b, d);\n  if (sig(s1) == 0 && sig(s2) == 0) return 2;\n  if (sig(s2 - s1) == 0) return 0;\n  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);\n  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);\n  return 1;\n}\n\n__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {\n  Point pp[MAXN];\n  int m = 0;\n  p[n] = p[0];\n  for (int i = 0; i < n; i++) {\n    if (sig(cross(a, b, p[i])) > 0) {\n      pp[m] = p[i];\n      m++;\n    }\n    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {\n      lineCross(a, b, p[i], p[i + 1], pp[m]);\n      m++;\n    }\n  }\n  n = 0;\n  for (int i = 0; i < m; i++) {\n    if (!i || !(point_same(pp[i], pp[i - 1]))) {\n      p[n] = pp[i];\n      n++;\n    }\n  }\n\n  while (n > 1 && point_same(p[n - 1], p[0])) n--;\n}\n\n__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {\n  Point o(0, 0);\n  int s1 = sig(cross(o, a, b));\n  int s2 = sig(cross(o, c, d));\n  if (s1 == 0 || s2 == 0) return 0.0;\n  if (s1 == -1) {\n    Point* i = 
&a;\n    Point* j = &b;\n    swap1(i, j);\n  }\n  if (s2 == -1) {\n    Point* i = &c;\n    Point* j = &d;\n    swap1(i, j);\n  }\n  Point p[10] = {o, a, b};\n  int n = 3;\n\n  polygon_cut(p, n, o, c);\n  polygon_cut(p, n, c, d);\n  polygon_cut(p, n, d, o);\n  double res = area(p, n);\n  if (s1 * s2 == -1) res = -res;\n  return res;\n}\n__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,\n                                        int n2) {\n  if (area(ps1, n1) < 0) reverse1(ps1, n1);\n  if (area(ps2, n2) < 0) reverse1(ps2, n2);\n  ps1[n1] = ps1[0];\n  ps2[n2] = ps2[0];\n  double res = 0;\n  for (int i = 0; i < n1; i++) {\n    for (int j = 0; j < n2; j++) {\n      res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);\n    }\n  }\n  return res;\n}\n\ntemplate <typename T>\n__device__ inline float devrIoU(T const* const p, T const* const q) {\n  Point ps1[MAXN], ps2[MAXN];\n  Point convex[MAXN];\n  for (int i = 0; i < 9; i++) {\n    convex[i].x = (double)p[i * 2];\n    convex[i].y = (double)p[i * 2 + 1];\n  }\n  int n_convex = 9;\n  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};\n  Jarvis_and_index(convex, n_convex, points_to_convex_ind);\n  int n1 = n_convex;\n  for (int i = 0; i < n1; i++) {\n    ps1[i].x = (double)convex[i].x;\n    ps1[i].y = (double)convex[i].y;\n  }\n  int n2 = 4;\n  for (int i = 0; i < n2; i++) {\n    ps2[i].x = (double)q[i * 2];\n    ps2[i].y = (double)q[i * 2 + 1];\n  }\n  double inter_area = intersectAreaO(ps1, n1, ps2, n2);\n  double S_pred = area(ps1, n1);\n  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;\n  double iou = inter_area / union_area;\n  return (float)iou;\n}\n\ntemplate <typename T>\n__global__ void convex_iou_musa_kernel(const int ex_n_boxes,\n                                       const int gt_n_boxes, const T* ex_boxes,\n                                       const T* gt_boxes, T* iou) {\n  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {\n    const T* cur_box = 
ex_boxes + index * 18;\n    for (int i = 0; i < gt_n_boxes; i++) {\n      iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);\n    }\n  }\n}\n#endif  // CONVEX_IOU_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/correlation_musa.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu\n// Original licence: Under MIT License\n\n#ifndef CORRELATION_MUSA\n#define CORRELATION_MUSA\n\n#include \"pytorch_musa_helper.hpp\"\n\n#include <musa.h>\n#include <musa_runtime.h>\n// Using <torch/extension.h> is recommended in the official documentation in\n// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.\n// However, we use <torch/types.h> for compatibility with MUSA 9.0\n// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.\n#include <torch/types.h>\n\n#include <iostream>\n#include <vector>\n\nusing namespace torch;\n\n#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>\n#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>\n#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)\n\n#define WARP_SIZE 32\n#define FULL_MASK 0xffffffff\n\ntemplate <typename scalar_t>\n__global__ void correlation_forward_musa_kernel(\n    const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,\n    int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,\n    int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,\n    int oH, int oW) {\n  const int iH = rInput1.size(1);\n  const int iW = rInput1.size(2);\n  const int C = rInput1.size(3);\n\n  const int n = blockIdx.x;\n  const int h = blockIdx.y * blockDim.y + threadIdx.y;\n  const int w = blockIdx.z * blockDim.z + threadIdx.z;\n\n  if (h >= oH || w >= oW) return;\n\n  const int thread = threadIdx.x;\n\n  const int start_i = -padH + h * dH;\n  const int start_j = -padW + w * dW;\n\n  const int patchRadH = dilation_patchH * (patchH - 1) / 2;\n  const int patchRadW = dilation_patchW * (patchW - 1) / 2;\n\n  for (int ph = 0; ph < patchH; ++ph) {\n    int ph_dilated = 
ph * dilation_patchH - patchRadH;\n    for (int pw = 0; pw < patchW; ++pw) {\n      int pw_dilated = pw * dilation_patchW - patchRadW;\n      scalar_t prod_sum = 0.0f;\n      for (int i = 0; i < kH; ++i) {\n        int i1 = start_i + i * dilationH;\n        int i2 = i1 + ph_dilated;\n        if (WITHIN_BOUNDS(i1, i2, iH, iH)) {\n          for (int j = 0; j < kW; ++j) {\n            int j1 = start_j + j * dilationW;\n            int j2 = j1 + pw_dilated;\n            if (WITHIN_BOUNDS(j1, j2, iW, iW)) {\n              for (int c = thread; c < C; c += WARP_SIZE) {\n                scalar_t v1 = rInput1[n][i1][j1][c];\n                scalar_t v2 = rInput2[n][i2][j2][c];\n                prod_sum += v1 * v2;\n              }\n            }\n          }\n        }\n      }\n      // accumulate\n      for (int offset = 16; offset > 0; offset /= 2)\n#ifdef MMCV_WITH_HIP\n        prod_sum += __shfl_down(float(prod_sum), offset);\n#else\n        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);\n#endif\n      if (thread == 0) {\n        output[n][ph][pw][h][w] = prod_sum;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void correlation_backward_musa_kernel_input1(\n    const TensorAcc5R grad_output, const TensorAcc4R input2,\n    TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,\n    const int patchW, const int padH, const int padW, const int dilationH,\n    const int dilationW, const int dilation_patchH, const int dilation_patchW,\n    const int dH, const int dW) {\n  const int iH = input2.size(1);\n  const int iW = input2.size(2);\n  const int C = input2.size(3);\n\n  const int H = grad_output.size(3);\n  const int W = grad_output.size(4);\n\n  const int patchRadH = (patchH - 1) / 2;\n  const int patchRadW = (patchW - 1) / 2;\n\n  const int n = blockIdx.x;\n  const int h = blockIdx.y;\n  const int w = blockIdx.z;\n\n  const int h_2 = h + padH;\n  const int w_2 = w + padW;\n  const int min_h = h_2 - kH * 
dilationH;\n  const int min_w = w_2 - kW * dilationW;\n\n  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];\n  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);\n  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {\n    const int ph = i / patchW;\n    const int pw = i % patchW;\n    int i1 = h + dilation_patchH * (ph - patchRadH);\n    int j1 = w + dilation_patchW * (pw - patchRadW);\n\n    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n      scalar_t grad_val = 0.0f;\n      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {\n        int i2 = (h_3) / dH;\n        if (i2 * dH != h_3) continue;\n        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {\n          int j2 = (w_3) / dW;\n          if (j2 * dW != w_3) continue;\n          if (WITHIN_BOUNDS(i2, j2, H, W)) {\n            grad_val += grad_output[n][ph][pw][i2][j2];\n          }\n        }\n      }\n      grad_cache[i] = grad_val;\n    }\n  }\n  __syncthreads();\n\n  for (int c = threadIdx.x; c < C; c += blockDim.x) {\n    scalar_t grad_input_val = 0.0f;\n    for (int ph = 0; ph < patchH; ++ph) {\n      int i1 = h + dilation_patchH * (ph - patchRadH);\n      for (int pw = 0; pw < patchW; ++pw) {\n        int j1 = w + dilation_patchW * (pw - patchRadW);\n        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n          grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];\n        }\n      }\n    }\n    grad_input1[n][c][h][w] = grad_input_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void correlation_backward_musa_kernel_input2(\n    const TensorAcc5R grad_output, const TensorAcc4R input1,\n    TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,\n    int padW, int dilationH, int dilationW, int dilation_patchH,\n    int dilation_patchW, int dH, int dW) {\n  const int iH = input1.size(1);\n  const int iW = input1.size(2);\n  const int C = input1.size(3);\n\n  const int patchRadH = (patchH - 1) / 2;\n  const int 
patchRadW = (patchW - 1) / 2;\n\n  const int H = grad_output.size(3);\n  const int W = grad_output.size(4);\n\n  const int dilatedKH = kH * dilationH;\n  const int dilatedKW = kW * dilationW;\n\n  const int n = blockIdx.x;\n  const int h = blockIdx.y;\n  const int w = blockIdx.z;\n\n  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];\n  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);\n  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {\n    const int ph = i / patchW;\n    const int pw = i % patchW;\n    int i1 = h - dilation_patchH * (ph - patchRadH);\n    int j1 = w - dilation_patchW * (pw - patchRadW);\n\n    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n      scalar_t grad_val = 0.0f;\n\n      const int h_2 = i1 + padH;\n      const int w_2 = j1 + padW;\n      const int min_h = h_2 - dilatedKH;\n      const int min_w = w_2 - dilatedKW;\n\n      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {\n        int i2 = (h_3) / dH;\n        if (i2 * dH != h_3) continue;\n        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {\n          int j2 = (w_3) / dW;\n          if (j2 * dW != w_3) continue;\n          if (WITHIN_BOUNDS(i2, j2, H, W)) {\n            grad_val += grad_output[n][ph][pw][i2][j2];\n          }\n        }\n      }\n      grad_cache[i] = grad_val;\n    }\n  }\n  __syncthreads();\n\n  for (int c = threadIdx.x; c < C; c += blockDim.x) {\n    scalar_t grad_input_val = 0.0f;\n    for (int ph = 0; ph < patchH; ++ph) {\n      int i1 = h - dilation_patchH * (ph - patchRadH);\n      for (int pw = 0; pw < patchW; ++pw) {\n        int j1 = w - dilation_patchW * (pw - patchRadW);\n        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {\n          grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];\n        }\n      }\n    }\n    grad_input2[n][c][h][w] = grad_input_val;\n  }\n}\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/deform_conv_musa_kernel.muh",
    "content": "/*!\n ******************* BEGIN Caffe Copyright Notice and Disclaimer\n *****************\n *\n * COPYRIGHT\n *\n * All contributions by the University of California:\n * Copyright (c) 2014-2017 The Regents of the University of California (Regents)\n * All rights reserved.\n *\n * All other contributions:\n * Copyright (c) 2014-2017, the respective contributors\n * All rights reserved.\n *\n * Caffe uses a shared copyright model: each contributor holds copyright over\n * their contributions to Caffe. The project versioning records all such\n * contribution and copyright details. If a contributor wants to further mark\n * their specific copyright on a particular contribution, they should indicate\n * their copyright solely in the commit message of the change when it is\n * committed.\n *\n * LICENSE\n *\n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted provided that the following conditions are met:\n *\n * 1. Redistributions of source code must retain the above copyright notice,\n *this list of conditions and the following disclaimer.\n * 2. Redistributions in binary form must reproduce the above copyright notice,\n * this list of conditions and the following disclaimer in the documentation\n * and/or other materials provided with the distribution.\n *\n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n *\n * CONTRIBUTION AGREEMENT\n *\n * By contributing to the BVLC/caffe repository through pull-request, comment,\n * or otherwise, the contributor releases their content to the\n * license and copyright terms herein.\n *\n ***************** END Caffe Copyright Notice and Disclaimer\n *********************\n *\n * Copyright (c) 2018 Microsoft\n * Licensed under The MIT License [see LICENSE for details]\n * \\file modulated_deformable_im2col.muh\n * \\brief Function definitions of converting an image to\n * column matrix based on kernel, padding, dilation, and offset.\n * These functions are mainly used in deformable convolution operators.\n * \\ref: https://arxiv.org/abs/1703.06211\n * \\author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng\n */\n\n// modified from\n// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu\n\n#ifndef DEFORM_CONV_MUSA_KERNEL_MUH\n#define DEFORM_CONV_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\n\ntemplate <typename T>\n__device__ T deformable_im2col_bilinear(const T *input, const int data_width,\n                                        const int height, const int width, T h,\n                                        T w) {\n  if (h <= -1 || height <= h || w <= -1 || width <= w) {\n    return 0;\n  }\n\n  int h_low = floorf(h);\n  int w_low = floorf(w);\n  int h_high = h_low + 1;\n  int 
w_high = w_low + 1;\n\n  T lh = h - h_low;\n  T lw = w - w_low;\n  T hh = 1 - lh, hw = 1 - lw;\n\n  T v1 = 0;\n  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];\n  T v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n    v2 = input[h_low * data_width + w_high];\n  T v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n    v3 = input[h_high * data_width + w_low];\n  T v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n    v4 = input[h_high * data_width + w_high];\n\n  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename T>\n__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,\n                                 const int w, const int height,\n                                 const int width) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n  if (h == argmax_h_low && w == argmax_w_low)\n    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);\n  if (h == argmax_h_low && w == argmax_w_high)\n    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);\n  if (h == argmax_h_high && w == argmax_w_low)\n    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);\n  if (h == argmax_h_high && w == argmax_w_high)\n    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);\n  return weight;\n}\n\ntemplate <typename T>\n__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,\n                                   const int width, const T *im_data,\n                                   const int data_width, const int bp_dir) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = 
floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n\n  if (bp_dir == 0) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += -1 * (argmax_w - argmax_w_low) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_w - argmax_w_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  } else if (bp_dir == 1) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  }\n\n  return weight;\n}\n\ntemplate <typename T>\n__global__ void deformable_im2col_gpu_kernel(\n    const int n, const T *data_im, const T *data_offset, const int height,\n    const int width, const int kernel_h, const int kernel_w, const int pad_h,\n    const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    
const int channel_per_deformable_group, const int batch_size,\n    const int num_channels, const int deformable_group, const int height_col,\n    const int width_col, T *data_col) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    // index index of output matrix\n    const int w_col = index % width_col;\n    const int h_col = (index / width_col) % height_col;\n    const int b_col = (index / width_col / height_col) % batch_size;\n    const int c_im = (index / width_col / height_col) / batch_size;\n    const int c_col = c_im * kernel_h * kernel_w;\n\n    // compute deformable group index\n    const int deformable_group_index = c_im / channel_per_deformable_group;\n\n    const int h_in = h_col * stride_h - pad_h;\n    const int w_in = w_col * stride_w - pad_w;\n    T *data_col_ptr =\n        data_col +\n        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;\n    const T *data_im_ptr =\n        data_im + (b_col * num_channels + c_im) * height * width;\n    const T *data_offset_ptr =\n        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        const int data_offset_h_ptr =\n            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;\n        const int data_offset_w_ptr =\n            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +\n            w_col;\n        const T offset_h = data_offset_ptr[data_offset_h_ptr];\n        const T offset_w = data_offset_ptr[data_offset_w_ptr];\n        T val = static_cast<T>(0);\n        const T h_im = h_in + i * dilation_h + offset_h;\n        const T w_im = w_in + j * dilation_w + offset_w;\n        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)\n          val = deformable_im2col_bilinear(data_im_ptr, width, height, width,\n                                           h_im, w_im);\n 
       *data_col_ptr = val;\n        data_col_ptr += batch_size * height_col * width_col;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void deformable_col2im_gpu_kernel(\n    const int n, const T *data_col, const T *data_offset, const int channels,\n    const int height, const int width, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int deformable_group, const int height_col, const int width_col,\n    T *grad_im) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    const int j = (index / width_col / height_col / batch_size) % kernel_w;\n    const int i =\n        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;\n    const int c =\n        index / width_col / height_col / batch_size / kernel_w / kernel_h;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / channel_per_deformable_group;\n\n    int w_out = index % width_col;\n    int h_out = (index / width_col) % height_col;\n    int b = (index / width_col / height_col) % batch_size;\n    int w_in = w_out * stride_w - pad_w;\n    int h_in = h_out * stride_h - pad_h;\n\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const int data_offset_h_ptr =\n        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;\n    const int data_offset_w_ptr =\n        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;\n    const T offset_h = data_offset_ptr[data_offset_h_ptr];\n    const T offset_w = data_offset_ptr[data_offset_w_ptr];\n    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;\n    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;\n\n    const T cur_top_grad = 
data_col[index];\n    const int cur_h = (int)cur_inv_h_data;\n    const int cur_w = (int)cur_inv_w_data;\n    for (int dy = -2; dy <= 2; dy++) {\n      for (int dx = -2; dx <= 2; dx++) {\n        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&\n            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&\n            abs(cur_inv_w_data - (cur_w + dx)) < 1) {\n          int cur_bottom_grad_pos =\n              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;\n          T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,\n                                         cur_h + dy, cur_w + dx, height, width);\n          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void deformable_col2im_coord_gpu_kernel(\n    const int n, const T *data_col, const T *data_im, const T *data_offset,\n    const int channels, const int height, const int width, const int kernel_h,\n    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,\n    const int stride_w, const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int offset_channels, const int deformable_group, const int height_col,\n    const int width_col, T *grad_offset) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    T val = 0;\n    int w = index % width_col;\n    int h = (index / width_col) % height_col;\n    int c = (index / width_col / height_col) % offset_channels;\n    int b = (index / width_col / height_col) / offset_channels;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / (2 * kernel_h * kernel_w);\n    const int col_step = kernel_h * kernel_w;\n    int cnt = 0;\n    const T *data_col_ptr = data_col + deformable_group_index *\n                                           channel_per_deformable_group *\n                                           batch_size * 
width_col * height_col;\n    const T *data_im_ptr =\n        data_im + (b * deformable_group + deformable_group_index) *\n                      channel_per_deformable_group / kernel_h / kernel_w *\n                      height * width;\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;\n\n    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;\n         col_c += col_step) {\n      const int col_pos =\n          (((col_c * batch_size + b) * height_col) + h) * width_col + w;\n      const int bp_dir = offset_c % 2;\n\n      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;\n      int i =\n          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;\n      int w_out = col_pos % width_col;\n      int h_out = (col_pos / width_col) % height_col;\n      int w_in = w_out * stride_w - pad_w;\n      int h_in = h_out * stride_h - pad_h;\n      const int data_offset_h_ptr =\n          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);\n      const int data_offset_w_ptr =\n          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +\n           w_out);\n      const T offset_h = data_offset_ptr[data_offset_h_ptr];\n      const T offset_w = data_offset_ptr[data_offset_w_ptr];\n      T inv_h = h_in + i * dilation_h + offset_h;\n      T inv_w = w_in + j * dilation_w + offset_w;\n      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)\n        inv_h = inv_w = -2;\n      const T weight = get_coordinate_weight(inv_h, inv_w, height, width,\n                                             data_im_ptr + cnt * height * width,\n                                             width, bp_dir);\n      val += weight * data_col_ptr[col_pos];\n      cnt += 1;\n    }\n\n    
grad_offset[index] = val;\n  }\n}\n\n#endif  // DEFORM_CONV_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/deform_roi_pool_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef DEFORM_ROI_POOL_MUSA_KERNEL_MUH\n#define DEFORM_ROI_POOL_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\ntemplate <typename T>\n__global__ void deform_roi_pool_forward_musa_kernel(\n    const int nthreads, const T* input, const T* rois, const T* offset,\n    T* output, const int pooled_height, const int pooled_width,\n    const T spatial_scale, const int sampling_ratio, const T gamma,\n    const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;\n    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;\n    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;\n    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    const T* offset_input =\n        input + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_height / pooled_height));\n    int roi_bin_grid_w =\n        (sampling_ratio > 0)\n            ? 
sampling_ratio\n            : static_cast<int>(ceilf(roi_width / pooled_width));\n\n    // Compute roi offset\n    if (offset != NULL) {\n      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +\n                              ph * pooled_width + pw;\n      T offset_roi_w = gamma * roi_width * offset_cur_w[0];\n      T offset_roi_h =\n          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];\n      roi_start_w += offset_roi_w;\n      roi_start_h += offset_roi_h;\n    }\n\n    // We do average pooling inside a bin\n    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);\n    T output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n      const T y = roi_start_h + ph * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = roi_start_w + pw * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n        T val = bilinear_interpolate(offset_input, height, width, y, x, index);\n        output_val += val;\n      }\n    }\n    output[index] = output_val / count;\n  }\n}\n\ntemplate <typename T>\n__global__ void deform_roi_pool_backward_musa_kernel(\n    const int nthreads, const T* grad_output, const T* input, const T* rois,\n    const T* offset, T* grad_input, T* grad_offset, const int pooled_height,\n    const int pooled_width, const T spatial_scale, const int sampling_ratio,\n    const T gamma, const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + 
n * 5;\n    int roi_batch_ind = offset_rois[0];\n    const T* offset_input =\n        input + ((roi_batch_ind * channels + c) * height * width);\n    T* offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    // Do not use rounding; this implementation detail is critical\n    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;\n    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;\n    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;\n    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_height / pooled_height));\n    int roi_bin_grid_w =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_width / pooled_width));\n\n    // Compute roi offset\n    if (offset != NULL) {\n      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +\n                              ph * pooled_width + pw;\n      T offset_roi_w = gamma * roi_width * offset_cur_w[0];\n      T offset_roi_h =\n          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];\n      roi_start_w += offset_roi_w;\n      roi_start_h += offset_roi_h;\n    }\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n    const T grad_output_this_bin = grad_output[index] / count;\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n      const T y = roi_start_h + ph * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = roi_start_w + pw * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_grad_input + y_low * width + x_low,\n                    grad_output_this_bin * w1);\n          atomicAdd(offset_grad_input + y_low * width + x_high,\n                    grad_output_this_bin * w2);\n          atomicAdd(offset_grad_input + y_high * width + x_low,\n                    grad_output_this_bin * w3);\n          atomicAdd(offset_grad_input + y_high * width + x_high,\n                    grad_output_this_bin * w4);\n          if (offset != NULL) {\n            T input_00 = offset_input[y_low * width + x_low];\n            T input_10 = offset_input[y_low * width + x_high];\n            T input_01 = offset_input[y_high * width + x_low];\n            T input_11 = offset_input[y_high * width + x_high];\n            T ogx = gamma * roi_width * grad_output_this_bin *\n                    (input_11 * (y - y_low) + input_10 * (y_high - y) +\n                     input_01 * (y_low - y) + input_00 * (y - y_high));\n            T ogy = gamma * roi_height * grad_output_this_bin *\n                    (input_11 * (x - x_low) + input_01 * (x_high - x) +\n                     input_10 * (x_low - x) + input_00 * (x - x_high));\n            
atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +\n                          ph * pooled_width + pw,\n                      ogx);\n            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +\n                          pooled_width * pooled_height + ph * pooled_width + pw,\n                      ogy);\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif  // DEFORM_ROI_POOL_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/diff_iou_rotated_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Adapted from\n// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa\n#include \"pytorch_musa_helper.hpp\"\n\n#define MAX_NUM_VERT_IDX 9\n#define INTERSECTION_OFFSET 8\n#define EPSILON 1e-8\n\ninline int opt_n_thread(int work_size) {\n  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);\n  return max(min(1 << pow_2, THREADS_PER_BLOCK), 1);\n}\n\n/*\ncompare normalized vertices (vertices around (0,0))\nif vertex1 < vertex2 return true.\norder: minimum at x-aixs, become larger in anti-clockwise direction\n*/\n__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {\n  if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON)\n    return false;  // if equal, return false\n\n  if (y1 > 0 && y2 < 0) return true;\n  if (y1 < 0 && y2 > 0) return false;\n\n  float n1 = x1 * x1 + y1 * y1 + EPSILON;\n  float n2 = x2 * x2 + y2 * y2 + EPSILON;\n  float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2;\n\n  if (y1 > 0 && y2 > 0) {\n    if (diff > EPSILON)\n      return true;\n    else\n      return false;\n  }\n  if (y1 < 0 && y2 < 0) {\n    if (diff < EPSILON)\n      return true;\n    else\n      return false;\n  }\n  return false;\n}\n\n__global__ void diff_iou_rotated_sort_vertices_forward_musa_kernel(\n    int b, int n, int m, const float *__restrict__ vertices,\n    const bool *__restrict__ mask, const int *__restrict__ num_valid,\n    int *__restrict__ idx) {\n  int batch_idx = blockIdx.x;\n  vertices += batch_idx * n * m * 2;\n  mask += batch_idx * n * m;\n  num_valid += batch_idx * n;\n  idx += batch_idx * n * MAX_NUM_VERT_IDX;\n\n  int index = threadIdx.x;  // index of polygon\n  int stride = blockDim.x;\n  for (int i = index; i < n; i += stride) {\n    int pad;  // index of arbitrary invalid intersection point (not box corner!)\n    for (int j = INTERSECTION_OFFSET; j < m; ++j) {\n      if (!mask[i * m + j]) {\n        pad = j;\n        
break;\n      }\n    }\n    if (num_valid[i] < 3) {\n      // not enough vertices, take an invalid intersection point\n      // (zero padding)\n      for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {\n        idx[i * MAX_NUM_VERT_IDX + j] = pad;\n      }\n    } else {\n      // sort the valid vertices\n      // note the number of valid vertices is known\n      // note: check that num_valid[i] < MAX_NUM_VERT_IDX\n      for (int j = 0; j < num_valid[i]; ++j) {\n        // initialize with a \"big\" value\n        float x_min = 1;\n        float y_min = -EPSILON;\n        int i_take = 0;\n        int i2;\n        float x2, y2;\n        if (j != 0) {\n          i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];\n          x2 = vertices[i * m * 2 + i2 * 2 + 0];\n          y2 = vertices[i * m * 2 + i2 * 2 + 1];\n        }\n        for (int k = 0; k < m; ++k) {\n          float x = vertices[i * m * 2 + k * 2 + 0];\n          float y = vertices[i * m * 2 + k * 2 + 1];\n          if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {\n            if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {\n              x_min = x;\n              y_min = y;\n              i_take = k;\n            }\n          }\n        }\n        idx[i * MAX_NUM_VERT_IDX + j] = i_take;\n      }\n      // duplicate the first idx\n      idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];\n\n      // pad zeros\n      for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {\n        idx[i * MAX_NUM_VERT_IDX + j] = pad;\n      }\n\n      // for corner case: the two boxes are exactly the same.\n      // in this case, idx would have duplicate elements, which makes the\n      // shoelace formula broken because of the definition, the duplicate\n      // elements only appear in the first 8 positions (they are \"corners in\n      // box\", not \"intersection of edges\")\n      if (num_valid[i] == 8) {\n        int counter = 0;\n        for (int j = 0; j < 4; ++j) {\n          int 
check = idx[i * MAX_NUM_VERT_IDX + j];\n          for (int k = 4; k < INTERSECTION_OFFSET; ++k) {\n            if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;\n          }\n        }\n        if (counter == 4) {\n          idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];\n          for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {\n            idx[i * MAX_NUM_VERT_IDX + j] = pad;\n          }\n        }\n      }\n\n      // TODO: still might need to cover some other corner cases :(\n    }\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/furthest_point_sample_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH\n#define FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,\n                         int idx1, int idx2) {\n  const float v1 = dists[idx1], v2 = dists[idx2];\n  const int i1 = dists_i[idx1], i2 = dists_i[idx2];\n  dists[idx1] = max(v1, v2);\n  dists_i[idx1] = v2 > v1 ? i2 : i1;\n}\n\ntemplate <unsigned int block_size>\n__global__ void furthest_point_sampling_forward_musa_kernel(\n    int b, int n, int m, const float *__restrict__ dataset,\n    float *__restrict__ temp, int *__restrict__ idxs) {\n  // dataset: (B, N, 3)\n  // tmp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  if (m <= 0) return;\n  __shared__ float dists[block_size];\n  __shared__ int dists_i[block_size];\n\n  int batch_index = blockIdx.x;\n  dataset += batch_index * n * 3;\n  temp += batch_index * n;\n  idxs += batch_index * m;\n\n  int tid = threadIdx.x;\n  const int stride = block_size;\n\n  int old = 0;\n  if (threadIdx.x == 0) idxs[0] = old;\n\n  __syncthreads();\n  for (int j = 1; j < m; j++) {\n    int besti = 0;\n    float best = -1;\n    float x1 = dataset[old * 3 + 0];\n    float y1 = dataset[old * 3 + 1];\n    float z1 = dataset[old * 3 + 2];\n    for (int k = tid; k < n; k += stride) {\n      float x2, y2, z2;\n      x2 = dataset[k * 3 + 0];\n      y2 = dataset[k * 3 + 1];\n      z2 = dataset[k * 3 + 2];\n      // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);\n      // if (mag <= 1e-3)\n      // continue;\n\n      float d =\n          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);\n      float d2 = min(d, temp[k]);\n      temp[k] = d2;\n      besti = d2 > best ? k : besti;\n      best = d2 > best ? 
d2 : best;\n    }\n    dists[tid] = best;\n    dists_i[tid] = besti;\n    __syncthreads();\n\n#pragma unroll\n    for (int block_size_thres = 1024; block_size_thres >= 2;\n         block_size_thres >>= 1) {\n      const int tid_thres = block_size_thres / 2;\n      if (block_size >= block_size_thres && tid < tid_thres) {\n        __update(dists, dists_i, tid, tid + tid_thres);\n      }\n      __syncthreads();\n    }\n\n    old = dists_i[0];\n    if (tid == 0) idxs[j] = old;\n  }\n}\n\n// Modified from\n// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu\ntemplate <unsigned int block_size>\n__global__ void furthest_point_sampling_with_dist_forward_musa_kernel(\n    int b, int n, int m, const float *__restrict__ dataset,\n    float *__restrict__ temp, int *__restrict__ idxs) {\n  // dataset: (B, N, N)\n  // tmp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  if (m <= 0) return;\n  __shared__ float dists[block_size];\n  __shared__ int dists_i[block_size];\n\n  int batch_index = blockIdx.x;\n  dataset += batch_index * n * n;\n  temp += batch_index * n;\n  idxs += batch_index * m;\n\n  int tid = threadIdx.x;\n  const int stride = block_size;\n\n  int old = 0;\n  if (threadIdx.x == 0) idxs[0] = old;\n\n  __syncthreads();\n  for (int j = 1; j < m; j++) {\n    int besti = 0;\n    float best = -1;\n    // float x1 = dataset[old * 3 + 0];\n    // float y1 = dataset[old * 3 + 1];\n    // float z1 = dataset[old * 3 + 2];\n    for (int k = tid; k < n; k += stride) {\n      // float x2, y2, z2;\n      // x2 = dataset[k * 3 + 0];\n      // y2 = dataset[k * 3 + 1];\n      // z2 = dataset[k * 3 + 2];\n\n      // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) *\n      // (z2 - z1);\n      float d = dataset[old * n + k];\n\n      float d2 = min(d, temp[k]);\n      temp[k] = d2;\n      besti = d2 > best ? k : besti;\n      best = d2 > best ? 
d2 : best;\n    }\n    dists[tid] = best;\n    dists_i[tid] = besti;\n    __syncthreads();\n\n#pragma unroll\n    for (int block_size_thres = 1024; block_size_thres >= 2;\n         block_size_thres >>= 1) {\n      const int tid_thres = block_size_thres / 2;\n      if (block_size >= block_size_thres && tid < tid_thres) {\n        __update(dists, dists_i, tid, tid + tid_thres);\n      }\n      __syncthreads();\n    }\n\n    old = dists_i[0];\n    if (tid == 0) idxs[j] = old;\n  }\n}\n\n#endif  // FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/gather_points_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef GATHER_POINTS_MUSA_KERNEL_MUH\n#define GATHER_POINTS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n#define TOTAL_THREADS 1024\n\ntemplate <typename T>\n__global__ void gather_points_forward_musa_kernel(int b, int c, int n, int m,\n                                                  const T *points,\n                                                  const int *__restrict__ idx,\n                                                  T *out) {\n  // points: (B, C, N)\n  // idx: (B, M)\n  // output:\n  //      out: (B, C, M)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    out += bs_idx * c * m + c_idx * m + pt_idx;\n    idx += bs_idx * m + pt_idx;\n    points += bs_idx * c * n + c_idx * n;\n    out[0] = points[idx[0]];\n  }\n}\n\ntemplate <typename T>\n__global__ void gather_points_backward_musa_kernel(int b, int c, int n, int m,\n                                                   const T *grad_out,\n                                                   const int *__restrict__ idx,\n                                                   T *grad_points) {\n  // grad_out: (B, C, M)\n  // idx: (B, M)\n  // output:\n  //      grad_points: (B, C, N)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    grad_out += bs_idx * c * m + c_idx * m + pt_idx;\n    idx += bs_idx * m + pt_idx;\n    grad_points += bs_idx * c * n + c_idx * n;\n\n    atomicAdd(grad_points + idx[0], grad_out[0]);\n  }\n}\n\n#endif  // GATHER_POINTS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/group_points_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#ifndef GROUP_POINTS_MUSA_KERNEL_MUH\n#define GROUP_POINTS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void group_points_forward_musa_kernel(int b, int c, int n,\n                                                 int npoints, int nsample,\n                                                 const T *points,\n                                                 const int *__restrict__ idx,\n                                                 T *out) {\n  // points: (B, C, N)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      out: (B, C, npoints, nsample)\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(index, npoints * nsample) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    int pt_idx = index / nsample;\n    int sample_idx = index % nsample;\n\n    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;\n    int in_idx = bs_idx * c * n + c_idx * n + idx[0];\n    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +\n                  pt_idx * nsample + sample_idx;\n\n    out[out_idx] = points[in_idx];\n  }\n}\n\ntemplate <typename T>\n__global__ void group_points_backward_musa_kernel(int b, int c, int n,\n                                                  int npoints, int nsample,\n                                                  const T *grad_out,\n                                                  const int *__restrict__ idx,\n                                                  T *grad_points) {\n  // grad_out: (B, C, npoints, nsample)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      grad_points: (B, C, N)\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(index, npoints * nsample) {\n    int pt_idx = index / nsample;\n    if (bs_idx >= b || 
c_idx >= c) return;\n\n    int sample_idx = index % nsample;\n    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +\n                pt_idx * nsample + sample_idx;\n    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;\n\n    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);\n  }\n}\n\n#endif  // GROUP_POINTS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/iou3d_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef IOU3D_MUSA_KERNEL_MUH\n#define IOU3D_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\nconst int THREADS_PER_BLOCK_IOU3D = 16;\nconst int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;\n__device__ const float EPS = 1e-8;\n\nstruct Point {\n  float x, y;\n  __device__ Point() {}\n  __device__ Point(double _x, double _y) { x = _x, y = _y; }\n\n  __device__ void set(float _x, float _y) {\n    x = _x;\n    y = _y;\n  }\n\n  __device__ Point operator+(const Point &b) const {\n    return Point(x + b.x, y + b.y);\n  }\n\n  __device__ Point operator-(const Point &b) const {\n    return Point(x - b.x, y - b.y);\n  }\n};\n\n__device__ inline float cross(const Point &a, const Point &b) {\n  return a.x * b.y - a.y * b.x;\n}\n\n__device__ inline float cross(const Point &p1, const Point &p2,\n                              const Point &p0) {\n  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);\n}\n\n__device__ int check_rect_cross(const Point &p1, const Point &p2,\n                                const Point &q1, const Point &q2) {\n  int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&\n            min(q1.x, q2.x) <= max(p1.x, p2.x) &&\n            min(p1.y, p2.y) <= max(q1.y, q2.y) &&\n            min(q1.y, q2.y) <= max(p1.y, p2.y);\n  return ret;\n}\n\n__device__ inline int check_in_box2d(const float *box, const Point &p) {\n  // params: box (7) [x, y, z, dx, dy, dz, heading]\n  const float MARGIN = 1e-2;\n\n  float center_x = box[0], center_y = box[1];\n  // rotate the point in the opposite direction of box\n  float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);\n  float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);\n  float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;\n\n  return (fabs(rot_x) < box[3] / 2 + MARGIN &&\n          fabs(rot_y) < box[4] / 2 + MARGIN);\n}\n\n__device__ inline int intersection(const Point 
&p1, const Point &p0,\n                                   const Point &q1, const Point &q0,\n                                   Point &ans_point) {\n  // fast exclusion\n  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;\n\n  // check cross standing\n  float s1 = cross(q0, p1, p0);\n  float s2 = cross(p1, q1, p0);\n  float s3 = cross(p0, q1, q0);\n  float s4 = cross(q1, p1, q0);\n\n  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;\n\n  // calculate intersection of two lines\n  float s5 = cross(q1, p1, p0);\n  if (fabs(s5 - s1) > EPS) {\n    ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);\n    ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);\n\n  } else {\n    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;\n    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;\n    float D = a0 * b1 - a1 * b0;\n\n    ans_point.x = (b0 * c1 - b1 * c0) / D;\n    ans_point.y = (a1 * c0 - a0 * c1) / D;\n  }\n\n  return 1;\n}\n\n__device__ inline void rotate_around_center(const Point &center,\n                                            const float angle_cos,\n                                            const float angle_sin, Point &p) {\n  float new_x =\n      (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x;\n  float new_y =\n      (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;\n  p.set(new_x, new_y);\n}\n\n__device__ inline int point_cmp(const Point &a, const Point &b,\n                                const Point &center) {\n  return atan2(a.y - center.y, a.x - center.x) >\n         atan2(b.y - center.y, b.x - center.x);\n}\n\n__device__ inline float box_overlap(const float *box_a, const float *box_b) {\n  // params box_a: [x, y, z, dx, dy, dz, heading]\n  // params box_b: [x, y, z, dx, dy, dz, heading]\n\n  float a_angle = box_a[6], b_angle = box_b[6];\n  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,\n        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;\n  float 
a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;\n  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;\n  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;\n  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;\n\n  Point center_a(box_a[0], box_a[1]);\n  Point center_b(box_b[0], box_b[1]);\n\n  Point box_a_corners[5];\n  box_a_corners[0].set(a_x1, a_y1);\n  box_a_corners[1].set(a_x2, a_y1);\n  box_a_corners[2].set(a_x2, a_y2);\n  box_a_corners[3].set(a_x1, a_y2);\n\n  Point box_b_corners[5];\n  box_b_corners[0].set(b_x1, b_y1);\n  box_b_corners[1].set(b_x2, b_y1);\n  box_b_corners[2].set(b_x2, b_y2);\n  box_b_corners[3].set(b_x1, b_y2);\n\n  // get oriented corners\n  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);\n  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);\n\n  for (int k = 0; k < 4; k++) {\n    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);\n    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);\n  }\n\n  box_a_corners[4] = box_a_corners[0];\n  box_b_corners[4] = box_b_corners[0];\n\n  // get intersection of lines\n  Point cross_points[16];\n  Point poly_center;\n  int cnt = 0, flag = 0;\n\n  poly_center.set(0, 0);\n  for (int i = 0; i < 4; i++) {\n    for (int j = 0; j < 4; j++) {\n      flag = intersection(box_a_corners[i + 1], box_a_corners[i],\n                          box_b_corners[j + 1], box_b_corners[j],\n                          cross_points[cnt]);\n      if (flag) {\n        poly_center = poly_center + cross_points[cnt];\n        cnt++;\n      }\n    }\n  }\n\n  // check corners\n  for (int k = 0; k < 4; k++) {\n    if (check_in_box2d(box_a, box_b_corners[k])) {\n      poly_center = poly_center + box_b_corners[k];\n      cross_points[cnt] = box_b_corners[k];\n      cnt++;\n    }\n    if (check_in_box2d(box_b, box_a_corners[k])) {\n      poly_center = poly_center + box_a_corners[k];\n      cross_points[cnt] = 
box_a_corners[k];\n      cnt++;\n    }\n  }\n\n  poly_center.x /= cnt;\n  poly_center.y /= cnt;\n\n  // sort the points of polygon\n  Point temp;\n  for (int j = 0; j < cnt - 1; j++) {\n    for (int i = 0; i < cnt - j - 1; i++) {\n      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {\n        temp = cross_points[i];\n        cross_points[i] = cross_points[i + 1];\n        cross_points[i + 1] = temp;\n      }\n    }\n  }\n\n  // get the overlap areas\n  float area = 0;\n  for (int k = 0; k < cnt - 1; k++) {\n    area += cross(cross_points[k] - cross_points[0],\n                  cross_points[k + 1] - cross_points[0]);\n  }\n\n  return fabs(area) / 2.0;\n}\n\n__device__ inline float iou_bev(const float *box_a, const float *box_b) {\n  // params box_a: [x, y, z, dx, dy, dz, heading]\n  // params box_b: [x, y, z, dx, dy, dz, heading]\n  float sa = box_a[3] * box_a[4];\n  float sb = box_b[3] * box_b[4];\n  float s_overlap = box_overlap(box_a, box_b);\n  return s_overlap / fmaxf(sa + sb - s_overlap, EPS);\n}\n\n__global__ void iou3d_boxes_overlap_bev_forward_musa_kernel(\n    const int num_a, const float *boxes_a, const int num_b,\n    const float *boxes_b, float *ans_overlap) {\n  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]\n  MUSA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {\n    if (a_idx >= num_a || b_idx >= num_b) {\n      return;\n    }\n\n    const float *cur_box_a = boxes_a + a_idx * 7;\n    const float *cur_box_b = boxes_b + b_idx * 7;\n    float cur_overlap = box_overlap(cur_box_a, cur_box_b);\n    ans_overlap[a_idx * num_b + b_idx] = cur_overlap;\n  }\n}\n\n__global__ void iou3d_nms3d_forward_musa_kernel(const int boxes_num,\n                                                const float nms_overlap_thresh,\n                                                const float *boxes,\n                                                unsigned long long *mask) {\n  // params: boxes 
(N, 7) [x, y, z, dx, dy, dz, heading]\n  // params: mask (N, N/THREADS_PER_BLOCK_NMS)\n  const int blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {\n    // if (row_start > col_start) return;\n\n    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n\n    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];\n\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 7 + 0] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];\n      block_boxes[threadIdx.x * 7 + 1] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];\n      block_boxes[threadIdx.x * 7 + 2] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];\n      block_boxes[threadIdx.x * 7 + 3] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];\n      block_boxes[threadIdx.x * 7 + 4] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];\n      block_boxes[threadIdx.x * 7 + 5] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];\n      block_boxes[threadIdx.x * 7 + 6] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;\n      const float *cur_box = boxes + cur_box_idx * 7;\n\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {\n          t |= 1ULL << 
i;\n        }\n      }\n      const int col_blocks =\n          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n      mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n__device__ inline float iou_normal(float const *const a, float const *const b) {\n  // params: a: [x, y, z, dx, dy, dz, heading]\n  // params: b: [x, y, z, dx, dy, dz, heading]\n\n  float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),\n        right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);\n  float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),\n        bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);\n  float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);\n  float interS = width * height;\n  float Sa = a[3] * a[4];\n  float Sb = b[3] * b[4];\n  return interS / fmaxf(Sa + Sb - interS, EPS);\n}\n\n__global__ void iou3d_nms3d_normal_forward_musa_kernel(\n    const int boxes_num, const float nms_overlap_thresh, const float *boxes,\n    unsigned long long *mask) {\n  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params: mask (N, N/THREADS_PER_BLOCK_NMS)\n\n  const int blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {\n    // if (row_start > col_start) return;\n\n    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,\n                               THREADS_PER_BLOCK_NMS);\n\n    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];\n\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 7 + 0] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];\n      block_boxes[threadIdx.x * 7 + 1] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];\n      block_boxes[threadIdx.x * 7 + 2] =\n          
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];\n      block_boxes[threadIdx.x * 7 + 3] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];\n      block_boxes[threadIdx.x * 7 + 4] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];\n      block_boxes[threadIdx.x * 7 + 5] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];\n      block_boxes[threadIdx.x * 7 + 6] =\n          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;\n      const float *cur_box = boxes + cur_box_idx * 7;\n\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks =\n          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n      mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n#endif  // IOU3D_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/knn_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap\n#ifndef KNN_MUSA_KERNEL_MUH\n#define KNN_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\ninline __device__ void swap_float(float *x, float *y) {\n  float tmp = *x;\n  *x = *y;\n  *y = tmp;\n}\n\ninline __device__ void swap_int(int *x, int *y) {\n  int tmp = *x;\n  *x = *y;\n  *y = tmp;\n}\n\n__device__ void reheap(float *dist, int *idx, int k) {\n  int root = 0;\n  int child = root * 2 + 1;\n  while (child < k) {\n    if (child + 1 < k && dist[child + 1] > dist[child]) child++;\n    if (dist[root] > dist[child]) return;\n    swap_float(&dist[root], &dist[child]);\n    swap_int(&idx[root], &idx[child]);\n    root = child;\n    child = root * 2 + 1;\n  }\n}\n\n__device__ void heap_sort(float *dist, int *idx, int k) {\n  int i;\n  for (i = k - 1; i > 0; i--) {\n    swap_float(&dist[0], &dist[i]);\n    swap_int(&idx[0], &idx[i]);\n    reheap(dist, idx, i);\n  }\n}\n\n// input: xyz (b, n, 3) new_xyz (b, m, 3)\n// output: idx (b, m, nsample) dist2 (b, m, nsample)\ntemplate <typename T>\n__global__ void knn_forward_musa_kernel(int b, int n, int m, int nsample,\n                                        const T *xyz, const T *new_xyz,\n                                        int *__restrict__ idx, T *dist2) {\n  int bs_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, m) {\n    if (bs_idx >= b) return;\n\n    new_xyz += bs_idx * m * 3 + pt_idx * 3;\n    xyz += bs_idx * n * 3;\n    idx += bs_idx * m * nsample + pt_idx * nsample;\n    dist2 += bs_idx * m * nsample + pt_idx * nsample;\n\n    T new_x = new_xyz[0];\n    T new_y = new_xyz[1];\n    T new_z = new_xyz[2];\n\n    float best_dist[100];\n    int best_idx[100];\n    for (int i = 0; i < nsample; i++) {\n      best_dist[i] = 1e10;\n      best_idx[i] = 0;\n    }\n    for (int i = 0; i < n; i++) {\n      T x = xyz[i * 3 + 0];\n      T y = xyz[i * 
3 + 1];\n      T z = xyz[i * 3 + 2];\n      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +\n             (new_z - z) * (new_z - z);\n      if (d2 < best_dist[0]) {\n        best_dist[0] = d2;\n        best_idx[0] = i;\n        reheap(best_dist, best_idx, nsample);\n      }\n    }\n    heap_sort(best_dist, best_idx, nsample);\n    for (int i = 0; i < nsample; i++) {\n      idx[i] = best_idx[i];\n      dist2[i] = best_dist[i];\n    }\n  }\n}\n\n#endif  // KNN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/masked_conv2d_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MASKED_CONV2D_MUSA_KERNEL_MUH\n#define MASKED_CONV2D_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename scalar_t>\n__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,\n                                    const int height, const int width,\n                                    const int kernel_h, const int kernel_w,\n                                    const int pad_h, const int pad_w,\n                                    const int64_t *mask_h_idx,\n                                    const int64_t *mask_w_idx,\n                                    const int mask_cnt, scalar_t *data_col) {\n  // mask_cnt * channels\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    const int m_index = index % mask_cnt;\n    const int h_col = mask_h_idx[m_index];\n    const int w_col = mask_w_idx[m_index];\n    const int c_im = index / mask_cnt;\n    const int c_col = c_im * kernel_h * kernel_w;\n    const int h_offset = h_col - pad_h;\n    const int w_offset = w_col - pad_w;\n    scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;\n    for (int i = 0; i < kernel_h; ++i) {\n      int h_im = h_offset + i;\n      for (int j = 0; j < kernel_w; ++j) {\n        int w_im = w_offset + j;\n        if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {\n          *data_col_ptr =\n              (scalar_t)data_im[(c_im * height + h_im) * width + w_im];\n        } else {\n          *data_col_ptr = 0.0;\n        }\n        data_col_ptr += mask_cnt;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,\n                                    const int height, const int width,\n                                    const int channels,\n                                    const int64_t *mask_h_idx,\n                                    const int64_t *mask_w_idx,\n                                    
const int mask_cnt, scalar_t *data_im) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    const int m_index = index % mask_cnt;\n    const int h_im = mask_h_idx[m_index];\n    const int w_im = mask_w_idx[m_index];\n    const int c_im = index / mask_cnt;\n    // scatter the column value back to its masked (h_im, w_im) location\n    data_im[(c_im * height + h_im) * width + w_im] = data_col[index];\n  }\n}\n\n#endif  // MASKED_CONV2D_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/min_area_polygons_musa.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MIN_AREA_POLYGONS_MUSA_KERNEL_MUH\n#define MIN_AREA_POLYGONS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n#define MAXN 20\n__device__ const float PI = 3.1415926;\n\nstruct Point {\n  float x, y;\n  __device__ Point() {}\n  __device__ Point(float x, float y) : x(x), y(y) {}\n};\n\n__device__ inline void swap1(Point *a, Point *b) {\n  Point temp;\n  temp.x = a->x;\n  temp.y = a->y;\n\n  a->x = b->x;\n  a->y = b->y;\n\n  b->x = temp.x;\n  b->y = temp.y;\n}\n__device__ inline float cross(Point o, Point a, Point b) {\n  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);\n}\n\n__device__ inline float dis(Point a, Point b) {\n  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);\n}\n__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {\n  float convex_points[2][MAXN];\n  for (int j = 0; j < n_points; j++) {\n    convex_points[0][j] = ps[j].x;\n  }\n  for (int j = 0; j < n_points; j++) {\n    convex_points[1][j] = ps[j].y;\n  }\n\n  Point edges[MAXN];\n  float edges_angles[MAXN];\n  float unique_angles[MAXN];\n  int n_edges = n_points - 1;\n  int n_unique = 0;\n  int unique_flag = 0;\n\n  for (int i = 0; i < n_edges; i++) {\n    edges[i].x = ps[i + 1].x - ps[i].x;\n    edges[i].y = ps[i + 1].y - ps[i].y;\n  }\n  for (int i = 0; i < n_edges; i++) {\n    edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);\n    if (edges_angles[i] >= 0) {\n      edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);\n    } else {\n      edges_angles[i] =\n          edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);\n    }\n  }\n  unique_angles[0] = edges_angles[0];\n  n_unique += 1;\n  for (int i = 1; i < n_edges; i++) {\n    for (int j = 0; j < n_unique; j++) {\n      if (edges_angles[i] == unique_angles[j]) {\n        unique_flag += 1;\n      }\n    }\n    if (unique_flag == 0) {\n      unique_angles[n_unique] = 
edges_angles[i];\n      n_unique += 1;\n      unique_flag = 0;\n    } else {\n      unique_flag = 0;\n    }\n  }\n\n  float minarea = 1e12;\n  for (int i = 0; i < n_unique; i++) {\n    float R[2][2];\n    float rot_points[2][MAXN];\n    R[0][0] = cos(unique_angles[i]);\n    R[0][1] = sin(unique_angles[i]);\n    R[1][0] = -sin(unique_angles[i]);\n    R[1][1] = cos(unique_angles[i]);\n    // R x Points\n    for (int m = 0; m < 2; m++) {\n      for (int n = 0; n < n_points; n++) {\n        float sum = 0.0;\n        for (int k = 0; k < 2; k++) {\n          sum = sum + R[m][k] * convex_points[k][n];\n        }\n        rot_points[m][n] = sum;\n      }\n    }\n\n    // xmin;\n    float xmin, ymin, xmax, ymax;\n    xmin = 1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {\n        continue;\n      } else {\n        if (rot_points[0][j] < xmin) {\n          xmin = rot_points[0][j];\n        }\n      }\n    }\n    // ymin\n    ymin = 1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {\n        continue;\n      } else {\n        if (rot_points[1][j] < ymin) {\n          ymin = rot_points[1][j];\n        }\n      }\n    }\n    // xmax\n    xmax = -1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {\n        continue;\n      } else {\n        if (rot_points[0][j] > xmax) {\n          xmax = rot_points[0][j];\n        }\n      }\n    }\n    // ymax\n    ymax = -1e12;\n    for (int j = 0; j < n_points; j++) {\n      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {\n        continue;\n      } else {\n        if (rot_points[1][j] > ymax) {\n          ymax = rot_points[1][j];\n        }\n      }\n    }\n    float area = (xmax - xmin) * (ymax - ymin);\n    if (area < minarea) {\n      minarea = area;\n      minbox[0] = unique_angles[i];\n      minbox[1] = xmin;\n      minbox[2] = ymin;\n      
minbox[3] = xmax;\n      minbox[4] = ymax;\n    }\n  }\n}\n\n// convex_find\n__device__ inline void Jarvis(Point *in_poly, int &n_poly) {\n  int n_input = n_poly;\n  Point input_poly[20];\n  for (int i = 0; i < n_input; i++) {\n    input_poly[i].x = in_poly[i].x;\n    input_poly[i].y = in_poly[i].y;\n  }\n  Point p_max, p_k;\n  int max_index, k_index;\n  int Stack[20], top1, top2;\n  // float sign;\n  double sign;\n  Point right_point[10], left_point[10];\n\n  for (int i = 0; i < n_poly; i++) {\n    if (in_poly[i].y < in_poly[0].y ||\n        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {\n      Point *j = &(in_poly[0]);\n      Point *k = &(in_poly[i]);\n      swap1(j, k);\n    }\n    if (i == 0) {\n      p_max = in_poly[0];\n      max_index = 0;\n    }\n    if (in_poly[i].y > p_max.y ||\n        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {\n      p_max = in_poly[i];\n      max_index = i;\n    }\n  }\n  if (max_index == 0) {\n    max_index = 1;\n    p_max = in_poly[max_index];\n  }\n\n  k_index = 0, Stack[0] = 0, top1 = 0;\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);\n      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >\n                                         dis(in_poly[Stack[top1]], p_k)))) {\n        p_k = in_poly[i];\n        k_index = i;\n      }\n    }\n    top1++;\n    Stack[top1] = k_index;\n  }\n\n  for (int i = 0; i <= top1; i++) {\n    right_point[i] = in_poly[Stack[i]];\n  }\n\n  k_index = 0, Stack[0] = 0, top2 = 0;\n\n  while (k_index != max_index) {\n    p_k = p_max;\n    k_index = max_index;\n    for (int i = 1; i < n_poly; i++) {\n      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);\n      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >\n                                        dis(in_poly[Stack[top2]], p_k))) {\n        p_k = 
in_poly[i];\n        k_index = i;\n      }\n    }\n    top2++;\n    Stack[top2] = k_index;\n  }\n\n  for (int i = top2 - 1; i >= 0; i--) {\n    left_point[i] = in_poly[Stack[i]];\n  }\n\n  for (int i = 0; i < top1 + top2; i++) {\n    if (i <= top1) {\n      in_poly[i] = right_point[i];\n    } else {\n      in_poly[i] = left_point[top2 - (i - top1)];\n    }\n  }\n  n_poly = top1 + top2;\n}\n\ntemplate <typename T>\n__device__ inline void Findminbox(T const *const p, T *minpoints) {\n  Point ps1[MAXN];\n  Point convex[MAXN];\n  for (int i = 0; i < 9; i++) {\n    convex[i].x = p[i * 2];\n    convex[i].y = p[i * 2 + 1];\n  }\n  int n_convex = 9;\n  Jarvis(convex, n_convex);\n  int n1 = n_convex;\n  for (int i = 0; i < n1; i++) {\n    ps1[i].x = convex[i].x;\n    ps1[i].y = convex[i].y;\n  }\n  ps1[n1].x = convex[0].x;\n  ps1[n1].y = convex[0].y;\n\n  float minbbox[5] = {0};\n  minBoundingRect(ps1, n1 + 1, minbbox);\n  float angle = minbbox[0];\n  float xmin = minbbox[1];\n  float ymin = minbbox[2];\n  float xmax = minbbox[3];\n  float ymax = minbbox[4];\n  float R[2][2];\n\n  R[0][0] = cos(angle);\n  R[0][1] = sin(angle);\n  R[1][0] = -sin(angle);\n  R[1][1] = cos(angle);\n\n  minpoints[0] = xmax * R[0][0] + ymin * R[1][0];\n  minpoints[1] = xmax * R[0][1] + ymin * R[1][1];\n  minpoints[2] = xmin * R[0][0] + ymin * R[1][0];\n  minpoints[3] = xmin * R[0][1] + ymin * R[1][1];\n  minpoints[4] = xmin * R[0][0] + ymax * R[1][0];\n  minpoints[5] = xmin * R[0][1] + ymax * R[1][1];\n  minpoints[6] = xmax * R[0][0] + ymax * R[1][0];\n  minpoints[7] = xmax * R[0][1] + ymax * R[1][1];\n}\n\ntemplate <typename T>\n__global__ void min_area_polygons_musa_kernel(const int ex_n_boxes,\n                                              const T *ex_boxes, T *minbox) {\n  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {\n    const T *cur_box = ex_boxes + index * 18;\n    T *cur_min_box = minbox + index * 8;\n    Findminbox(cur_box, cur_min_box);\n  }\n}\n\n#endif  // 
MIN_AREA_POLYGONS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/modulated_deform_conv_musa_kernel.muh",
    "content": "/*!\n ******************* BEGIN Caffe Copyright Notice and Disclaimer\n *****************\n *\n * COPYRIGHT\n *\n * All contributions by the University of California:\n * Copyright (c) 2014-2017 The Regents of the University of California (Regents)\n * All rights reserved.\n *\n * All other contributions:\n * Copyright (c) 2014-2017, the respective contributors\n * All rights reserved.\n *\n * Caffe uses a shared copyright model: each contributor holds copyright over\n * their contributions to Caffe. The project versioning records all such\n * contribution and copyright details. If a contributor wants to further mark\n * their specific copyright on a particular contribution, they should indicate\n * their copyright solely in the commit message of the change when it is\n * committed.\n *\n * LICENSE\n *\n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted provided that the following conditions are met:\n *\n * 1. Redistributions of source code must retain the above copyright notice,\n *this list of conditions and the following disclaimer.\n * 2. Redistributions in binary form must reproduce the above copyright notice,\n * this list of conditions and the following disclaimer in the documentation\n * and/or other materials provided with the distribution.\n *\n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE\n *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n *\n * CONTRIBUTION AGREEMENT\n *\n * By contributing to the BVLC/caffe repository through pull-request, comment,\n * or otherwise, the contributor releases their content to the\n * license and copyright terms herein.\n *\n ***************** END Caffe Copyright Notice and Disclaimer\n *********************\n *\n * Copyright (c) 2018 Microsoft\n * Licensed under The MIT License [see LICENSE for details]\n * \\file modulated_deformable_im2col.muh\n * \\brief Function definitions of converting an image to\n * column matrix based on kernel, padding, dilation, and offset.\n * These functions are mainly used in deformable convolution operators.\n * \\ref: https://arxiv.org/abs/1703.06211\n * \\author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng\n */\n\n// modified from\n// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu\n\n#ifndef MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH\n#define MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\n\ntemplate <typename T>\n__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,\n                                  const int height, const int width, T h, T w) {\n  int h_low = floorf(h);\n  int w_low = floorf(w);\n  int h_high = h_low + 1;\n  int w_high = w_low + 1;\n\n  T lh = h - h_low;\n  T lw = w - w_low;\n  T hh = 1 - lh, hw = 1 - lw;\n\n  T v1 = 0;\n  if 
(h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];\n  T v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n    v2 = input[h_low * data_width + w_high];\n  T v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n    v3 = input[h_high * data_width + w_low];\n  T v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n    v4 = input[h_high * data_width + w_high];\n\n  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename T>\n__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,\n                                      const int w, const int height,\n                                      const int width) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n  if (h == argmax_h_low && w == argmax_w_low)\n    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);\n  if (h == argmax_h_low && w == argmax_w_high)\n    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);\n  if (h == argmax_h_high && w == argmax_w_low)\n    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);\n  if (h == argmax_h_high && w == argmax_w_high)\n    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);\n  return weight;\n}\n\ntemplate <typename T>\n__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,\n                                        const int height, const int width,\n                                        const T *im_data, const int data_width,\n                                        const int bp_dir) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = 
floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n\n  if (bp_dir == 0) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += -1 * (argmax_w - argmax_w_low) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_w - argmax_w_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  } else if (bp_dir == 1) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  }\n\n  return weight;\n}\n\ntemplate <typename T>\n__global__ void modulated_deformable_im2col_gpu_kernel(\n    const int n, const T *data_im, const T *data_offset, const T *data_mask,\n    const int height, const int width, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int 
channel_per_deformable_group, const int batch_size,\n    const int num_channels, const int deformable_group, const int height_col,\n    const int width_col, T *data_col) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    // index index of output matrix\n    const int w_col = index % width_col;\n    const int h_col = (index / width_col) % height_col;\n    const int b_col = (index / width_col / height_col) % batch_size;\n    const int c_im = (index / width_col / height_col) / batch_size;\n    const int c_col = c_im * kernel_h * kernel_w;\n\n    // compute deformable group index\n    const int deformable_group_index = c_im / channel_per_deformable_group;\n\n    const int h_in = h_col * stride_h - pad_h;\n    const int w_in = w_col * stride_w - pad_w;\n\n    T *data_col_ptr =\n        data_col +\n        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;\n    const T *data_im_ptr =\n        data_im + (b_col * num_channels + c_im) * height * width;\n    const T *data_offset_ptr =\n        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    const T *data_mask_ptr =\n        data_mask + (b_col * deformable_group + deformable_group_index) *\n                        kernel_h * kernel_w * height_col * width_col;\n\n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        const int data_offset_h_ptr =\n            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;\n        const int data_offset_w_ptr =\n            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +\n            w_col;\n        const int data_mask_hw_ptr =\n            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;\n        const T offset_h = data_offset_ptr[data_offset_h_ptr];\n        const T offset_w = data_offset_ptr[data_offset_w_ptr];\n        const T mask = data_mask_ptr[data_mask_hw_ptr];\n        T val = 
static_cast<T>(0);\n        const T h_im = h_in + i * dilation_h + offset_h;\n        const T w_im = w_in + j * dilation_w + offset_w;\n        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)\n          val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im,\n                                     w_im);\n        *data_col_ptr = val * mask;\n        data_col_ptr += batch_size * height_col * width_col;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void modulated_deformable_col2im_gpu_kernel(\n    const int n, const T *data_col, const T *data_offset, const T *data_mask,\n    const int channels, const int height, const int width, const int kernel_h,\n    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,\n    const int stride_w, const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int deformable_group, const int height_col, const int width_col,\n    T *grad_im) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    const int j = (index / width_col / height_col / batch_size) % kernel_w;\n    const int i =\n        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;\n    const int c =\n        index / width_col / height_col / batch_size / kernel_w / kernel_h;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / channel_per_deformable_group;\n\n    int w_out = index % width_col;\n    int h_out = (index / width_col) % height_col;\n    int b = (index / width_col / height_col) % batch_size;\n    int w_in = w_out * stride_w - pad_w;\n    int h_in = h_out * stride_h - pad_h;\n\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const T *data_mask_ptr =\n        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *\n                        kernel_w * 
height_col * width_col;\n    const int data_offset_h_ptr =\n        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;\n    const int data_offset_w_ptr =\n        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;\n    const int data_mask_hw_ptr =\n        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;\n    const T offset_h = data_offset_ptr[data_offset_h_ptr];\n    const T offset_w = data_offset_ptr[data_offset_w_ptr];\n    const T mask = data_mask_ptr[data_mask_hw_ptr];\n    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;\n    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;\n\n    const T cur_top_grad = data_col[index] * mask;\n    const int cur_h = (int)cur_inv_h_data;\n    const int cur_w = (int)cur_inv_w_data;\n    for (int dy = -2; dy <= 2; dy++) {\n      for (int dx = -2; dx <= 2; dx++) {\n        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&\n            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&\n            abs(cur_inv_w_data - (cur_w + dx)) < 1) {\n          int cur_bottom_grad_pos =\n              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;\n          T weight =\n              dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,\n                                       cur_h + dy, cur_w + dx, height, width);\n          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void modulated_deformable_col2im_coord_gpu_kernel(\n    const int n, const T *data_col, const T *data_im, const T *data_offset,\n    const T *data_mask, const int channels, const int height, const int width,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int channel_per_deformable_group,\n    const int batch_size, const int 
offset_channels, const int deformable_group,\n    const int height_col, const int width_col, T *grad_offset, T *grad_mask) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    T val = 0, mval = 0;\n    int w = index % width_col;\n    int h = (index / width_col) % height_col;\n    int c = (index / width_col / height_col) % offset_channels;\n    int b = (index / width_col / height_col) / offset_channels;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / (2 * kernel_h * kernel_w);\n    const int col_step = kernel_h * kernel_w;\n    int cnt = 0;\n    const T *data_col_ptr = data_col + deformable_group_index *\n                                           channel_per_deformable_group *\n                                           batch_size * width_col * height_col;\n    const T *data_im_ptr =\n        data_im + (b * deformable_group + deformable_group_index) *\n                      channel_per_deformable_group / kernel_h / kernel_w *\n                      height * width;\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const T *data_mask_ptr =\n        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *\n                        kernel_w * height_col * width_col;\n\n    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;\n\n    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;\n         col_c += col_step) {\n      const int col_pos =\n          (((col_c * batch_size + b) * height_col) + h) * width_col + w;\n      const int bp_dir = offset_c % 2;\n\n      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;\n      int i =\n          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;\n      int w_out = col_pos % width_col;\n      int h_out = (col_pos / width_col) % height_col;\n      int w_in = w_out * 
stride_w - pad_w;\n      int h_in = h_out * stride_h - pad_h;\n      const int data_offset_h_ptr =\n          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);\n      const int data_offset_w_ptr =\n          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +\n           w_out);\n      const int data_mask_hw_ptr =\n          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);\n      const T offset_h = data_offset_ptr[data_offset_h_ptr];\n      const T offset_w = data_offset_ptr[data_offset_w_ptr];\n      const T mask = data_mask_ptr[data_mask_hw_ptr];\n      T inv_h = h_in + i * dilation_h + offset_h;\n      T inv_w = w_in + j * dilation_w + offset_w;\n      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)\n        inv_h = inv_w = -2;\n      else\n        mval += data_col_ptr[col_pos] *\n                dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,\n                                     height, width, inv_h, inv_w);\n      const T weight = dmcn_get_coordinate_weight(\n          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,\n          width, bp_dir);\n      val += weight * data_col_ptr[col_pos] * mask;\n      cnt += 1;\n    }\n    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);\n    grad_offset[index] = val;\n    if (offset_c % 2 == 0)\n      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +\n      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *\n      // height_col + h) * width_col + w], mask_req, mval);\n      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *\n                      kernel_w +\n                  offset_c / 2) *\n                     height_col +\n                 h) *\n                    width_col +\n                w] = mval;\n  }\n}\n\n#endif  // MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/ms_deform_attn_musa_kernel.muh",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from\n*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n#ifndef DEFORM_ATTN_MUSA_KERNEL\n#define DEFORM_ATTN_MUSA_KERNEL\n\n#include \"common_musa_helper.hpp\"\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename scalar_t>\n__device__ scalar_t ms_deform_attn_im2col_bilinear(\n    const scalar_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const scalar_t &h,\n    const scalar_t &w, const int &m, const int &c) {\n  const int h_low = floorf(h);\n  const int w_low = floorf(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int 
ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n  }\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename scalar_t>\n__device__ void ms_deform_attn_col2im_bilinear(\n    const scalar_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const scalar_t &h,\n    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,\n    const scalar_t &attn_weight, scalar_t *&grad_value,\n    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {\n  const int h_low = floorf(h);\n  const int w_low = floorf(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;\n  const int base_ptr = m * channels + c;\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const scalar_t top_grad_value = top_grad * attn_weight;\n  scalar_t grad_h_weight = 0, grad_w_weight = 0;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value + ptr1, w1 * top_grad_value);\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = 
h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value + ptr2, w2 * top_grad_value);\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value + ptr3, w3 * top_grad_value);\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value + ptr4, w4 * top_grad_value);\n  }\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  *grad_attn_weight = top_grad * val;\n  *grad_sampling_loc = width * grad_w_weight * top_grad_value;\n  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;\n}\n\ntemplate <typename scalar_t>\n__device__ void ms_deform_attn_col2im_bilinear_gm(\n    const scalar_t *&bottom_data, const int &height, const int &width,\n    const int &nheads, const int &channels, const scalar_t &h,\n    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,\n    const scalar_t &attn_weight, scalar_t *&grad_value,\n    scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) {\n  const int h_low = floorf(h);\n  const int w_low = floorf(w);\n  const int h_high = h_low + 1;\n  const int w_high = w_low + 1;\n\n  const scalar_t lh = h - h_low;\n  const scalar_t lw = w - w_low;\n  const scalar_t hh = 1 - lh, hw = 1 - lw;\n\n  const int w_stride = nheads * channels;\n  const int h_stride = width * w_stride;\n  const int h_low_ptr_offset = h_low * h_stride;\n  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;\n  const int w_low_ptr_offset = w_low * w_stride;\n  const int w_high_ptr_offset = w_low_ptr_offset + 
w_stride;\n  const int base_ptr = m * channels + c;\n\n  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n  const scalar_t top_grad_value = top_grad * attn_weight;\n  scalar_t grad_h_weight = 0, grad_w_weight = 0;\n\n  scalar_t v1 = 0;\n  if (h_low >= 0 && w_low >= 0) {\n    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;\n    v1 = bottom_data[ptr1];\n    grad_h_weight -= hw * v1;\n    grad_w_weight -= hh * v1;\n    atomicAdd(grad_value + ptr1, w1 * top_grad_value);\n  }\n  scalar_t v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1) {\n    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;\n    v2 = bottom_data[ptr2];\n    grad_h_weight -= lw * v2;\n    grad_w_weight += hh * v2;\n    atomicAdd(grad_value + ptr2, w2 * top_grad_value);\n  }\n  scalar_t v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0) {\n    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;\n    v3 = bottom_data[ptr3];\n    grad_h_weight += hw * v3;\n    grad_w_weight -= lh * v3;\n    atomicAdd(grad_value + ptr3, w3 * top_grad_value);\n  }\n  scalar_t v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1) {\n    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;\n    v4 = bottom_data[ptr4];\n    grad_h_weight += lw * v4;\n    grad_w_weight += lh * v4;\n    atomicAdd(grad_value + ptr4, w4 * top_grad_value);\n  }\n\n  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  atomicAdd(grad_attn_weight, top_grad * val);\n  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);\n  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_im2col_gpu_kernel(\n    const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes,\n    const int64_t *data_level_start_index, const scalar_t *data_sampling_loc,\n    const scalar_t *data_attn_weight, const int batch_size,\n    const int spatial_size, const 
int num_heads, const int channels,\n    const int num_levels, const int num_query, const int num_point,\n    scalar_t *data_col) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    scalar_t *data_col_ptr = data_col + index;\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n    scalar_t col = 0;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const scalar_t *data_value_ptr =\n          data_value +\n          (data_value_ptr_init_offset + level_start_id * qid_stride);\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h,\n                                                spatial_w, num_heads, channels,\n                                                h_im, w_im, m_col, c_col) *\n                 weight;\n        }\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n      }\n    }\n    *data_col_ptr = 
col;\n  }\n}\n\ntemplate <typename scalar_t, unsigned int blockSize>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];\n  __shared__ scalar_t cache_grad_attn_weight[blockSize];\n  unsigned int tid = threadIdx.x;\n  const int qid_stride = num_heads * channels;\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n 
         data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n        if (tid == 0) {\n          scalar_t _grad_w = cache_grad_sampling_loc[0],\n                   _grad_h = cache_grad_sampling_loc[1],\n                   _grad_a = cache_grad_attn_weight[0];\n          int sid = 2;\n          for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[_tid];\n            sid += 2;\n          }\n\n          *grad_sampling_loc_out = _grad_w;\n          *(grad_sampling_loc_out + 1) = _grad_h;\n          *grad_attn_weight_out = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        
grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, unsigned int blockSize>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];\n  __shared__ scalar_t cache_grad_attn_weight[blockSize];\n  unsigned int tid = threadIdx.x;\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = 
data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          *grad_sampling_loc_out = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight_out = cache_grad_attn_weight[0];\n   
     }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  extern __shared__ int _s[];\n  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);\n  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n  unsigned int tid = threadIdx.x;\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = 
data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n        if (tid == 0) {\n          scalar_t _grad_w = cache_grad_sampling_loc[0],\n                   _grad_h = cache_grad_sampling_loc[1],\n                   _grad_a = cache_grad_attn_weight[0];\n          int sid = 2;\n          for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {\n            _grad_w += cache_grad_sampling_loc[sid];\n            _grad_h += cache_grad_sampling_loc[sid + 1];\n            _grad_a += cache_grad_attn_weight[_tid];\n            sid += 2;\n          }\n\n          *grad_sampling_loc_out = _grad_w;\n          
*(grad_sampling_loc_out + 1) = _grad_h;\n          *grad_attn_weight_out = _grad_a;\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  extern __shared__ int _s[];\n  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);\n  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n  unsigned int tid = threadIdx.x;\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int 
l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;\n             s >>= 1, spre >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                
cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre) {\n              cache_grad_attn_weight[tid] +=\n                  cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] +=\n                  cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] +=\n                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          *grad_sampling_loc_out = cache_grad_sampling_loc[0];\n          *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];\n          *grad_attn_weight_out = cache_grad_attn_weight[0];\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  extern __shared__ int _s[];\n  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);\n  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;\n  unsigned int tid = threadIdx.x;\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const 
int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;\n        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;\n        *(cache_grad_attn_weight + threadIdx.x) = 0;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {\n          ms_deform_attn_col2im_bilinear(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n         
     cache_grad_sampling_loc + (threadIdx.x << 1),\n              cache_grad_attn_weight + threadIdx.x);\n        }\n\n        __syncthreads();\n\n        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;\n             s >>= 1, spre >>= 1) {\n          if (tid < s) {\n            const unsigned int xid1 = tid << 1;\n            const unsigned int xid2 = (tid + s) << 1;\n            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];\n            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];\n            cache_grad_sampling_loc[xid1 + 1] +=\n                cache_grad_sampling_loc[xid2 + 1];\n            if (tid + (s << 1) < spre) {\n              cache_grad_attn_weight[tid] +=\n                  cache_grad_attn_weight[tid + (s << 1)];\n              cache_grad_sampling_loc[xid1] +=\n                  cache_grad_sampling_loc[xid2 + (s << 1)];\n              cache_grad_sampling_loc[xid1 + 1] +=\n                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];\n            }\n          }\n          __syncthreads();\n        }\n\n        if (tid == 0) {\n          atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);\n          atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);\n          atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);\n        }\n        __syncthreads();\n\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void ms_deformable_col2im_gpu_kernel_gm(\n    const int n, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int 
num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  MUSA_1D_KERNEL_LOOP(index, n) {\n    int _temp = index;\n    const int c_col = _temp % channels;\n    _temp /= channels;\n    const int sampling_index = _temp;\n    const int m_col = _temp % num_heads;\n    _temp /= num_heads;\n    _temp /= num_query;\n    const int b_col = _temp;\n\n    const scalar_t top_grad = grad_col[index];\n\n    int data_weight_ptr = sampling_index * num_levels * num_point;\n    int data_loc_w_ptr = data_weight_ptr << 1;\n    const int grad_sampling_ptr = data_weight_ptr;\n    scalar_t *grad_sampling_loc_out =\n        grad_sampling_loc + (grad_sampling_ptr << 1);\n    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;\n    const int grad_weight_stride = 1;\n    const int grad_loc_stride = 2;\n    const int qid_stride = num_heads * channels;\n    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;\n\n    for (int l_col = 0; l_col < num_levels; ++l_col) {\n      const int level_start_id = data_level_start_index[l_col];\n      const int spatial_h_ptr = l_col << 1;\n      const int spatial_h = data_spatial_shapes[spatial_h_ptr];\n      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];\n      const int value_ptr_offset =\n          data_value_ptr_init_offset + level_start_id * qid_stride;\n      const scalar_t *data_value_ptr = data_value + value_ptr_offset;\n      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;\n\n      for (int p_col = 0; p_col < num_point; ++p_col) {\n        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];\n        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];\n        const scalar_t weight = data_attn_weight[data_weight_ptr];\n\n        const scalar_t h_im = loc_h * spatial_h - 0.5;\n        const scalar_t w_im = loc_w * spatial_w - 0.5;\n        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < 
spatial_w) {\n          ms_deform_attn_col2im_bilinear_gm(\n              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,\n              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,\n              grad_sampling_loc_out, grad_attn_weight_out);\n        }\n        data_weight_ptr += 1;\n        data_loc_w_ptr += 2;\n        grad_attn_weight_out += grad_weight_stride;\n        grad_sampling_loc_out += grad_loc_stride;\n      }\n    }\n  }\n}\n#endif  // DEFORM_ATTN_MUSA_KERNEL\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/nms_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef NMS_MUSA_KERNEL_MUH\n#define NMS_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\n\nint const threadsPerBlock = sizeof(unsigned long long int) * 8;\n\n__device__ inline bool devIoU(float const *const a, float const *const b,\n                              const int offset, const float threshold) {\n  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);\n  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);\n  float width = fmaxf(right - left + offset, 0.f),\n        height = fmaxf(bottom - top + offset, 0.f);\n  float interS = width * height;\n  float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);\n  float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);\n  return interS > threshold * (Sa + Sb - interS);\n}\n\n__global__ static void nms_musa(const int n_boxes, const float iou_threshold,\n                                const int offset, const float *dev_boxes,\n                                unsigned long long *dev_mask) {\n  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;\n  MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {\n    const int tid = threadIdx.x;\n\n    if (row_start > col_start) return;\n\n    const int row_size =\n        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    __shared__ float block_boxes[threadsPerBlock * 4];\n    if (tid < col_size) {\n      block_boxes[tid * 4 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];\n      block_boxes[tid * 4 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];\n      block_boxes[tid * 4 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];\n      block_boxes[tid * 4 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];\n    }\n    __syncthreads();\n\n   
 if (tid < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + tid;\n      const float *cur_box = dev_boxes + cur_box_idx * 4;\n      int i = 0;\n      unsigned long long int t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = tid + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {\n          t |= 1ULL << i;\n        }\n      }\n      dev_mask[cur_box_idx * gridDim.y + col_start] = t;\n    }\n  }\n}\n\n__global__ static void gather_keep_from_mask(bool *keep,\n                                             const unsigned long long *dev_mask,\n                                             const int n_boxes) {\n  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;\n  const int tid = threadIdx.x;\n\n  // mark the bboxes which have been removed.\n  extern __shared__ unsigned long long removed[];\n\n  // initialize removed.\n  for (int i = tid; i < col_blocks; i += blockDim.x) {\n    removed[i] = 0;\n  }\n  __syncthreads();\n\n  for (int nblock = 0; nblock < col_blocks; ++nblock) {\n    auto removed_val = removed[nblock];\n    __syncthreads();\n    const int i_offset = nblock * threadsPerBlock;\n#pragma unroll\n    for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {\n      const int i = i_offset + inblock;\n      if (i >= n_boxes) break;\n      // select a candidate, check if it should be kept.\n      if (!(removed_val & (1ULL << inblock))) {\n        if (tid == 0) {\n          // mark the output.\n          keep[i] = true;\n        }\n        auto p = dev_mask + i * col_blocks;\n        // remove all bboxes which overlap the candidate.\n        for (int j = tid; j < col_blocks; j += blockDim.x) {\n          if (j >= nblock) removed[j] |= p[j];\n        }\n        __syncthreads();\n        removed_val = removed[nblock];\n      }\n    }\n  }\n}\n\n#endif  // NMS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/nms_quadri_musa.muh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#ifndef NMS_QUADRI_MUSA_MUH\n#define NMS_QUADRI_MUSA_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"box_iou_rotated_utils.hpp\"\n\n__host__ __device__ inline int divideUP(const int x, const int y) {\n  return (((x) + (y)-1) / (y));\n}\n\nnamespace {\nint const threadsPerBlock = sizeof(unsigned long long) * 8;\n}\n\ntemplate <typename T>\n__global__ void nms_quadri_musa_kernel(const int n_boxes,\n                                       const float iou_threshold,\n                                       const T* dev_boxes,\n                                       unsigned long long* dev_mask,\n                                       const int multi_label) {\n  if (multi_label == 1) {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 8 values\n    // (x1, y1, ..., x4, y4) here.\n    __shared__ T block_boxes[threadsPerBlock * 8];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 8 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];\n      block_boxes[threadIdx.x * 8 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];\n      block_boxes[threadIdx.x * 8 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];\n      block_boxes[threadIdx.x * 8 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];\n      block_boxes[threadIdx.x * 8 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];\n      block_boxes[threadIdx.x * 8 
+ 5] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];\n      block_boxes[threadIdx.x * 8 + 6] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];\n      block_boxes[threadIdx.x * 8 + 7] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 9;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_quadri function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_quadri<T>(cur_box, block_boxes + i * 8, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  } else {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 8 values\n    // (x1, y1, , ..., x4, y4) here.\n    __shared__ T block_boxes[threadsPerBlock * 8];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 8 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0];\n      block_boxes[threadIdx.x * 8 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1];\n   
   block_boxes[threadIdx.x * 8 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2];\n      block_boxes[threadIdx.x * 8 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3];\n      block_boxes[threadIdx.x * 8 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4];\n      block_boxes[threadIdx.x * 8 + 5] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5];\n      block_boxes[threadIdx.x * 8 + 6] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6];\n      block_boxes[threadIdx.x * 8 + 7] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 8;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_quadri function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_quadri<T>(cur_box, block_boxes + i * 8, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/nms_rotated_musa.muh",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu\n#ifndef NMS_ROTATED_MUSA_MUH\n#define NMS_ROTATED_MUSA_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"box_iou_rotated_utils.hpp\"\n\n__host__ __device__ inline int divideUP(const int x, const int y) {\n  return (((x) + (y)-1) / (y));\n}\n\nnamespace {\nint const threadsPerBlock = sizeof(unsigned long long) * 8;\n}\n\ntemplate <typename T>\n__global__ void nms_rotated_musa_kernel(const int n_boxes,\n                                        const float iou_threshold,\n                                        const T* dev_boxes,\n                                        unsigned long long* dev_mask,\n                                        const int multi_label) {\n  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel\n\n  if (multi_label == 1) {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 5 values\n    // (x_center, y_center, width, height, angle_degrees) here.\n    __shared__ T block_boxes[threadsPerBlock * 5];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 5 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];\n      block_boxes[threadIdx.x * 5 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];\n      block_boxes[threadIdx.x * 5 + 2] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];\n      
block_boxes[threadIdx.x * 5 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];\n      block_boxes[threadIdx.x * 5 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 6;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_rotated function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  } else {\n    const int row_start = blockIdx.y;\n    const int col_start = blockIdx.x;\n\n    // if (row_start > col_start) return;\n\n    const int row_size =\n        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);\n    const int col_size =\n        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);\n\n    // Compared to nms_cuda_kernel, where each box is represented with 4 values\n    // (x1, y1, x2, y2), each rotated box is represented with 5 values\n    // (x_center, y_center, width, height, angle_degrees) here.\n    __shared__ T block_boxes[threadsPerBlock * 5];\n    if (threadIdx.x < col_size) {\n      block_boxes[threadIdx.x * 5 + 0] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];\n      block_boxes[threadIdx.x * 5 + 1] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];\n      block_boxes[threadIdx.x * 5 + 2] =\n          
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];\n      block_boxes[threadIdx.x * 5 + 3] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];\n      block_boxes[threadIdx.x * 5 + 4] =\n          dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];\n    }\n    __syncthreads();\n\n    if (threadIdx.x < row_size) {\n      const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;\n      const T* cur_box = dev_boxes + cur_box_idx * 5;\n      int i = 0;\n      unsigned long long t = 0;\n      int start = 0;\n      if (row_start == col_start) {\n        start = threadIdx.x + 1;\n      }\n      for (i = start; i < col_size; i++) {\n        // Instead of devIoU used by original horizontal nms, here\n        // we use the single_box_iou_rotated function from\n        // box_iou_rotated_utils.h\n        if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >\n            iou_threshold) {\n          t |= 1ULL << i;\n        }\n      }\n      const int col_blocks = divideUP(n_boxes, threadsPerBlock);\n      dev_mask[cur_box_idx * col_blocks + col_start] = t;\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/points_in_boxes_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef POINT_IN_BOXES_MUSA_KERNEL_MUH\n#define POINT_IN_BOXES_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,\n                                             T &local_x, T &local_y) {\n  T cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ntemplate <typename T>\n__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,\n                                        T &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,\n  // cz in the bottom center\n  T x = pt[0], y = pt[1], z = pt[2];\n  T cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];\n  cz += z_size /\n        2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > z_size / 2.0) return 0;\n  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);\n  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &\n                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);\n  return in_flag;\n}\n\ntemplate <typename T>\n__global__ void points_in_boxes_part_forward_musa_kernel(\n    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,\n    int *box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:\n  // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points:\n  // (B, npoints), default -1\n\n  int bs_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (bs_idx >= batch_size) return;\n\n    boxes += bs_idx * boxes_num * 7;\n    pts += bs_idx * pts_num * 3 + pt_idx * 3;\n    
box_idx_of_points += bs_idx * pts_num + pt_idx;\n\n    T local_x = 0, local_y = 0;\n    int cur_in_flag = 0;\n    for (int k = 0; k < boxes_num; k++) {\n      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);\n      if (cur_in_flag) {\n        box_idx_of_points[0] = k;\n        break;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void points_in_boxes_all_forward_musa_kernel(\n    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,\n    int *box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center\n  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate\n  // params box_idx_of_points: (B, npoints, N); entry [b, p, k] is set to 1\n  // when point p lies inside box k; other entries are left untouched\n\n  int bs_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (bs_idx >= batch_size) return;\n\n    boxes += bs_idx * boxes_num * 7;\n    pts += bs_idx * pts_num * 3 + pt_idx * 3;\n    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;\n\n    T local_x = 0, local_y = 0;\n    for (int k = 0; k < boxes_num; k++) {\n      const int cur_in_flag =\n          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);\n      if (cur_in_flag) {\n        box_idx_of_points[k] = 1;\n      }\n    }\n  }\n}\n\n#endif  // POINT_IN_BOXES_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/points_in_polygons_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef POINTS_IN_POLYGONS_MUSA_KERNEL_MUH\n#define POINTS_IN_POLYGONS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\nstruct point {\n  float x, y;\n};\n\ntemplate <typename scalar_t>\n__global__ void points_in_polygons_forward_musa_kernel(\n    const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,\n    const int rows, const int cols, scalar_t *inside_flag) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int row = index / cols;\n    int col = index % cols;\n\n    const scalar_t *offset_vertex1 = vertex1 + row * 2;\n    const scalar_t *offset_vertex2 = vertex2 + col * 8;\n\n    point point_[1];\n    point polygon[4];\n\n    point_[0].x = offset_vertex1[0];\n    point_[0].y = offset_vertex1[1];\n\n    polygon[0].x = offset_vertex2[0];\n    polygon[0].y = offset_vertex2[1];\n    polygon[1].x = offset_vertex2[2];\n    polygon[1].y = offset_vertex2[3];\n    polygon[2].x = offset_vertex2[4];\n    polygon[2].y = offset_vertex2[5];\n    polygon[3].x = offset_vertex2[6];\n    polygon[3].y = offset_vertex2[7];\n\n    int nCross = 0;\n    int i, j;\n    float sx, sy, tx, ty, px, py, x;\n    for (i = 0, j = 3; i < 4; j = i, i++) {\n      sx = polygon[i].x;\n      sy = polygon[i].y;\n      tx = polygon[j].x;\n      ty = polygon[j].y;\n\n      px = point_[0].x;\n      py = point_[0].y;\n\n      if (py < min(sy, ty)) continue;\n      if (py > max(sy, ty)) continue;\n\n      if ((sx == px && sy == py) || (tx == px && ty == py)) {\n        break;\n      } else {\n        if ((sy < py && ty >= py) || (sy >= py && ty < py)) {\n          x = sx + (py - sy) * (tx - sx) / (ty - sy);\n          if (x == px) {\n            break;\n          }\n          if (x > px) {\n            nCross++;\n          }\n        }\n      }\n    }\n    if (nCross % 2 == 1) {\n      inside_flag[index] = 1.0;\n    } else {\n      inside_flag[index] = 0.0;\n    }\n    return;\n  }\n}\n\n#endif  // 
POINTS_IN_POLYGONS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/prroi_pool_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu\n// Distributed under terms of the MIT license.\n#ifndef PRROI_POOL_MUSA_KERNEL_MUH\n#define PRROI_POOL_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,\n                                                        const int h,\n                                                        const int w,\n                                                        const int height,\n                                                        const int width) {\n  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);\n  T retVal = overflow ? 0.0f : data[h * width + w];\n  return retVal;\n}\n\ntemplate <typename T>\n__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {\n  return (1.0f - abs(dh)) * (1.0f - abs(dw));\n}\n\ntemplate <typename T>\n__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,\n                                                                   T c1, T c2) {\n  return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;\n}\n\ntemplate <typename T>\n__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,\n                                              const T w, const int height,\n                                              const int width) {\n  T retVal = 0.0f;\n  int h1 = floorf(h);\n  int w1 = floorf(w);\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  h1 = floorf(h) + 1;\n  w1 = floorf(w);\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  h1 = floorf(h);\n  w1 = floorf(w) + 1;\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - 
T(h1), w - T(w1));\n  h1 = floorf(h) + 1;\n  w1 = floorf(w) + 1;\n  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *\n            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));\n  return retVal;\n}\n\ntemplate <typename T>\n__device__ static T PrRoIPoolingMatCalculation(const T *this_data,\n                                               const int s_h, const int s_w,\n                                               const int e_h, const int e_w,\n                                               const T y0, const T x0,\n                                               const T y1, const T x1,\n                                               const int h0, const int w0) {\n  T alpha, beta, lim_alpha, lim_beta, tmp;\n  T sum_out = 0;\n\n  alpha = x0 - T(s_w);\n  beta = y0 - T(s_h);\n  lim_alpha = x1 - T(s_w);\n  lim_beta = y1 - T(s_h);\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;\n\n  alpha = x0 - T(s_w);\n  beta = T(e_h) - y1;\n  lim_alpha = x1 - T(s_w);\n  lim_beta = T(e_h) - y0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  sum_out += 
PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;\n\n  return sum_out;\n}\n\ntemplate <typename T>\n__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,\n                                                  const int h, const int w,\n                                                  const int height,\n                                                  const int width,\n                                                  const T coeff) {\n  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);\n  if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);\n}\n\ntemplate <typename T>\n__device__ static void PrRoIPoolingMatDistributeDiff(\n    T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,\n    const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,\n    const int w0) {\n  T alpha, beta, lim_alpha, lim_beta, tmp;\n\n  alpha = x0 - T(s_w);\n  beta = y0 - T(s_h);\n  lim_alpha = x1 - T(s_w);\n  lim_beta = y1 - T(s_h);\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);\n\n  alpha = x0 - T(s_w);\n  beta = T(e_h) - y1;\n  lim_alpha = x1 - T(s_w);\n  lim_beta = T(e_h) - y0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);\n\n  alpha = T(e_w) - x1;\n  lim_alpha = T(e_w) - x0;\n  tmp = (lim_alpha - 0.5f * lim_alpha * 
lim_alpha - alpha +\n         0.5f * alpha * alpha) *\n        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);\n  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);\n}\n\ntemplate <typename T>\n__global__ void prroi_pool_forward_musa_kernel(\n    const int nthreads, const T *input, const T *rois, T *output,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T *offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    T roi_x1 = offset_rois[1] * spatial_scale;\n    T roi_y1 = offset_rois[2] * spatial_scale;\n    T roi_x2 = offset_rois[3] * spatial_scale;\n    T roi_y2 = offset_rois[4] * spatial_scale;\n\n    T roi_width = max(roi_x2 - roi_x1, ((T)0.0));\n    T roi_height = max(roi_y2 - roi_y1, ((T)0.0));\n    T bin_size_h = roi_height / static_cast<T>(pooled_height);\n    T bin_size_w = roi_width / static_cast<T>(pooled_width);\n\n    const T *this_data =\n        input + (roi_batch_ind * channels + c) * height * width;\n    T *this_out = output + index;\n\n    T bin_x1 = roi_x1 + bin_size_w * pw;\n    T bin_y1 = roi_y1 + bin_size_h * ph;\n    T bin_x2 = bin_x1 + bin_size_w;\n    T bin_y2 = bin_y1 + bin_size_h;\n\n    T bin_size = max(T(0.0), bin_size_w * bin_size_h);\n    if (bin_size == 0) {\n      *this_out = 0;\n      continue;\n    }\n\n    T sum_out = 0;\n\n    int start_x, start_y, end_x, end_y;\n\n    start_x = floorf(bin_x1);\n    end_x = ceilf(bin_x2);\n    start_y = floorf(bin_y1);\n    end_y = ceilf(bin_y2);\n\n    for (int bin_x = start_x; bin_x < end_x; ++bin_x)\n      for (int bin_y = 
start_y; bin_y < end_y; ++bin_y)\n        sum_out += PrRoIPoolingMatCalculation(\n            this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,\n            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),\n            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,\n            width);\n    *this_out = sum_out / bin_size;\n  }\n}\n\ntemplate <typename T>\n__global__ void prroi_pool_backward_musa_kernel(\n    const int nthreads, const T *grad_output, const T *rois, T *grad_input,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    auto rois_cur = rois + n * 5;\n\n    int roi_batch_ind = rois_cur[0];\n    T roi_x1 = rois_cur[1] * spatial_scale;\n    T roi_y1 = rois_cur[2] * spatial_scale;\n    T roi_x2 = rois_cur[3] * spatial_scale;\n    T roi_y2 = rois_cur[4] * spatial_scale;\n\n    T roi_width = max(roi_x2 - roi_x1, (T)0);\n    T roi_height = max(roi_y2 - roi_y1, (T)0);\n    T bin_size_h = roi_height / static_cast<T>(pooled_height);\n    T bin_size_w = roi_width / static_cast<T>(pooled_width);\n\n    const T *this_out_grad = grad_output + index;\n    T *this_data_grad =\n        grad_input + (roi_batch_ind * channels + c) * height * width;\n\n    T bin_x1 = roi_x1 + bin_size_w * pw;\n    T bin_y1 = roi_y1 + bin_size_h * ph;\n    T bin_x2 = bin_x1 + bin_size_w;\n    T bin_y2 = bin_y1 + bin_size_h;\n\n    T bin_size = max(T(0.0), bin_size_w * bin_size_h);\n\n    T sum_out = bin_size == T(0) ? 
T(0) : *this_out_grad / bin_size;\n\n    int start_x, start_y, end_x, end_y;\n\n    start_x = floorf(bin_x1);\n    end_x = ceilf(bin_x2);\n    start_y = floorf(bin_y1);\n    end_y = ceilf(bin_y2);\n\n    for (int bin_x = start_x; bin_x < end_x; ++bin_x)\n      for (int bin_y = start_y; bin_y < end_y; ++bin_y)\n        PrRoIPoolingMatDistributeDiff(\n            this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,\n            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),\n            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,\n            width);\n  }\n}\n\ntemplate <typename T>\n__global__ void prroi_pool_coor_backward_musa_kernel(\n    const int nthreads, const T *output, const T *grad_output, const T *input,\n    const T *rois, T *grad_rois, const int pooled_height,\n    const int pooled_width, const T spatial_scale, const int channels,\n    const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n    auto rois_cur = rois + n * 5;\n\n    int roi_batch_ind = rois_cur[0];\n    T roi_x1 = rois_cur[1] * spatial_scale;\n    T roi_y1 = rois_cur[2] * spatial_scale;\n    T roi_x2 = rois_cur[3] * spatial_scale;\n    T roi_y2 = rois_cur[4] * spatial_scale;\n\n    T roi_width = max(roi_x2 - roi_x1, (T)0);\n    T roi_height = max(roi_y2 - roi_y1, (T)0);\n    T bin_size_h = roi_height / static_cast<T>(pooled_height);\n    T bin_size_w = roi_width / static_cast<T>(pooled_width);\n\n    const T output_grad_val = grad_output[index];\n    const T *this_input_data =\n        input + (roi_batch_ind * channels + c) * height * width;\n    const T output_val = output[index];\n    T *this_rois_grad = grad_rois + n * 5;\n\n    T bin_x1 = roi_x1 + bin_size_w * 
pw;\n    T bin_y1 = roi_y1 + bin_size_h * ph;\n    T bin_x2 = bin_x1 + bin_size_w;\n    T bin_y2 = bin_y1 + bin_size_h;\n\n    T bin_size = max(T(0.0), bin_size_w * bin_size_h);\n\n    T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;\n\n    // WARNING: to be discussed\n    if (sum_out == 0) continue;\n\n    int start_x, start_y, end_x, end_y;\n\n    start_x = floorf(bin_x1);\n    end_x = ceilf(bin_x2);\n    start_y = floorf(bin_y1);\n    end_y = ceilf(bin_y2);\n\n    T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;\n    for (int bin_y = start_y; bin_y < end_y; ++bin_y) {\n      grad_x1_y += PrRoIPoolingSingleCoorIntegral(\n          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,\n                                    height, width));\n\n      grad_x2_y += PrRoIPoolingSingleCoorIntegral(\n          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,\n                                    height, width));\n    }\n\n    for (int bin_x = start_x; bin_x < end_x; ++bin_x) {\n      grad_x_y1 += PrRoIPoolingSingleCoorIntegral(\n          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,\n          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),\n                                    height, width));\n\n      grad_x_y2 += PrRoIPoolingSingleCoorIntegral(\n          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,\n          
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),\n                                    height, width),\n          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),\n                                    height, width));\n    }\n\n    T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;\n    T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;\n    T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;\n    T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;\n\n    partial_x1 = partial_x1 / bin_size * spatial_scale;\n    partial_x2 = partial_x2 / bin_size * spatial_scale;\n    partial_y1 = partial_y1 / bin_size * spatial_scale;\n    partial_y2 = partial_y2 / bin_size * spatial_scale;\n\n    // (index, x1, y1, x2, y2)\n    this_rois_grad[0] = 0;\n    atomicAdd(this_rois_grad + 1,\n              (partial_x1 * (1.0f - T(pw) / pooled_width) +\n               partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *\n                  output_grad_val);\n    atomicAdd(this_rois_grad + 2,\n              (partial_y1 * (1.0f - T(ph) / pooled_height) +\n               partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *\n                  output_grad_val);\n    atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +\n                                   partial_x1 * T(pw) / pooled_width) *\n                                      output_grad_val);\n    atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +\n                                   partial_y1 * T(ph) / pooled_height) *\n                                      output_grad_val);\n  }\n}\n\n#endif  // PRROI_POOL_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/psamask_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef PSAMASK_MUSA_KERNEL_MUH\n#define PSAMASK_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\n// MUSA: grid stride looping\n#ifndef MUSA_KERNEL_LOOP\n#define MUSA_KERNEL_LOOP(i, n)                                 \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \\\n       i += blockDim.x * gridDim.x)\n#endif\n\ntemplate <typename T>\n__global__ void psamask_collect_forward_musa(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* mask_data, T* buffer_data) {\n  MUSA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        buffer_data[(n * h_feature * w_feature +\n                     (hidx + h - half_h_mask) * w_feature +\n                     (widx + w - half_w_mask)) *\n                        h_feature * w_feature +\n                    h * w_feature + w] = mask_data\n            [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *\n                 w_feature +\n             w];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void psamask_distribute_forward_musa(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int 
h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* mask_data, T* buffer_data) {\n  MUSA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        buffer_data[(n * h_feature * w_feature + h * w_feature + w) *\n                        h_feature * w_feature +\n                    (hidx + h - half_h_mask) * w_feature +\n                    (widx + w - half_w_mask)] = mask_data\n            [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *\n                 w_feature +\n             w];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void psamask_collect_backward_musa(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* buffer_diff, T* mask_diff) {\n  MUSA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + 
half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +\n                   h) *\n                      w_feature +\n                  w] = buffer_diff[(n * h_feature * w_feature +\n                                    (hidx + h - half_h_mask) * w_feature +\n                                    (widx + w - half_w_mask)) *\n                                       h_feature * w_feature +\n                                   h * w_feature + w];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void psamask_distribute_backward_musa(\n    const int nthreads, const int h_feature, const int w_feature,\n    const int h_mask, const int w_mask, const int half_h_mask,\n    const int half_w_mask, const T* buffer_diff, T* mask_diff) {\n  MUSA_KERNEL_LOOP(index, nthreads) {\n    const int w = index % w_feature;\n    const int h = (index / w_feature) % h_feature;\n    const int n = index / w_feature / h_feature;\n    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed\n    const int hstart = max(0, half_h_mask - h);\n    const int hend = min(h_mask, h_feature + half_h_mask - h);\n    const int wstart = max(0, half_w_mask - w);\n    const int wend = min(w_mask, w_feature + half_w_mask - w);\n    // (hidx,                    widx                   ) with mask-indexed\n    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed\n    for (int hidx = hstart; hidx < hend; hidx++) {\n      for (int widx = wstart; widx < wend; widx++) {\n        mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +\n                   h) *\n                      w_feature +\n                  w] =\n            buffer_diff[(n * h_feature * 
w_feature + h * w_feature + w) *\n                            h_feature * w_feature +\n                        (hidx + h - half_h_mask) * w_feature +\n                        (widx + w - half_w_mask)];\n      }\n    }\n  }\n}\n\n#endif  // PSAMASK_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/riroi_align_rotated_musa_kernel.muh",
    "content": "// Modified from\n// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu\n#ifndef RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH\n#define RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\n/*** Forward ***/\ntemplate <typename scalar_t>\n__global__ void riroi_align_rotated_forward_musa_kernel(\n    const int nthreads, const scalar_t *bottom_data,\n    const scalar_t *bottom_rois, const scalar_t spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int num_orientations, scalar_t *top_data) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int o = (index / pooled_width / pooled_height) % num_orientations;\n    int c =\n        (index / pooled_width / pooled_height / num_orientations) % channels;\n    int n = index / pooled_width / pooled_height / num_orientations / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    // Force malformed ROIs to be 1x1\n    roi_width = max(roi_width, (scalar_t)1.);\n    roi_height = max(roi_height, (scalar_t)1.);\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t 
bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    // find aligned index\n    scalar_t ind_float = theta * num_orientations / (2 * M_PI);\n    int ind = floorf(ind_float);\n    scalar_t l_var = ind_float - (scalar_t)ind;\n    scalar_t r_var = 1.0 - l_var;\n    // correct start channel\n    ind = (ind + num_orientations) % num_orientations;\n    // rotated channel\n    int ind_rot = (o - ind + num_orientations) % num_orientations;\n    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;\n    const scalar_t *offset_bottom_data =\n        bottom_data + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot) *\n                          height * width;\n\n    const scalar_t *offset_bottom_data_plus =\n        bottom_data + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot_plus) *\n                          height * width;\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (num_samples > 0)\n                             ? num_samples\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosscalar_theta = cos(theta);\n    scalar_t sinscalar_theta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    scalar_t output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta (counterclockwise) around the center and translate\n        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;\n        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;\n\n        scalar_t val = bilinear_interpolate<scalar_t>(\n            offset_bottom_data, height, width, y, x, index);\n        scalar_t val_plus = bilinear_interpolate<scalar_t>(\n            offset_bottom_data_plus, height, width, y, x, index);\n        output_val += r_var * val + l_var * val_plus;\n      }\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\n/*** Backward ***/\ntemplate <typename scalar_t>\n__global__ void riroi_align_rotated_backward_musa_kernel(\n    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,\n    const scalar_t spatial_scale, const int num_samples, const bool clockwise,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    scalar_t *bottom_diff) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int o = (index / pooled_width / pooled_height) % num_orientations;\n    int c =\n        (index / pooled_width / pooled_height / num_orientations) % 
channels;\n    int n = index / pooled_width / pooled_height / num_orientations / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not round\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    // Force malformed ROIs to be 1x1\n    roi_width = max(roi_width, (scalar_t)1.);\n    roi_height = max(roi_height, (scalar_t)1.);\n\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    // find aligned index\n    scalar_t ind_float = theta * num_orientations / (2 * M_PI);\n    int ind = floorf(ind_float);\n    scalar_t l_var = ind_float - (scalar_t)ind;\n    scalar_t r_var = 1.0 - l_var;\n    // correct start channel\n    ind = (ind + num_orientations) % num_orientations;\n    // rotated channel\n    int ind_rot = (o - ind + num_orientations) % num_orientations;\n    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;\n    scalar_t *offset_bottom_diff =\n        bottom_diff + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot) *\n                          height * width;\n    scalar_t *offset_bottom_diff_plus =\n        bottom_diff + (roi_batch_ind * channels * num_orientations +\n                       c * num_orientations + ind_rot_plus) *\n                          height * width;\n    int top_offset =\n        (n * channels * num_orientations + c * num_orientations + o) *\n        pooled_height * 
pooled_width;\n    const scalar_t *offset_top_diff = top_diff + top_offset;\n    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (num_samples > 0)\n                             ? num_samples\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosTheta = cos(theta);\n    scalar_t sinTheta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta around the center and translate\n        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;\n        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;\n\n        scalar_t w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,\n                                                w4, x_low, x_high, y_low,\n                                                y_high, index);\n\n        scalar_t g1 = top_diff_this_bin * w1 / count;\n        scalar_t g2 = top_diff_this_bin * w2 / count;\n        scalar_t g3 = top_diff_this_bin * w3 / count;\n        scalar_t g4 = top_diff_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);\n          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);\n          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);\n          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);\n\n          atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,\n                    g1 * l_var);\n          atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,\n                    g2 * l_var);\n          atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,\n                    g3 * l_var);\n          
atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,\n                    g4 * l_var);\n\n        }  // if\n      }    // ix\n    }      // iy\n  }        // MUSA_1D_KERNEL_LOOP\n}  // RiRoIAlignBackward\n\n#endif  // RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/roi_align_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_ALIGN_MUSA_KERNEL_MUH\n#define ROI_ALIGN_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\n\n/*** Forward ***/\ntemplate <typename T>\n__global__ void roi_align_forward_musa_kernel(\n    const int nthreads, const T* input, const T* rois, T* output, T* argmax_y,\n    T* argmax_x, const int pooled_height, const int pooled_width,\n    const T spatial_scale, const int sampling_ratio,\n    const int pool_mode,  // 0 - max pool, 1 - avg pool\n    const bool aligned, const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    T roi_start_w = offset_rois[1] * spatial_scale - offset;\n    T roi_start_h = offset_rois[2] * spatial_scale - offset;\n    T roi_end_w = offset_rois[3] * spatial_scale - offset;\n    T roi_end_h = offset_rois[4] * spatial_scale - offset;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n    if (!aligned) {  // for backward-compatibility only\n      roi_width = max(roi_width, (T)1.);\n      roi_height = max(roi_height, (T)1.);\n    }\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    const T* offset_input =\n        input + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h =\n        (sampling_ratio > 0)\n            ? sampling_ratio\n            : static_cast<int>(ceilf(roi_height / pooled_height));\n    int roi_bin_grid_w =\n        (sampling_ratio > 0)\n            ? 
sampling_ratio\n            : static_cast<int>(ceilf(roi_width / pooled_width));\n\n    if (pool_mode == 0) {\n      // We do max pooling inside a bin\n      T maxval = -FLT_MAX;\n      T maxidx_y = -1.f, maxidx_x = -1.f;\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n          T val =\n              bilinear_interpolate(offset_input, height, width, y, x, index);\n          if (val > maxval) {\n            maxval = val;\n            maxidx_y = y;\n            maxidx_x = x;\n          }\n        }\n      }\n      output[index] = maxval;\n      argmax_y[index] = maxidx_y;\n      argmax_x[index] = maxidx_x;\n    } else if (pool_mode == 1) {\n      // We do average pooling inside a bin\n      const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);\n      T output_val = 0.;\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n          T val =\n              bilinear_interpolate(offset_input, height, width, y, x, index);\n          output_val += val;\n        }\n      }\n      output[index] = output_val / count;\n    }\n  }\n}\n\n/*** Backward ***/\ntemplate <typename T>\n__global__ void roi_align_backward_musa_kernel(\n    const int nthreads, const T* grad_output, const 
T* rois, const T* argmax_y,\n    const T* argmax_x, T* grad_input, const int pooled_height,\n    const int pooled_width, const T spatial_scale, const int sampling_ratio,\n    const int pool_mode,  // 0 - max pool, 1 - avg pool\n    const bool aligned, const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T grad_output_this_bin = grad_output[index];\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n    T* offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    if (pool_mode == 0) {\n      T y = argmax_y[index], x = argmax_x[index];\n      if (y != -1.f) {\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_grad_input + y_low * width + x_low,\n                    grad_output_this_bin * w1);\n          atomicAdd(offset_grad_input + y_low * width + x_high,\n                    grad_output_this_bin * w2);\n          atomicAdd(offset_grad_input + y_high * width + x_low,\n                    grad_output_this_bin * w3);\n          atomicAdd(offset_grad_input + y_high * width + x_high,\n                    grad_output_this_bin * w4);\n        }\n      }\n    } else if (pool_mode == 1) {\n      // Do not use rounding; this implementation detail is critical\n      T offset = aligned ? 
(T)0.5 : (T)0.0;\n      T roi_start_w = offset_rois[1] * spatial_scale - offset;\n      T roi_start_h = offset_rois[2] * spatial_scale - offset;\n      T roi_end_w = offset_rois[3] * spatial_scale - offset;\n      T roi_end_h = offset_rois[4] * spatial_scale - offset;\n\n      T roi_width = roi_end_w - roi_start_w;\n      T roi_height = roi_end_h - roi_start_h;\n      if (!aligned) {  // for backward-compatibility only\n        roi_width = max(roi_width, (T)1.);\n        roi_height = max(roi_height, (T)1.);\n      }\n\n      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n      // We use roi_bin_grid to sample the grid and mimic integral\n      int roi_bin_grid_h =\n          (sampling_ratio > 0)\n              ? sampling_ratio\n              : static_cast<int>(ceilf(roi_height / pooled_height));\n      int roi_bin_grid_w =\n          (sampling_ratio > 0)\n              ? sampling_ratio\n              : static_cast<int>(ceilf(roi_width / pooled_width));\n\n      // We do average (integral) pooling inside a bin\n      const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n\n          T w1, w2, w3, w4;\n          int x_low, x_high, y_low, y_high;\n          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                        x_low, x_high, y_low, y_high, index);\n\n          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n            atomicAdd(offset_grad_input + y_low * width + x_low,\n                      grad_output_this_bin * w1 / count);\n            atomicAdd(offset_grad_input + y_low * width + x_high,\n                      grad_output_this_bin * w2 / count);\n            atomicAdd(offset_grad_input + y_high * width + x_low,\n                      grad_output_this_bin * w3 / count);\n            atomicAdd(offset_grad_input + y_high * width + x_high,\n                      grad_output_this_bin * w4 / count);\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif  // ROI_ALIGN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/roi_align_rotated_musa_kernel.muh",
    "content": "// Modified from\n// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated\n// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#ifndef ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH\n#define ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH\n\n#include <float.h>\n#include \"pytorch_musa_helper.hpp\"\n\n/*** Forward ***/\ntemplate <typename scalar_t>\n__global__ void roi_align_rotated_forward_musa_kernel(\n    const int nthreads, const scalar_t *bottom_data,\n    const scalar_t *bottom_rois, const scalar_t spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, scalar_t *top_data) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    scalar_t offset = aligned ? 
(scalar_t)0.5 : (scalar_t)0.0;\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    if (!aligned) {  // for backward-compatibility only\n      // Force malformed ROIs to be 1x1\n      roi_width = max(roi_width, (scalar_t)1.);\n      roi_height = max(roi_height, (scalar_t)1.);\n    }\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    const scalar_t *offset_bottom_data =\n        bottom_data + (roi_batch_ind * channels + c) * height * width;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosscalar_theta = cos(theta);\n    scalar_t sinscalar_theta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    scalar_t output_val = 0.;\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta (counterclockwise) around the center and translate\n        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;\n        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;\n\n        scalar_t val = bilinear_interpolate<scalar_t>(\n            offset_bottom_data, height, width, y, x, index);\n        output_val += val;\n      }\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\n/*** Backward ***/\ntemplate <typename scalar_t>\n__global__ void roi_align_rotated_backward_musa_kernel(\n    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,\n    const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,\n    const bool clockwise, const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not round\n    scalar_t offset = aligned ? 
(scalar_t)0.5 : (scalar_t)0.0;\n    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset;\n    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset;\n    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;\n    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;\n    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;\n    scalar_t theta = offset_bottom_rois[5];\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    if (!aligned) {  // for backward-compatibility only\n      // Force malformed ROIs to be 1x1\n      roi_width = max(roi_width, (scalar_t)1.);\n      roi_height = max(roi_height, (scalar_t)1.);\n    }\n    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /\n                          static_cast<scalar_t>(pooled_height);\n    scalar_t bin_size_w =\n        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);\n\n    scalar_t *offset_bottom_diff =\n        bottom_diff + (roi_batch_ind * channels + c) * height * width;\n\n    int top_offset = (n * channels + c) * pooled_height * pooled_width;\n    const scalar_t *offset_top_diff = top_diff + top_offset;\n    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? 
sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    scalar_t roi_start_h = -roi_height / 2.0;\n    scalar_t roi_start_w = -roi_width / 2.0;\n    scalar_t cosTheta = cos(theta);\n    scalar_t sinTheta = sin(theta);\n\n    // We do average (integral) pooling inside a bin\n    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1\n      const scalar_t yy =\n          roi_start_h + ph * bin_size_h +\n          static_cast<scalar_t>(iy + .5f) * bin_size_h /\n              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const scalar_t xx = roi_start_w + pw * bin_size_w +\n                            static_cast<scalar_t>(ix + .5f) * bin_size_w /\n                                static_cast<scalar_t>(roi_bin_grid_w);\n\n        // Rotate by theta around the center and translate\n        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;\n        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;\n\n        scalar_t w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,\n                                                w4, x_low, x_high, y_low,\n                                                y_high, index);\n\n        scalar_t g1 = top_diff_this_bin * w1 / count;\n        scalar_t g2 = top_diff_this_bin * w2 / count;\n        scalar_t g3 = top_diff_this_bin * w3 / count;\n        scalar_t g4 = top_diff_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);\n          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);\n          atomicAdd(offset_bottom_diff 
+ y_high * width + x_low, g3);\n          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);\n        }  // if\n      }    // ix\n    }      // iy\n  }        // MUSA_1D_KERNEL_LOOP\n}  // RoIAlignBackward\n\n#endif  // ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/roi_pool_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_POOL_MUSA_KERNEL_MUH\n#define ROI_POOL_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void roi_pool_forward_musa_kernel(\n    const int nthreads, const T* input, const T* rois, T* output, int* argmax,\n    const int pooled_height, const int pooled_width, const T spatial_scale,\n    const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n    // calculate the roi region on feature maps\n    T roi_x1 = offset_rois[1] * spatial_scale;\n    T roi_y1 = offset_rois[2] * spatial_scale;\n    T roi_x2 = (offset_rois[3] + 1) * spatial_scale;\n    T roi_y2 = (offset_rois[4] + 1) * spatial_scale;\n\n    // skip malformed rois (non-positive width or height)\n    T roi_w = roi_x2 - roi_x1;\n    T roi_h = roi_y2 - roi_y1;\n    if (roi_w <= 0 || roi_h <= 0) continue;\n\n    T bin_size_w = roi_w / static_cast<T>(pooled_width);\n    T bin_size_h = roi_h / static_cast<T>(pooled_height);\n\n    // the corresponding bin region\n    int bin_x1 = floorf(static_cast<T>(pw) * bin_size_w + roi_x1);\n    int bin_y1 = floorf(static_cast<T>(ph) * bin_size_h + roi_y1);\n    int bin_x2 = ceilf(static_cast<T>(pw + 1) * bin_size_w + roi_x1);\n    int bin_y2 = ceilf(static_cast<T>(ph + 1) * bin_size_h + roi_y1);\n\n    // add roi offsets and clip to input boundaries\n    bin_x1 = min(max(bin_x1, 0), width);\n    bin_y1 = min(max(bin_y1, 0), height);\n    bin_x2 = min(max(bin_x2, 0), width);\n    bin_y2 = min(max(bin_y2, 0), height);\n    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);\n\n    
const T* offset_input =\n        input + (roi_batch_ind * channels + c) * height * width;\n    // Define an empty pooling region to be zero\n    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd\n    T max_val = is_empty ? 0 : -FLT_MAX;\n    int max_idx = -1;\n    for (int h = bin_y1; h < bin_y2; ++h) {\n      for (int w = bin_x1; w < bin_x2; ++w) {\n        int offset = h * width + w;\n        if (offset_input[offset] > max_val) {\n          max_val = offset_input[offset];\n          max_idx = offset;\n        }\n      }\n    }\n    output[index] = max_val;\n    if (argmax != NULL) argmax[index] = max_idx;\n  }\n}\n\ntemplate <typename T>\n__global__ void roi_pool_backward_musa_kernel(\n    const int nthreads, const T* grad_output, const T* rois, const int* argmax,\n    T* grad_input, const int pooled_height, const int pooled_width,\n    const int channels, const int height, const int width) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c) is an element in the pooled output\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    int roi_batch_ind = rois[n * 5];\n    T* grad_input_offset =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n    int argmax_index = argmax[index];\n\n    if (argmax_index != -1) {\n      atomicAdd(grad_input_offset + argmax_index, grad_output[index]);\n    }\n  }\n}\n\n#endif  // ROI_POOL_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/roiaware_pool3d_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROIAWARE_POOL3D_MUSA_KERNEL_MUH\n#define ROIAWARE_POOL3D_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,\n                                             T &local_x, T &local_y) {\n  T cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ntemplate <typename T>\n__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,\n                                        T &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,\n  // cz in the bottom center\n  T x = pt[0], y = pt[1], z = pt[2];\n  T cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];\n  cz += z_size /\n        2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > z_size / 2.0) return 0;\n  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);\n  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &\n                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);\n  return in_flag;\n}\n\ntemplate <typename T>\n__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,\n                                            int out_x, int out_y, int out_z,\n                                            const T *rois, const T *pts,\n                                            int *pts_mask) {\n  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,\n  // npoints): -1 means point does not in this box, otherwise: encode (x_idxs,\n  // y_idxs, z_idxs) by binary bit\n  int box_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (box_idx >= 
boxes_num) return;\n\n    pts += pt_idx * 3;\n    rois += box_idx * 7;\n    pts_mask += box_idx * pts_num + pt_idx;\n\n    T local_x = 0, local_y = 0;\n    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);\n\n    pts_mask[0] = -1;\n    if (cur_in_flag > 0) {\n      T local_z = pts[2] - rois[2];\n      T x_size = rois[3], y_size = rois[4], z_size = rois[5];\n\n      T x_res = x_size / out_x;\n      T y_res = y_size / out_y;\n      T z_res = z_size / out_z;\n\n      unsigned int x_idx = int((local_x + x_size / 2) / x_res);\n      unsigned int y_idx = int((local_y + y_size / 2) / y_res);\n      unsigned int z_idx = int(local_z / z_res);\n\n      x_idx = min(max(x_idx, 0), out_x - 1);\n      y_idx = min(max(y_idx, 0), out_y - 1);\n      z_idx = min(max(z_idx, 0), out_z - 1);\n\n      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;\n\n      pts_mask[0] = idx_encoding;\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,\n                                             int max_pts_each_voxel, int out_x,\n                                             int out_y, int out_z,\n                                             const int *pts_mask,\n                                             T *pts_idx_of_voxels) {\n  // params pts_mask: (N, npoints)  0 or 1\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  MUSA_1D_KERNEL_LOOP(box_idx, boxes_num) {\n    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;\n\n    for (int k = 0; k < pts_num; k++) {\n      if (pts_mask[box_idx * pts_num + k] != -1) {\n        unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];\n        unsigned int x_idx = (idx_encoding >> 16) & 0xFF;\n        unsigned int y_idx = (idx_encoding >> 8) & 0xFF;\n        unsigned int z_idx = idx_encoding & 0xFF;\n        unsigned int base_offset = 
x_idx * out_y * out_z * max_pts_each_voxel +\n                                   y_idx * out_z * max_pts_each_voxel +\n                                   z_idx * max_pts_each_voxel;\n        unsigned int cnt = pts_idx_of_voxels[base_offset];\n        if (cnt < max_num_pts) {\n          pts_idx_of_voxels[base_offset + cnt + 1] = k;\n          pts_idx_of_voxels[base_offset]++;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,\n                                   int max_pts_each_voxel, int out_x, int out_y,\n                                   int out_z, const T *pts_feature,\n                                   const int *pts_idx_of_voxels,\n                                   T *pooled_features, int *argmax) {\n  // params pts_feature: (npoints, C)\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),\n  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)\n  // params argmax: (N, out_x, out_y, out_z, C)\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +\n                         offset_base * max_pts_each_voxel;\n    pooled_features += box_idx * out_x * out_y * out_z * channels +\n                       offset_base * channels + channel_idx;\n    argmax += box_idx * out_x * out_y * out_z * channels +\n              offset_base * channels + channel_idx;\n\n    int argmax_idx = -1;\n    float max_val = -1e50;\n\n    int total_pts = pts_idx_of_voxels[0];\n\n    for (int k = 1; k <= 
total_pts; k++) {\n      if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >\n          max_val) {\n        max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];\n        argmax_idx = pts_idx_of_voxels[k];\n      }\n    }\n\n    if (argmax_idx != -1) {\n      pooled_features[0] = max_val;\n    }\n    argmax[0] = argmax_idx;\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,\n                                   int max_pts_each_voxel, int out_x, int out_y,\n                                   int out_z, const T *pts_feature,\n                                   const int *pts_idx_of_voxels,\n                                   T *pooled_features) {\n  // params pts_feature: (npoints, C)\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),\n  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)\n  // params argmax: (N, out_x, out_y, out_z, C)\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +\n                         offset_base * max_pts_each_voxel;\n    pooled_features += box_idx * out_x * out_y * out_z * channels +\n                       offset_base * channels + channel_idx;\n\n    float sum_val = 0;\n    int total_pts = pts_idx_of_voxels[0];\n\n    for (int k = 1; k <= total_pts; k++) {\n      sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];\n    }\n\n    if (total_pts > 0) {\n      pooled_features[0] = sum_val / total_pts;\n    }\n  }\n}\n\ntemplate 
<typename T>\n__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,\n                                            int out_x, int out_y, int out_z,\n                                            const int *argmax,\n                                            const T *grad_out, T *grad_in) {\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    argmax += box_idx * out_x * out_y * out_z * channels +\n              offset_base * channels + channel_idx;\n    grad_out += box_idx * out_x * out_y * out_z * channels +\n                offset_base * channels + channel_idx;\n\n    if (argmax[0] == -1) return;\n\n    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);\n  }\n}\n\ntemplate <typename T>\n__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,\n                                            int out_x, int out_y, int out_z,\n                                            int max_pts_each_voxel,\n                                            const int *pts_idx_of_voxels,\n                                            const T *grad_out, T *grad_in) {\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n\n  int box_idx = blockIdx.z;\n  int channel_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {\n    int x_idx = voxel_idx_flat / (out_y * out_z);\n    
int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;\n    int z_idx = voxel_idx_flat % out_z;\n    if (box_idx >= boxes_num || channel_idx >= channels) return;\n\n    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;\n    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +\n                         offset_base * max_pts_each_voxel;\n    grad_out += box_idx * out_x * out_y * out_z * channels +\n                offset_base * channels + channel_idx;\n\n    int total_pts = pts_idx_of_voxels[0];\n    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);\n    for (int k = 1; k <= total_pts; k++) {\n      atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,\n                grad_out[0] * cur_grad);\n    }\n  }\n}\n\n#endif  // ROIAWARE_POOL3D_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/roipoint_pool3d_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROIPOINT_POOL3D_MUSA_KERNEL_MUH\n#define ROIPOINT_POOL3D_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,\n                                             T &local_x, T &local_y) {\n  T cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ntemplate <typename T>\n__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,\n                                        T &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the\n  // bottom center\n  T x = pt[0], y = pt[1], z = pt[2];\n  T cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];\n  cz += dz / 2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > dz / 2.0) return 0;\n  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);\n  T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &\n              (local_y > -dy / 2.0) & (local_y < dy / 2.0);\n  return in_flag;\n}\n\ntemplate <typename T>\n__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,\n                                    const T *xyz, const T *boxes3d,\n                                    int *pts_assign) {\n  // params xyz: (B, N, 3)\n  // params boxes3d: (B, M, 7)\n  // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means\n  // background points\n  int box_idx = blockIdx.y;\n  int bs_idx = blockIdx.z;\n  MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) {\n    if (box_idx >= boxes_num || bs_idx >= batch_size) return;\n\n    int assign_idx =\n        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;\n    pts_assign[assign_idx] = 0;\n\n    int box_offset = bs_idx * boxes_num * 7 + 
box_idx * 7;\n    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;\n\n    T local_x = 0, local_y = 0;\n    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,\n                                        local_x, local_y);\n    pts_assign[assign_idx] = cur_in_flag;\n  }\n}\n\n__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,\n                               int sampled_pts_num, const int *pts_assign,\n                               int *pts_idx, int *pooled_empty_flag) {\n  // params xyz: (B, N, 3)\n  // params pts_feature: (B, N, C)\n  // params pts_assign: (B, N)\n  // params pts_idx: (B, M, 512)\n  // params pooled_empty_flag: (B, M)\n  MUSA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {\n    int bs_idx = blockIdx.y;\n\n    int cnt = 0;\n    for (int k = 0; k < pts_num; k++) {\n      if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +\n                     boxes_idx]) {\n        if (cnt < sampled_pts_num) {\n          pts_idx[bs_idx * boxes_num * sampled_pts_num +\n                  boxes_idx * sampled_pts_num + cnt] = k;\n          cnt++;\n        } else\n          break;\n      }\n    }\n\n    if (cnt == 0) {\n      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;\n    } else if (cnt < sampled_pts_num) {\n      // duplicate same points for sampling\n      for (int k = cnt; k < sampled_pts_num; k++) {\n        int duplicate_idx = k % cnt;\n        int base_offset =\n            bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;\n        pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void roipoint_pool3d_forward(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature,\n    T *pooled_features, int *pooled_empty_flag) {\n  // params xyz: (B, N, 3)\n  // params pts_idx: (B, M, 512)\n  // params pts_feature: (B, N, C)\n  // 
params pooled_features: (B, M, 512, 3+C)\n  // params pooled_empty_flag: (B, M)\n  int box_idx = blockIdx.y;\n  int bs_idx = blockIdx.z;\n  MUSA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {\n    if (box_idx >= boxes_num || bs_idx >= batch_size) return;\n    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;\n\n    int temp_idx = bs_idx * boxes_num * sampled_pts_num +\n                   box_idx * sampled_pts_num + sample_pt_idx;\n    int src_pt_idx = pts_idx[temp_idx];\n    int dst_feature_offset = temp_idx * (3 + feature_in_len);\n\n    for (int j = 0; j < 3; j++)\n      pooled_features[dst_feature_offset + j] =\n          xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];\n\n    int src_feature_offset =\n        bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;\n    memcpy(pooled_features + dst_feature_offset + 3,\n           pts_feature + src_feature_offset, feature_in_len * sizeof(T));\n  }\n}\n\n#endif  // ROIPOINT_POOL3D_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/rotated_feature_align_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu\n#ifndef ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH\n#define ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename scalar_t>\n__global__ void rotated_feature_align_forward_kernel(\n    const int nthreads, const int points, const scalar_t* bottom_data,\n    const scalar_t* best_bboxes, const scalar_t spatial_scale,\n    const int channels, const int height, const int width, scalar_t* top_data) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    const scalar_t* bbox_offset =\n        best_bboxes + ((n * height + h) * width + w) * 5;\n    scalar_t roi_y = bbox_offset[0] * spatial_scale;\n    scalar_t roi_x = bbox_offset[1] * spatial_scale;\n\n    scalar_t px[5] = {roi_x, 0, 0, 0, 0};\n    scalar_t py[5] = {roi_y, 0, 0, 0, 0};\n\n    if (points > 1) {\n      scalar_t roi_w = bbox_offset[2] * spatial_scale;\n      scalar_t roi_h = bbox_offset[3] * spatial_scale;\n      scalar_t roi_a = bbox_offset[4];\n\n      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;\n      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);\n      scalar_t wx = cosa * w_2, wy = sina * w_2;\n      scalar_t hx = -sina * h_2, hy = cosa * h_2;\n\n      px[1] = roi_x + wx + hx;\n      py[1] = roi_y + wy + hy;\n      px[2] = roi_x - wx + hx;\n      py[2] = roi_y - wy + hy;\n      px[3] = roi_x - wx - hx;\n      py[3] = roi_y - wy - hy;\n      px[4] = roi_x + wx - hx;\n      py[4] = roi_y + wy - hy;\n    }\n\n    const scalar_t* offset_bottom_data =\n        bottom_data + (n * channels + c) * height * width;\n\n    scalar_t output_val = bottom_data[index];\n    for (int i = 0; i < points; i++) {\n      output_val 
+= bilinear_interpolate<scalar_t>(offset_bottom_data, height,\n                                                   width, py[i], px[i], i);\n    }\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void rotated_feature_align_backward_kernel(\n    const int nthreads, const int points, const scalar_t* top_diff,\n    const scalar_t* best_bboxes, const scalar_t spatial_scale,\n    const int channels, const int height, const int width,\n    scalar_t* bottom_diff) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    const scalar_t* bbox_offset =\n        best_bboxes + ((n * height + h) * width + w) * 5;\n    scalar_t roi_y = bbox_offset[0] * spatial_scale;\n    scalar_t roi_x = bbox_offset[1] * spatial_scale;\n\n    scalar_t px[5] = {roi_x, 0, 0, 0, 0};\n    scalar_t py[5] = {roi_y, 0, 0, 0, 0};\n\n    if (points > 1) {\n      scalar_t roi_w = bbox_offset[2] * spatial_scale;\n      scalar_t roi_h = bbox_offset[3] * spatial_scale;\n      scalar_t roi_a = bbox_offset[4];\n\n      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;\n      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);\n      scalar_t wx = cosa * w_2, wy = sina * w_2;\n      scalar_t hx = -sina * h_2, hy = cosa * h_2;\n\n      px[1] = roi_x + wx + hx;\n      py[1] = roi_y + wy + hy;\n      px[2] = roi_x - wx + hx;\n      py[2] = roi_y - wy + hy;\n      px[3] = roi_x - wx - hx;\n      py[3] = roi_y - wy - hy;\n      px[4] = roi_x + wx - hx;\n      py[4] = roi_y + wy - hy;\n    }\n\n    scalar_t* offset_bottom_diff =\n        bottom_diff + (n * channels + c) * height * width;\n    scalar_t value_top_diff = top_diff[index];\n\n    atomicAdd(bottom_diff + index, value_top_diff);\n    for (int i = 0; i < points; i++) {\n      scalar_t w1, w2, w3, w4;\n      int x_low, x_high, y_low, y_high;\n\n      
bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,\n                                              w2, w3, w4, x_low, x_high, y_low,\n                                              y_high, i);\n      scalar_t g1 = value_top_diff * w1;\n      scalar_t g2 = value_top_diff * w2;\n      scalar_t g3 = value_top_diff * w3;\n      scalar_t g4 = value_top_diff * w4;\n      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n        atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);\n        atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);\n        atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);\n        atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);\n      }\n    }\n  }\n}\n#endif  // ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/scatter_points_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SCATTER_POINTS_MUSA_KERNEL_MUH\n#define SCATTER_POINTS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\nint const maxGridDim = 50000;\n\n__device__ __forceinline__ static void reduceMax(float *address, float val) {\n  int *address_as_i = reinterpret_cast<int *>(address);\n  int old = *address_as_i, assumed;\n  do {\n    assumed = old;\n    old = atomicCAS(address_as_i, assumed,\n                    __float_as_int(fmaxf(val, __int_as_float(assumed))));\n  } while (assumed != old || __int_as_float(old) < val);\n}\n\n__device__ __forceinline__ static void reduceMax(double *address, double val) {\n  unsigned long long *address_as_ull =\n      reinterpret_cast<unsigned long long *>(address);\n  unsigned long long old = *address_as_ull, assumed;\n  do {\n    assumed = old;\n    old = atomicCAS(\n        address_as_ull, assumed,\n        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));\n  } while (assumed != old || __longlong_as_double(old) < val);\n}\n\n__device__ __forceinline__ static void reduceAdd(float *address, float val) {\n  atomicAdd(address, val);\n}\n\n__device__ __forceinline__ static void reduceAdd(double *address, double val) {\n  atomicAdd(address, val);\n\n}\n\ntemplate <typename T>\n__global__ void feats_reduce_kernel(\n    const T *feats, const int32_t *coors_map,\n    T *reduced_feats,  // shall be 0 at initialization\n    const int num_input, const int num_feats, const reduce_t reduce_type) {\n  MUSA_1D_KERNEL_LOOP(x, num_input) {\n    int32_t reduce_to = coors_map[x];\n    if (reduce_to == -1) continue;\n\n    const T *feats_offset = feats + x * num_feats;\n    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;\n    if (reduce_type == reduce_t::MAX) {\n      for (int i = 0; i < num_feats; i++) {\n        reduceMax(&reduced_feats_offset[i], feats_offset[i]);\n      }\n    } else {\n     
 for (int i = 0; i < num_feats; i++) {\n        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void add_reduce_traceback_grad_kernel(\n    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,\n    const int32_t *reduce_count, const int num_input, const int num_feats,\n    const reduce_t reduce_type) {\n  MUSA_1D_KERNEL_LOOP(x, num_input) {\n    int32_t reduce_to = coors_map[x];\n    if (reduce_to == -1) {\n      continue;\n    }\n\n    const int input_offset = x * num_feats;\n    T *grad_feats_offset = grad_feats + input_offset;\n    const int reduced_offset = reduce_to * num_feats;\n    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;\n\n    if (reduce_type == reduce_t::SUM) {\n      for (int i = 0; i < num_feats; i++) {\n        grad_feats_offset[i] = grad_reduced_feats_offset[i];\n      }\n    } else if (reduce_type == reduce_t::MEAN) {\n      for (int i = 0; i < num_feats; i++) {\n        grad_feats_offset[i] = grad_reduced_feats_offset[i] /\n                               static_cast<T>(reduce_count[reduce_to]);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void max_reduce_traceback_scatter_idx_kernel(\n    const T *feats, const T *reduced_feats, int32_t *reduce_from,\n    const int32_t *coors_map, const int num_input, const int num_feats) {\n  MUSA_1D_KERNEL_LOOP(x, num_input) {\n    int32_t reduce_to = coors_map[x];\n\n    const int input_offset = x * num_feats;\n    const T *feats_offset = feats + input_offset;\n\n    if (reduce_to == -1) {\n      continue;\n    }\n\n    const int reduced_offset = reduce_to * num_feats;\n    const T *reduced_feats_offset = reduced_feats + reduced_offset;\n    int32_t *reduce_from_offset = reduce_from + reduced_offset;\n\n    for (int i = 0; i < num_feats; i++) {\n      if (feats_offset[i] == reduced_feats_offset[i]) {\n        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));\n      }\n    
}\n  }\n}\n\ntemplate <typename T>\n__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,\n                                               const T *grad_reduced_feats,\n                                               const int32_t *reduce_from,\n                                               const int num_reduced,\n                                               const int num_feats) {\n  MUSA_1D_KERNEL_LOOP(x, num_reduced) {\n    const int reduced_offset = x * num_feats;\n    const int32_t *scatter_to_offset = reduce_from + reduced_offset;\n    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;\n\n    for (int i = 0; i < num_feats; i++) {\n      grad_feats[scatter_to_offset[i] * num_feats + i] =\n          grad_reduced_feats_offset[i];\n    }\n  }\n}\n\n#endif  // SCATTER_POINTS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/sigmoid_focal_loss_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH\n#define SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void sigmoid_focal_loss_forward_musa_kernel(\n    const int nthreads, const T* input, const int64_t* target, const T* weight,\n    T* output, const T gamma, const T alpha, const int num_classes) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int n = index / num_classes;\n    int c = index % num_classes;\n\n    int64_t t = target[n];\n    T flag_p = (t == c);\n    T flag_n = (t != c);\n\n    // p = sigmoid(x) = 1. / 1. + expf(-x)\n    T p = (T)1. / ((T)1. + expf(-input[index]));\n\n    // (1 - p)**gamma * log(p)\n    T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN));\n    // p**gamma * log(1 - p)\n    T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN));\n\n    output[index] = (T)0.;\n    output[index] += -flag_p * alpha * term_p;\n    output[index] += -flag_n * ((T)1. - alpha) * term_n;\n    if (weight != NULL) {\n      output[index] *= weight[t];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void sigmoid_focal_loss_backward_musa_kernel(\n    const int nthreads, const T* input, const int64_t* target, const T* weight,\n    T* grad_input, const T gamma, const T alpha, const int num_classes) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int n = index / num_classes;\n    int c = index % num_classes;\n\n    int64_t t = target[n];\n    T flag_p = (t == c);\n    T flag_n = (t != c);\n\n    // p = sigmoid(x) = 1. / 1. + expf(-x)\n    T p = (T)1. / ((T)1. + exp(-input[index]));\n\n    // (1 - p)**gamma * (1 - p - gamma*p*log(p))\n    T term_p = pow(((T)1. - p), gamma) *\n               ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN))));\n    // p**gamma * (gamma * (1 - p) * log(1 - p) - p)\n    T term_n = pow(p, gamma) *\n               (gamma * ((T)1. - p) * log(max((T)1. 
- p, (T)FLT_MIN)) - p);\n\n    grad_input[index] = (T)0.;\n    grad_input[index] += -flag_p * alpha * term_p;\n    grad_input[index] += -flag_n * ((T)1. - alpha) * term_n;\n    if (weight != NULL) {\n      grad_input[index] *= weight[t];\n    }\n  }\n}\n\n#endif  // SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/softmax_focal_loss_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH\n#define SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void softmax_focal_loss_forward_musa_kernel(\n    const int nthreads, const T* softmax, const int64_t* target,\n    const T* weight, T* output, const T gamma, const T alpha,\n    const int num_classes) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int64_t label = target[index];\n    T pred = softmax[index * num_classes + label];\n\n    if (label >= 0) {\n      output[index] =\n          -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN));\n    } else {\n      output[index] = 0;\n    }\n    if (weight != NULL) {\n      output[index] *= weight[label];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void softmax_focal_loss_backward_musa1_kernel(\n    const int nthreads, const T* softmax, const int64_t* target,\n    const T* weight, T* buff, const T gamma, const T alpha,\n    const int num_classes) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int64_t label = target[index];\n    T pred = softmax[index * num_classes + label];\n\n    if (label >= 0) {\n      buff[index] = alpha * (-pow((T)1. - pred, gamma) +\n                             gamma * pow((T)1. - pred, gamma - 1) * pred *\n                                 log(max(pred, (T)FLT_MIN)));\n    } else {\n      buff[index] = 0;\n    }\n    if (weight != NULL) {\n      buff[index] *= weight[label];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void softmax_focal_loss_backward_musa2_kernel(\n    const int nthreads, const T* softmax, const int64_t* target, const T* buff,\n    T* grad_input, const int num_classes) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    int n = index / num_classes;\n    int c = index % num_classes;\n    int64_t label = target[n];\n\n    if (label >= 0) {\n      T flag = (label == c ? (T)1. 
: (T)0.);\n      grad_input[index] = buff[n] * (flag - softmax[index]);\n    } else {\n      grad_input[index] = 0;\n    }\n  }\n}\n\n#endif  // SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/spconv/indice.muh",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef INDICE_MU_H_\n#define INDICE_MU_H_\n#include <utils/spconv/spconv/geometry.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <utils/spconv/tensorview/helper_kernel.muh>\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim,\n          int KernelMaxVolume = 256>\n__global__ void prepareIndicePairsKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,\n    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> kernelSize,\n    const tv::SimpleVector<Index, NDim> stride,\n    const tv::SimpleVector<Index, NDim> padding,\n    const tv::SimpleVector<Index, NDim> dilation,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  Index validPoints[KernelMaxVolume * (NDim + 1)];\n  Index *pointPtr = nullptr;\n  auto indicePairsDim2 = indicePairs.dim(2);\n  Index index;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    numValidPoints = 
getValidOutPos<Index, NDim>(\n        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),\n        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),\n        validPoints);\n    for (Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));\n      indicePairs(offset, 0, oldNum) = ix;\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +\n              spatialVolume * indicesIn(ix, 0);\n      indicePairs(offset, 1, oldNum) = index;\n      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim,\n          int KernelMaxVolume = 256>\n__global__ void prepareDeConvIndicePairsKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,\n    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> kernelSize,\n    const tv::SimpleVector<Index, NDim> stride,\n    const tv::SimpleVector<Index, NDim> padding,\n    const tv::SimpleVector<Index, NDim> dilation,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  Index validPoints[KernelMaxVolume * (NDim + 1)];\n  Index *pointPtr = nullptr;\n  auto indicePairsDim2 = indicePairs.dim(2);\n  Index index;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    numValidPoints = getValidOutPosTranspose<Index, NDim>(\n        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),\n        
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),\n        validPoints);\n    for (Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));\n      indicePairs(offset, 0, oldNum) = ix;\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +\n              spatialVolume * indicesIn(ix, 0);\n      indicePairs(offset, 1, oldNum) = index;\n      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void assignGridAndIndiceOutKernel(\n    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,\n    int numAct, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {\n  Index index;\n  auto indicesOutPtr = indicesOut.data();\n  for (int ix : tv::KernelLoopX<int>(numAct)) {\n    index = indicePairUnique[ix];\n    gridsOut[index] = ix;\n    index = tv::rowArrayIdxInv<Index, NDim>(\n        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());\n    indicesOut[ix * (NDim + 1)] = index % batchSize;\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void assignIndicePairsKernel(\n    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,\n    int numActIn, tv::TensorView<Index> indicePairs,\n    tv::TensorView<Index> indicePairUnique,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  Index index;\n  int kernelVolume = indicePairs.dim(0);\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    for (int i = 0; i < kernelVolume; ++i) {\n      index = indicePairs(i, 1, ix);\n      if (index > -1) {\n        indicePairs(i, 1, ix) = gridsOut[index];\n      }\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, 
unsigned NDim>\n__global__ void prepareSubMGridKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index index = 0;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,\n                                         outSpatialShape.data()) +\n            spatialVolume * indicesIn(ix, 0);\n    gridsOut[index] = ix;\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim,\n          int KernelMaxVolume = 256>\n__global__ void getSubMIndicePairsKernel(\n    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,\n    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,\n    const tv::SimpleVector<Index, NDim> kernelSize,\n    const tv::SimpleVector<Index, NDim> stride,\n    const tv::SimpleVector<Index, NDim> padding,\n    const tv::SimpleVector<Index, NDim> dilation,\n    const tv::SimpleVector<Index, NDim> outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index numValidPoints = 0;\n  Index validPoints[KernelMaxVolume * (NDim + 1)];\n  Index *pointPtr = nullptr;\n  Index index = 0;\n  for (int ix : tv::KernelLoopX<int>(numActIn)) {\n    numValidPoints = getValidOutPos<Index, NDim>(\n        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),\n        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),\n        validPoints);\n    for (int i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, 
outSpatialShape.data()) +\n              spatialVolume * indicesIn(ix, 0);\n      if (gridsOut[index] > -1) {\n        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));\n        indicePairs(offset, 1, oldNum) = gridsOut[index];\n        indicePairs(offset, 0, oldNum) = ix;\n      }\n    }\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void resetGridKernel(const Index *indicePairUnique,\n                                tv::TensorView<IndexGrid> gridsOut,\n                                int numAct) {\n  for (int ix : tv::KernelLoopX<int>(numAct)) {\n    gridsOut[indicePairUnique[ix]] = -1;\n  }\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\n__global__ void resetGridSubMKernel(\n    const Index *indices, tv::TensorView<IndexGrid> gridsOut,\n    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {\n  int outSpatialShapeReg[NDim];\n  for (int i = 0; i < NDim; ++i) {\n    outSpatialShapeReg[i] = outSpatialShape[i];\n  }\n  Index spatialVolume = 1;\n  auto indsPtr = indices;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index index;\n  for (int ix : tv::KernelLoopX<int>(numAct)) {\n    indsPtr = indices + ix * (NDim + 1);\n    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);\n    gridsOut[index + spatialVolume * indsPtr[0]] = -1;\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/spconv/reordering.muh",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef REORDERING_MU_H_\n#define REORDERING_MU_H_\n#include <utils/spconv/tensorview/helper_kernel.muh>\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,\n                                    const Index *indices, int size,\n                                    int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index inds[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n\n  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < size)\n        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < size)\n          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =\n              features[inds[ilp] + iy];\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType>\n__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,\n                                const Index *indices, int size, int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index inds[NumILP];\n#pragma 
unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n\n  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < size)\n        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < size)\n          reinterpret_cast<VecType *>(\n              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =\n              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType = int4>\n__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features,\n                                     const Index *indices, int size,\n                                     int numPlanes) {\n  int ILPStrideY[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;\n  features += blockIdx.x * NumTLP;\n  buffer += blockIdx.x * NumTLP;\n\n  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      reinterpret_cast<VecType *>(\n          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =\n          reinterpret_cast<const VecType *>(\n              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +\n                        threadIdx.x];\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void scatterAddGenericKernel(scalar_t *outFeatures,\n                                        const scalar_t *buffer,\n                                        const Index *indices, int size,\n                                        int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index 
inds[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < size)\n        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < size) {\n          outFeatures[inds[ilp] + iy] +=\n              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType = int4>\n__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,\n                                         const scalar_t *buffer,\n                                         const Index *indices, int size,\n                                         int numPlanes) {\n  int ILPStrideY[NumILP];\n  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;\n  outFeatures += blockIdx.x * NumTLP;\n  buffer += blockIdx.x * NumTLP;\n  scalar_t buf[vecloadFactor];\n  scalar_t buf2[vecloadFactor];\n  Index idx;\n  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      reinterpret_cast<VecType *>(buf)[0] =\n          reinterpret_cast<VecType *>(outFeatures)[idx];\n      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(\n          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];\n#pragma unroll\n      for (int i = 0; i < vecloadFactor; i++) {\n        buf[i] += buf2[i];\n      }\n      reinterpret_cast<VecType *>(outFeatures)[idx] =\n          
reinterpret_cast<VecType *>(buf)[0];\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/stack_ball_query_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n#ifndef STACK_BALL_QUERY_MUSA_KERNEL_MUH\n#define STACK_BALL_QUERY_MUSA_KERNEL_MUH\n\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void stack_ball_query_forward_musa_kernel(\n    int B, int M, float radius, int nsample, const T *new_xyz,\n    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,\n    int *idx) {\n  // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features\n  // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]\n  // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query\n  // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]\n  // output:\n  //      idx: (M, nsample)\n  const T *cur_xyz = xyz;\n  int *cur_idx = idx;\n  MUSA_1D_KERNEL_LOOP(pt_idx, M) {\n    int bs_idx = 0;\n    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {\n      pt_cnt += new_xyz_batch_cnt[bs_idx];\n      if (pt_idx < pt_cnt) break;\n    }\n\n    int xyz_batch_start_idx = 0;\n    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];\n\n    const T *new_xyz_p = new_xyz + pt_idx * 3;\n    cur_xyz += xyz_batch_start_idx * 3;\n    cur_idx += pt_idx * nsample;\n\n    float radius2 = radius * radius;\n    T new_x = new_xyz_p[0];\n    T new_y = new_xyz_p[1];\n    T new_z = new_xyz_p[2];\n    int n = xyz_batch_cnt[bs_idx];\n\n    int cnt = 0;\n    for (int k = 0; k < n; ++k) {\n      T x = cur_xyz[k * 3 + 0];\n      T y = cur_xyz[k * 3 + 1];\n      T z = cur_xyz[k * 3 + 2];\n      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +\n             (new_z - z) * (new_z - z);\n      if (d2 < radius2) {\n        if (cnt == 0) {\n          for (int l = 0; l < nsample; ++l) {\n            cur_idx[l] = k;\n          }\n        }\n        cur_idx[cnt] = k;\n        ++cnt;\n        if (cnt >= nsample) break;\n      }\n    }\n    if 
(cnt == 0) cur_idx[0] = -1;\n  }\n}\n\n#endif  // STACK_BALL_QUERY_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/stack_group_points_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#ifndef STACK_GROUP_POINTS_MUSA_KERNEL_MUH\n#define STACK_GROUP_POINTS_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n#include <stdio.h>\ntemplate <typename T>\n__global__ void stack_group_points_forward_musa_kernel(\n    int b, int c, int m, int nsample, const T *features,\n    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,\n    T *out) {\n  // :param features: (N1 + N2 ..., C) tensor of features to group\n  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the\n  // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor\n  // containing the indices of features to group with :param idx_batch_cnt:\n  // (batch_size) [M1 + M2 ...] tensor containing the indices of features to\n  // group with :return:\n  //     output: (M1 + M2, C, nsample) tensor\n  MUSA_1D_KERNEL_LOOP(index, m * c * nsample) {\n    const T *cur_features = features;\n    const int *cur_idx = idx;\n    int sample_idx = index % nsample;\n    int c_idx = (index / nsample) % c;\n    int pt_idx = (index / nsample / c);\n\n    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;\n    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];\n    for (int k = 1; k < b; k++) {\n      if (pt_idx < pt_cnt) break;\n      pt_cnt += idx_batch_cnt[k];\n      bs_idx = k;\n    }\n\n    int features_batch_start_idx = 0;\n    int features_batch_end_idx = features_batch_cnt[0];\n    for (int k = 0; k < bs_idx; k++) {\n      features_batch_start_idx += features_batch_cnt[k];\n      features_batch_end_idx =\n          features_batch_start_idx + features_batch_cnt[k + 1];\n    }\n    cur_features += features_batch_start_idx * c;\n\n    cur_idx += pt_idx * nsample + sample_idx;\n    int in_idx = cur_idx[0] * c + c_idx;\n    int out_idx = pt_idx * c * nsample + c_idx 
* nsample + sample_idx;\n    if (in_idx < features_batch_end_idx * c) {\n      out[out_idx] = cur_features[in_idx];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void stack_group_points_backward_musa_kernel(\n    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,\n    const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) {\n  // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the\n  // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing\n  // the indices of features to group with :param idx_batch_cnt: (batch_size)\n  // [M1 + M2 ...] tensor containing the indices of features to group with\n  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the\n  // indices of features to group with :return:\n  //     grad_features: (N1 + N2 ..., C) gradient of the features\n  MUSA_1D_KERNEL_LOOP(index, m * c * nsample) {\n    const T *cur_grad_out = grad_out;\n    const int *cur_idx = idx;\n    T *cur_grad_features = grad_features;\n    int sample_idx = index % nsample;\n    int c_idx = (index / nsample) % c;\n    int pt_idx = (index / nsample / c);\n\n    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;\n\n    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];\n    for (int k = 1; k < b; k++) {\n      if (pt_idx < pt_cnt) break;\n      pt_cnt += idx_batch_cnt[k];\n      bs_idx = k;\n    }\n\n    int features_batch_start_idx = 0;\n    for (int k = 0; k < bs_idx; k++)\n      features_batch_start_idx += features_batch_cnt[k];\n\n    cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx;\n    cur_idx += pt_idx * nsample + sample_idx;\n    cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx;\n\n    atomicAdd(cur_grad_features, cur_grad_out[0]);\n  }\n}\n\n#endif  // STACK_GROUP_POINTS_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/sync_bn_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SYNCBN_MUSA_KERNEL_MUH\n#define SYNCBN_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void sync_bn_forward_mean_musa_kernel(const T *input, float *mean,\n                                                 int num, int channels,\n                                                 int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer[tid] += input[index];\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    mean[c] = buffer[0] / total;\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_forward_mean_musa_kernel(const phalf *input,\n                                                 float *mean, int num,\n                                                 int channels, int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer[tid] += static_cast<float>(input[index]);\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    mean[c] = buffer[0] / total;\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_forward_var_musa_kernel(const T *input,\n                                                const float *mean, float *var,\n                                                int num, int 
channels,\n                                                int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    float td = input[index] - mean[c];\n    buffer[tid] += td * td;\n  }\n  __syncthreads();\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    var[c] = buffer[0] / total;\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_forward_var_musa_kernel(const phalf *input,\n                                                const float *mean, float *var,\n                                                int num, int channels,\n                                                int spatial) {\n  __shared__ float buffer[THREADS_PER_BLOCK];\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    float td = static_cast<float>(input[index]) - mean[c];\n    buffer[tid] += td * td;\n  }\n  __syncthreads();\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer[tid] += buffer[tid + s];\n    }\n    __syncthreads();\n  }\n  int total = num * spatial;\n  if (tid == 0) {\n    var[c] = buffer[0] / total;\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_forward_output_musa_kernel(\n    const T *input, const float *mean, const float *var, float *running_mean,\n    float *running_var, const float *weight, const float *bias, float *norm,\n    float *std, T *output, int num, int channels, int spatial, float eps,\n    float momentum, int group_size) {\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  float mean_value = mean[c];\n  float 
std_value = sqrt(var[c] + eps);\n\n  if (weight != nullptr) {\n    float weight_value = weight[c];\n    float bias_value = bias[c];\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        norm[index] = (input[index] - mean_value) / std_value;\n        output[index] = norm[index] * weight_value + bias_value;\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] =\n            (input[index] - mean_value) / std_value * weight_value + bias_value;\n      }\n    }\n  } else {\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] = norm[index] = (input[index] - mean_value) / std_value;\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] = (input[index] - mean_value) / std_value;\n      }\n    }\n  }\n  if (tid == 0) {\n    if (std != nullptr) std[c] = std_value;\n    if (running_mean != nullptr) {\n      running_mean[c] =\n          momentum * mean_value + (1 - momentum) * running_mean[c];\n      int count = num * spatial * group_size;\n      float var_unbias = count > 1 ? 
var[c] * count / (count - 1) : var[c];\n      running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];\n    }\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_forward_output_musa_kernel(\n    const phalf *input, const float *mean, const float *var,\n    float *running_mean, float *running_var, const float *weight,\n    const float *bias, float *norm, float *std, phalf *output, int num,\n    int channels, int spatial, float eps, float momentum, int group_size) {\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  float mean_value = mean[c];\n  float std_value = sqrt(var[c] + eps);\n  if (weight != nullptr) {\n    float weight_value = weight[c];\n    float bias_value = bias[c];\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        norm[index] =\n            (static_cast<float>(input[index]) - mean_value) / std_value;\n        output[index] =\n            static_cast<phalf>(norm[index] * weight_value + bias_value);\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        output[index] =\n            static_cast<phalf>((static_cast<float>(input[index]) - mean_value) /\n                                   std_value * weight_value +\n                               bias_value);\n      }\n    }\n  } else {\n    if (norm != nullptr) {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * channels * spatial + c * spatial + i % spatial;\n        norm[index] =\n            (static_cast<float>(input[index]) - mean_value) / std_value;\n        output[index] = static_cast<phalf>(norm[index]);\n      }\n    } else {\n      for (int i = tid; i < num * spatial; i += blockDim.x) {\n        int index =\n            (i / spatial) * 
channels * spatial + c * spatial + i % spatial;\n        output[index] = static_cast<phalf>(\n            (static_cast<float>(input[index]) - mean_value) / std_value);\n      }\n    }\n  }\n  if (tid == 0) {\n    if (std != nullptr) std[c] = std_value;\n    if (running_mean != nullptr) {\n      running_mean[c] =\n          momentum * mean_value + (1 - momentum) * running_mean[c];\n      int count = num * spatial * group_size;\n      float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];\n      running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_backward_param_musa_kernel(const T *grad_output,\n                                                   const float *norm,\n                                                   float *grad_weight,\n                                                   float *grad_bias, int num,\n                                                   int channels, int spatial) {\n  __shared__ float buffer1[THREADS_PER_BLOCK];\n  __shared__ float buffer2[THREADS_PER_BLOCK];\n\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer1[tid] = buffer2[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer1[tid] += grad_output[index] * norm[index];\n    buffer2[tid] += grad_output[index];\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer1[tid] += buffer1[tid + s];\n      buffer2[tid] += buffer2[tid + s];\n    }\n    __syncthreads();\n  }\n  if (tid == 0) {\n    grad_weight[c] = buffer1[0];\n    grad_bias[c] = buffer2[0];\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_backward_param_musa_kernel(const phalf *grad_output,\n                                                   const float *norm,\n                                                   float *grad_weight,\n                                 
                  float *grad_bias, int num,\n                                                   int channels, int spatial) {\n  __shared__ float buffer1[THREADS_PER_BLOCK];\n  __shared__ float buffer2[THREADS_PER_BLOCK];\n\n  int tid = threadIdx.x;\n  int c = blockIdx.x;\n  buffer1[tid] = buffer2[tid] = 0;\n  for (int i = tid; i < num * spatial; i += blockDim.x) {\n    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;\n    buffer1[tid] += static_cast<float>(grad_output[index]) * norm[index];\n    buffer2[tid] += static_cast<float>(grad_output[index]);\n  }\n  __syncthreads();\n\n  for (int s = blockDim.x / 2; s > 0; s >>= 1) {\n    if (tid < s) {\n      buffer1[tid] += buffer1[tid + s];\n      buffer2[tid] += buffer2[tid + s];\n    }\n    __syncthreads();\n  }\n  if (tid == 0) {\n    grad_weight[c] = buffer1[0];\n    grad_bias[c] = buffer2[0];\n  }\n}\n\ntemplate <typename T>\n__global__ void sync_bn_backward_data_musa_kernel(\n    int output_size, const T *grad_output, const float *weight,\n    const float *grad_weight, const float *grad_bias, const float *norm,\n    const float *std, T *grad_input, int num, int channels, int spatial) {\n  int factor = num * spatial;\n  MUSA_1D_KERNEL_LOOP(index, output_size) {\n    int c = (index / spatial) % channels;\n    grad_input[index] =\n        weight[c] *\n        (grad_output[index] -\n         (grad_weight[c] * norm[index] + grad_bias[c]) / factor) /\n        std[c];\n  }\n}\n\ntemplate <>\n__global__ void sync_bn_backward_data_musa_kernel(\n    int output_size, const phalf *grad_output, const float *weight,\n    const float *grad_weight, const float *grad_bias, const float *norm,\n    const float *std, phalf *grad_input, int num, int channels, int spatial) {\n  int factor = num * spatial;\n  MUSA_1D_KERNEL_LOOP(index, output_size) {\n    int c = (index / spatial) % channels;\n    grad_input[index] = static_cast<phalf>(\n        weight[c] *\n        (static_cast<float>(grad_output[index]) -\n 
        (grad_weight[c] * norm[index] + grad_bias[c]) / factor) /\n        std[c]);\n  }\n}\n\n#endif  // SYNCBN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/three_interpolate_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef THREE_INTERPOLATE_MUSA_KERNEL_MUH\n#define THREE_INTERPOLATE_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void three_interpolate_forward_musa_kernel(\n    int b, int c, int m, int n, const T *points, const int *__restrict__ idx,\n    const T *weight, T *out) {\n  // points: (B, C, M)\n  // idx: (B, N, 3)\n  // weight: (B, N, 3)\n  // output:\n  //      out: (B, C, N)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, n) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    weight += bs_idx * n * 3 + pt_idx * 3;\n    points += bs_idx * c * m + c_idx * m;\n    idx += bs_idx * n * 3 + pt_idx * 3;\n    out += bs_idx * c * n + c_idx * n;\n\n    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +\n                  weight[2] * points[idx[2]];\n  }\n}\n\ntemplate <typename T>\n__global__ void three_interpolate_backward_musa_kernel(\n    int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx,\n    const T *weight, T *grad_points) {\n  // grad_out: (B, C, N)\n  // weight: (B, N, 3)\n  // output:\n  //      grad_points: (B, C, M)\n\n  int bs_idx = blockIdx.z;\n  int c_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, n) {\n    if (bs_idx >= b || c_idx >= c) return;\n\n    grad_out += bs_idx * c * n + c_idx * n + pt_idx;\n    weight += bs_idx * n * 3 + pt_idx * 3;\n    grad_points += bs_idx * c * m + c_idx * m;\n    idx += bs_idx * n * 3 + pt_idx * 3;\n\n    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);\n    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);\n    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);\n  }\n}\n\n#endif  // THREE_INTERPOLATE_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/three_nn_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef THREE_NN_MUSA_KERNEL_MUH\n#define THREE_NN_MUSA_KERNEL_MUH\n\n\n#include \"pytorch_musa_helper.hpp\"\ntemplate <typename T>\n__global__ void three_nn_forward_musa_kernel(int b, int n, int m,\n                                             const T *unknown, const T *known,\n                                             T *dist2, int *__restrict__ idx) {\n  // unknown: (B, N, 3)\n  // known: (B, M, 3)\n  // output:\n  //      dist2: (B, N, 3)\n  //      idx: (B, N, 3)\n\n  int bs_idx = blockIdx.y;\n  MUSA_1D_KERNEL_LOOP(pt_idx, n) {\n    if (bs_idx >= b) return;\n\n    unknown += bs_idx * n * 3 + pt_idx * 3;\n    known += bs_idx * m * 3;\n    dist2 += bs_idx * n * 3 + pt_idx * 3;\n    idx += bs_idx * n * 3 + pt_idx * 3;\n\n    T ux = unknown[0];\n    T uy = unknown[1];\n    T uz = unknown[2];\n\n    double best1 = 1e40, best2 = 1e40, best3 = 1e40;\n    int besti1 = 0, besti2 = 0, besti3 = 0;\n    for (int k = 0; k < m; ++k) {\n      T x = known[k * 3 + 0];\n      T y = known[k * 3 + 1];\n      T z = known[k * 3 + 2];\n      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);\n      if (d < best1) {\n        best3 = best2;\n        besti3 = besti2;\n        best2 = best1;\n        besti2 = besti1;\n        best1 = d;\n        besti1 = k;\n      } else if (d < best2) {\n        best3 = best2;\n        besti3 = besti2;\n        best2 = d;\n        besti2 = k;\n      } else if (d < best3) {\n        best3 = d;\n        besti3 = k;\n      }\n    }\n    dist2[0] = best1;\n    dist2[1] = best2;\n    dist2[2] = best3;\n    idx[0] = besti1;\n    idx[1] = besti2;\n    idx[2] = besti3;\n  }\n}\n\n#endif  // THREE_NN_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/tin_shift_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef TIN_SHIFT_MUSA_KERNEL_MUH\n#define TIN_SHIFT_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename T>\n__global__ void tin_shift_forward_musa_kernel(\n    const int nthreads, const T* input, const int* shift, T* output,\n    const int batch_size, const int channels, const int t_size,\n    const int hw_size, const int group_size, const int group_channel) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    const int hw_index = index % hw_size;\n    const int j = (index / hw_size) % channels;\n\n    const int n_index = (index / hw_size / channels) % batch_size;\n    int group_id = j / group_channel;\n    int t_shift = shift[n_index * group_size + group_id];\n    int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index;\n    for (int i = 0; i < t_size; i++) {\n      int now_t = i + t_shift;\n      int data_id = i * hw_size * channels + offset;\n      if (now_t < 0 || now_t >= t_size) {\n        continue;\n      }\n      int out_id = now_t * hw_size * channels + offset;\n      output[out_id] = input[data_id];\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void tin_shift_backward_musa_kernel(\n    const int nthreads, const T* input, const int* shift, T* output,\n    const int batch_size, const int channels, const int t_size,\n    const int hw_size, const int group_size, const int group_channel) {\n  MUSA_1D_KERNEL_LOOP(index, nthreads) {\n    const int hw_index = index % hw_size;\n    const int j = (index / hw_size) % channels;\n\n    const int n_index = (index / hw_size / channels) % batch_size;\n    int group_id = j / group_channel;\n    int t_shift = shift[n_index * group_size + group_id];\n    int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index;\n    for (int i = 0; i < t_size; i++) {\n      int now_t = i + t_shift;\n      int data_id = i * hw_size * channels + offset;\n      if (now_t < 0 || now_t >= t_size) {\n        
continue;\n      }\n      int out_id = now_t * hw_size * channels + offset;\n      output[out_id] = input[data_id];\n    }\n  }\n}\n\n#endif  // TIN_SHIFT_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/musa/voxelization_musa_kernel.muh",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#ifndef VOXELIZATION_MUSA_KERNEL_MUH\n#define VOXELIZATION_MUSA_KERNEL_MUH\n\n#include \"pytorch_musa_helper.hpp\"\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\n\ntemplate <typename T, typename T_int>\n__global__ void dynamic_voxelize_kernel(\n    const T* points, T_int* coors, const float voxel_x, const float voxel_y,\n    const float voxel_z, const float coors_x_min, const float coors_y_min,\n    const float coors_z_min, const float coors_x_max, const float coors_y_max,\n    const float coors_z_max, const int grid_x, const int grid_y,\n    const int grid_z, const int num_points, const int num_features,\n    const int NDim) {\n  //   const int index = blockIdx.x * threadsPerBlock + threadIdx.x;\n  MUSA_1D_KERNEL_LOOP(index, num_points) {\n    // To save some computation\n    auto points_offset = points + index * num_features;\n    auto coors_offset = coors + index * NDim;\n    int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);\n    if (c_x < 0 || c_x >= grid_x) {\n      coors_offset[0] = -1;\n      continue;\n    }\n\n    int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);\n    if (c_y < 0 || c_y >= grid_y) {\n      coors_offset[0] = -1;\n      coors_offset[1] = -1;\n      continue;\n    }\n\n    int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);\n    if (c_z < 0 || c_z >= grid_z) {\n      coors_offset[0] = -1;\n      coors_offset[1] = -1;\n      coors_offset[2] = -1;\n    } else {\n      coors_offset[0] = c_z;\n      coors_offset[1] = c_y;\n      coors_offset[2] = c_x;\n    }\n  }\n}\n\ntemplate <typename T, typename T_int>\n__global__ void assign_point_to_voxel(const int nthreads, const T* points,\n                                      T_int* point_to_voxelidx,\n                                      T_int* coor_to_voxelidx, T* voxels,\n                                      const int max_points,\n                                      const int 
num_features,\n                                      const int num_points, const int NDim) {\n  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;\n    int index = thread_idx / num_features;\n\n    int num = point_to_voxelidx[index];\n    int voxelidx = coor_to_voxelidx[index];\n    if (num > -1 && voxelidx > -1) {\n      auto voxels_offset =\n          voxels + voxelidx * max_points * num_features + num * num_features;\n\n      int k = thread_idx % num_features;\n      voxels_offset[k] = points[thread_idx];\n    }\n  }\n}\n\ntemplate <typename T, typename T_int>\n__global__ void assign_voxel_coors(const int nthreads, T_int* coor,\n                                   T_int* point_to_voxelidx,\n                                   T_int* coor_to_voxelidx, T_int* voxel_coors,\n                                   const int num_points, const int NDim) {\n  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;\n    // if (index >= num_points) return;\n    int index = thread_idx / NDim;\n    int num = point_to_voxelidx[index];\n    int voxelidx = coor_to_voxelidx[index];\n    if (num == 0 && voxelidx > -1) {\n      auto coors_offset = voxel_coors + voxelidx * NDim;\n      int k = thread_idx % NDim;\n      coors_offset[k] = coor[thread_idx];\n    }\n  }\n}\n\ntemplate <typename T_int>\n__global__ void point_to_voxelidx_kernel(const T_int* coor,\n                                         T_int* point_to_voxelidx,\n                                         T_int* point_to_pointidx,\n                                         const int max_points,\n                                         const int max_voxels,\n                                         const int num_points, const int NDim) {\n  MUSA_1D_KERNEL_LOOP(index, num_points) {\n    auto coor_offset = coor + index * NDim;\n    // skip invalid points\n    if (coor_offset[0] == -1) continue;\n\n    int num = 
0;\n    int coor_x = coor_offset[0];\n    int coor_y = coor_offset[1];\n    int coor_z = coor_offset[2];\n    // only calculate the coors before this coor[index]\n    for (int i = 0; i < index; ++i) {\n      auto prev_coor = coor + i * NDim;\n      if (prev_coor[0] == -1) continue;\n\n      // Find all previous points that have the same coors\n      // if find the same coor, record it\n      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&\n          (prev_coor[2] == coor_z)) {\n        num++;\n        if (num == 1) {\n          // point to the same coor that first show up\n          point_to_pointidx[index] = i;\n        } else if (num >= max_points) {\n          // out of boundary\n          break;\n        }\n      }\n    }\n    if (num == 0) {\n      point_to_pointidx[index] = index;\n    }\n    if (num < max_points) {\n      point_to_voxelidx[index] = num;\n    }\n  }\n}\n\ntemplate <typename T_int>\n__global__ void determin_voxel_num(\n    // const T_int* coor,\n    T_int* num_points_per_voxel, T_int* point_to_voxelidx,\n    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,\n    const int max_points, const int max_voxels, const int num_points) {\n  // only calculate the coors before this coor[index]\n  for (int i = 0; i < num_points; ++i) {\n    int point_pos_in_voxel = point_to_voxelidx[i];\n    // record voxel\n    if (point_pos_in_voxel == -1) {\n      // out of max_points or invalid point\n      continue;\n    } else if (point_pos_in_voxel == 0) {\n      // record new voxel\n      int voxelidx = voxel_num[0];\n      if (voxel_num[0] >= max_voxels) continue;\n      voxel_num[0] += 1;\n      coor_to_voxelidx[i] = voxelidx;\n      num_points_per_voxel[voxelidx] = 1;\n    } else {\n      int point_idx = point_to_pointidx[i];\n      int voxelidx = coor_to_voxelidx[point_idx];\n      if (voxelidx != -1) {\n        coor_to_voxelidx[i] = voxelidx;\n        num_points_per_voxel[voxelidx] += 1;\n      }\n    }\n  }\n}\n\n__global__ 
void nondeterministic_get_assign_pos(\n    const int nthreads, const int32_t* coors_map, int32_t* pts_id,\n    int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) {\n  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    int coors_idx = coors_map[thread_idx];\n    if (coors_idx > -1) {\n      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);\n      pts_id[thread_idx] = coors_pts_pos;\n      if (coors_pts_pos == 0) {\n        coors_order[coors_idx] = atomicAdd(coors_count, 1);\n      }\n    }\n  }\n}\n\ntemplate <typename T>\n__global__ void nondeterministic_assign_point_voxel(\n    const int nthreads, const T* points, const int32_t* coors_map,\n    const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count,\n    const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count,\n    const int max_voxels, const int max_points, const int num_features,\n    const int NDim) {\n  MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) {\n    int coors_idx = coors_map[thread_idx];\n    int coors_pts_pos = pts_id[thread_idx];\n    if (coors_idx > -1 && coors_pts_pos < max_points) {\n      int coors_pos = coors_order[coors_idx];\n      if (coors_pos < max_voxels) {\n        auto voxels_offset =\n            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;\n        auto points_offset = points + thread_idx * num_features;\n        for (int k = 0; k < num_features; k++) {\n          voxels_offset[k] = points_offset[k];\n        }\n        if (coors_pts_pos == 0) {\n          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);\n          auto coors_offset = coors + coors_pos * NDim;\n          auto coors_in_offset = coors_in + coors_idx * NDim;\n          for (int k = 0; k < NDim; k++) {\n            coors_offset[k] = coors_in_offset[k];\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif  // VOXELIZATION_MUSA_KERNEL_MUH\n"
  },
  {
    "path": "mmcv/ops/csrc/common/parrots_cpp_helper.hpp",
    "content": "#ifndef PARROTS_CPP_HELPER\n#define PARROTS_CPP_HELPER\n#include <parrots/darray/darraymath.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/darraylite.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n#include <vector>\n\nusing namespace parrots;\n\n#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \\\n  case prim_type: {                                     \\\n    using scalar_t = type;                              \\\n    return __VA_ARGS__();                               \\\n  }\n\n#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...)                  \\\n  [&] {                                                             \\\n    const auto& the_type = TYPE;                                    \\\n    switch (the_type) {                                             \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)  \\\n      default:                                                      \\\n        PARROTS_NOTSUPPORTED;                                       \\\n    }                                                               \\\n  }()\n\n#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...)          \\\n  [&] {                                                              \\\n    const auto& the_type = TYPE;                                     \\\n    switch (the_type) {                                              \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__)  \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)   \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \\\n      default:                                                       \\\n        PARROTS_NOTSUPPORTED;                                        \\\n    }                                                                \\\n  }()\n\n#endif  // PARROTS_CPP_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/parrots_cuda_helper.hpp",
    "content": "#ifndef PARROTS_CUDA_HELPER\n#define PARROTS_CUDA_HELPER\n\n#include <cuda.h>\n#include <float.h>\n\n#include <parrots/darray/darraymath.hpp>\n#include <parrots/darray/mathfunctions.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/darrayutil.hpp>\n#include <parrots/foundation/exceptions.hpp>\n#include <parrots/foundation/float16.hpp>\n#include <parrots/foundation/mathfunction.hpp>\n\n#include \"common_cuda_helper.hpp\"\n#include \"parrots_cudawarpfunction.cuh\"\n\nusing namespace parrots;\nusing phalf = float16;\n\n#define __PHALF(x) (x.y)\n\n#define PARROTS_CUDA_CHECK(exp)                         \\\n  do {                                                  \\\n    cudaError_t err = exp;                              \\\n    if (err != cudaSuccess) {                           \\\n      fprintf(stderr, \"cudaCheckError() failed : %s\\n\", \\\n              cudaGetErrorString(err));                 \\\n      exit(-1);                                         \\\n    }                                                   \\\n  } while (0)\n\n#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \\\n  case prim_type: {                                     \\\n    using scalar_t = type;                              \\\n    return __VA_ARGS__();                               \\\n  }\n\n#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...)                  
\\\n  [&] {                                                             \\\n    const auto& the_type = TYPE;                                    \\\n    switch (the_type) {                                             \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)  \\\n      default:                                                      \\\n        PARROTS_NOTSUPPORTED;                                       \\\n    }                                                               \\\n  }()\n\n#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...)          \\\n  [&] {                                                              \\\n    const auto& the_type = TYPE;                                     \\\n    switch (the_type) {                                              \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__)  \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__)   \\\n      PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \\\n      default:                                                       \\\n        PARROTS_NOTSUPPORTED;                                        \\\n    }                                                                \\\n  }()\n\n/** atomicAdd **/\n#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600\n\nstatic __inline__ __device__ double atomicAdd(double* address, double val) {\n  unsigned long long int* address_as_ull = (unsigned long long int*)address;\n  unsigned long long int old = *address_as_ull, assumed;\n  if (val == 0.0) return __longlong_as_double(old);\n  do {\n    assumed = old;\n    old = atomicCAS(address_as_ull, assumed,\n                    __double_as_longlong(val + __longlong_as_double(assumed)));\n  } while (assumed != old);\n  return __longlong_as_double(old);\n}\n\n#endif\n\nstatic __inline__ __device__ float16 atomicAdd(float16* address, float16 val) {\n  unsigned int* 
aligned =\n      (unsigned int*)((size_t)address - ((size_t)address & 2));\n  unsigned int old = *aligned;\n  unsigned int assumed;\n  unsigned short old_as_us;\n  do {\n    assumed = old;\n    old_as_us =\n        (unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff);\n\n#if __CUDACC_VER_MAJOR__ >= 9\n    float16 tmp;\n    tmp.x = old_as_us;\n    float16 sum = tmp + val;\n    unsigned short sum_as_us = sum.x;\n//         half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us))\n//         + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum);\n#else\n    unsigned short sum_as_us =\n        __float2half_rn(__half2float(old_as_us) + (float)(val));\n#endif\n\n    unsigned int sum_as_ui = (size_t)address & 2\n                                 ? (sum_as_us << 16) | (old & 0xffff)\n                                 : (old & 0xffff0000) | sum_as_us;\n    old = atomicCAS(aligned, assumed, sum_as_ui);\n  } while (assumed != old);\n  //__half_raw raw = {old_as_us};\n  // return float16(raw);\n  return *reinterpret_cast<float16*>(&old_as_us);\n}\n#endif  // PARROTS_CUDA_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_cpp_helper.hpp",
    "content": "#ifndef PYTORCH_CPP_HELPER\n#define PYTORCH_CPP_HELPER\n#include <torch/types.h>\n\n#include <vector>\n\nusing namespace at;\n\n#define CHECK_CUDA(x) \\\n  TORCH_CHECK(x.device().is_cuda(), #x \" must be a CUDA tensor\")\n#define CHECK_MLU(x) \\\n  TORCH_CHECK(x.device().type() == at::kMLU, #x \" must be a MLU tensor\")\n#define CHECK_MUSA(x) \\\n  TORCH_CHECK(x.device().is_privateuseone(), #x \" must be a MUSA tensor\")\n#define CHECK_CPU(x) \\\n  TORCH_CHECK(x.device().type() == at::kCPU, #x \" must be a CPU tensor\")\n#define CHECK_CONTIGUOUS(x) \\\n  TORCH_CHECK(x.is_contiguous(), #x \" must be contiguous\")\n#define CHECK_CUDA_INPUT(x) \\\n  CHECK_CUDA(x);            \\\n  CHECK_CONTIGUOUS(x)\n#define CHECK_MLU_INPUT(x) \\\n  CHECK_MLU(x);            \\\n  CHECK_CONTIGUOUS(x)\n#define CHECK_MUSA_INPUT(x) \\\n  CHECK_MUSA(x);            \\\n  CHECK_CONTIGUOUS(x)\n#define CHECK_CPU_INPUT(x) \\\n  CHECK_CPU(x);            \\\n  CHECK_CONTIGUOUS(x)\n\n#endif  // PYTORCH_CPP_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_cuda_helper.hpp",
    "content": "#ifndef PYTORCH_CUDA_HELPER\n#define PYTORCH_CUDA_HELPER\n\n#include <ATen/ATen.h>\n#ifdef MMCV_WITH_MUSA\n#include \"common_musa_helper.hpp\"\n#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"\n#include \"torch_musa/csrc/core/MUSAGuard.h\"\n#include \"torch_musa/share/generated_cuda_compatible/aten/src/THC/THCAtomics.muh\"\n#include \"torch_musa/share/generated_cuda_compatible/include/ATen/musa/MUSA_PORT_ApplyUtils.muh\"\n#else\n#include <ATen/cuda/CUDAContext.h>\n#include <c10/cuda/CUDAGuard.h>\n\n#include <ATen/cuda/CUDAApplyUtils.cuh>\n#include <THC/THCAtomics.cuh>\n\n#include \"common_cuda_helper.hpp\"\n#endif\n\nusing at::Half;\nusing at::Tensor;\nusing phalf = at::Half;\n\n#define __PHALF(x) (x)\n#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))\n\n#endif  // PYTORCH_CUDA_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_device_registry.hpp",
    "content": "#ifndef PYTORCH_DEVICE_REGISTRY_H\n#define PYTORCH_DEVICE_REGISTRY_H\n\n// Using <torch/extension.h> is recommended in the official documentation in\n// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.\n// However, we use <torch/types.h> for compatibility with CUDA 9.0\n// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.\n#include <torch/types.h>\n\n#include <cassert>\n#include <functional>\n#include <map>\n#include <type_traits>\n\n#ifdef MMCV_WITH_MUSA\n#include \"torch_musa/csrc/aten/utils/Utils.h\"\n#endif\n\ninline std::string GetDeviceStr(const at::Device& device) {\n  std::string str = DeviceTypeName(device.type(), true);\n  if (device.has_index()) {\n    str.push_back(':');\n    str.append(std::to_string(device.index()));\n  }\n  return str;\n}\n\n// Registry\ntemplate <typename F, F f>\nclass DeviceRegistry;\n\ntemplate <typename Ret, typename... Args, Ret (*f)(Args...)>\nclass DeviceRegistry<Ret (*)(Args...), f> {\n public:\n  using FunctionType = Ret (*)(Args...);\n  static const int MAX_DEVICE_TYPES =\n      int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);\n\n  void Register(at::DeviceType device, FunctionType function) {\n    funcs_[int8_t(device)] = function;\n  }\n\n  FunctionType Find(at::DeviceType device) const {\n    return funcs_[int8_t(device)];\n  }\n\n  static DeviceRegistry& instance() {\n    static DeviceRegistry inst;\n    return inst;\n  }\n\n private:\n  DeviceRegistry() {\n    for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) {\n      funcs_[i] = nullptr;\n    }\n  };\n  FunctionType funcs_[MAX_DEVICE_TYPES];\n};\n\n// get device of first tensor param\n\ntemplate <typename T, typename... Args,\n          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,\n                           bool> = true>\nat::Device GetFirstTensorDevice(T&& t, Args&&... args) {\n  return std::forward<T>(t).device();\n}\ntemplate <typename T, typename... 
Args,\n          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,\n                           bool> = true>\nat::Device GetFirstTensorDevice(T&& t, Args&&... args) {\n  return GetFirstTensorDevice(std::forward<Args>(args)...);\n}\n\n// check device consistency\n\ninline std::pair<int, at::Device> CheckDeviceConsistency(\n    const at::Device& device, int index) {\n  return {index, device};\n}\n\ntemplate <typename T, typename... Args,\n          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,\n                           bool> = true>\nstd::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,\n                                                  int index, T&& t,\n                                                  Args&&... args);\n\ntemplate <typename T, typename... Args,\n          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,\n                           bool> = true>\nstd::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,\n                                                  int index, T&& t,\n                                                  Args&&... args) {\n  auto new_device = std::forward<T>(t).device();\n  if (new_device.type() != device.type() ||\n      new_device.index() != device.index()) {\n    return {index, new_device};\n  }\n  return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);\n}\n\ntemplate <\n    typename T, typename... Args,\n    std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>\nstd::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,\n                                                  int index, T&& t,\n                                                  Args&&... args) {\n  return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);\n}\n\n// dispatch\n\ntemplate <typename R, typename... Args>\nauto Dispatch(const R& registry, const char* name, Args&&... 
args) {\n  auto device = GetFirstTensorDevice(std::forward<Args>(args)...);\n  auto inconsist =\n      CheckDeviceConsistency(device, 0, std::forward<Args>(args)...);\n  TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, \": at param \",\n              inconsist.first,\n              \", inconsistent device: \", GetDeviceStr(inconsist.second).c_str(),\n              \" vs \", GetDeviceStr(device).c_str(), \"\\n\")\n  auto f_ptr = registry.Find(device.type());\n  TORCH_CHECK(f_ptr != nullptr, name, \": implementation for device \",\n              GetDeviceStr(device).c_str(), \" not found.\\n\")\n  return f_ptr(std::forward<Args>(args)...);\n}\n\n// helper macro\n\n#define DEVICE_REGISTRY(key) DeviceRegistry<decltype(&(key)), key>::instance()\n\n#define REGISTER_DEVICE_IMPL(key, device, value)           \\\n  struct key##_##device##_registerer {                     \\\n    key##_##device##_registerer() {                        \\\n      DEVICE_REGISTRY(key).Register(at::k##device, value); \\\n    }                                                      \\\n  };                                                       \\\n  static key##_##device##_registerer _##key##_##device##_registerer;\n\n#define DISPATCH_DEVICE_IMPL(key, ...) \\\n  Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__)\n\n#endif  // PYTORCH_DEVICE_REGISTRY_H\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_mlu_helper.hpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#ifndef PYTORCH_MLU_HELPER_HPP_\n#define PYTORCH_MLU_HELPER_HPP_\n\n#ifdef MMCV_WITH_MLU\n#include \"aten.h\"\n\n#define NFU_ALIGN_SIZE 128\n\n#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))\n\n#define PAD_DOWN(x, y) (((x) / (y)) * (y))\n\n#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))\n\n#define CEIL_ALIGN(x, y) (((x) + (y) - 1) / (y) * (y))\n\ninline int32_t getJobLimitCapability() {\n  CNcontext drv_ctx;\n  TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), \"cnCtxGetCurrent fails\");\n  CNctxConfigParam ctx_conf_param;\n  TORCH_CHECK(\n      CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT,\n                                        &ctx_conf_param),\n      \"cnGetCtxConfigParam fails.\");\n  return (int32_t)ctx_conf_param.unionLimit;\n}\n\ninline int32_t getCoreNumOfJobLimitCapability() {\n  switch (getJobLimitCapability()) {\n    default:\n      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) *\n             getJobLimitCapability();\n    case CN_KERNEL_CLASS_BLOCK:\n      return 1;\n    case CN_KERNEL_CLASS_UNION:\n      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);\n    case CN_KERNEL_CLASS_UNION2:\n      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2;\n    case CN_KERNEL_CLASS_UNION4:\n      return 
torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4;\n    case CN_KERNEL_CLASS_UNION8:\n      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8;\n    case CN_KERNEL_CLASS_UNION16:\n      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16;\n  }\n}\n\n#endif  // MMCV_WITH_MLU\n\n#endif  // PYTORCH_MLU_HELPER_HPP_\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_musa_helper.hpp",
    "content": "#ifndef PYTORCH_MUSA_HELPER\n#define PYTORCH_MUSA_HELPER\n\n#include <ATen/ATen.h>\n\n#include <ATen/musa/MUSA_PORT_ApplyUtils.muh>\n#include <THC/THCAtomics.muh>\n\n#include \"common_musa_helper.hpp\"\n#include \"torch_musa/csrc/aten/musa/Exceptions.h\"\n#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"\n#include \"torch_musa/csrc/core/MUSAGuard.h\"\n\nusing at::Half;\nusing at::Tensor;\nusing phalf = at::Half;\n\n#define __PHALF(x) (x)\n#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))\n\n#endif  // PYTORCH_MUSA_HELPER\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_npu_helper.hpp",
    "content": "/******************************************************************************\n * Copyright (c) 2022 Huawei Technologies Co., Ltd\n * All rights reserved.\n *\n * Licensed under the BSD 3-Clause License  (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * https://opensource.org/licenses/BSD-3-Clause\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef PYTORCH_NPU_HELPER_HPP_\n#define PYTORCH_NPU_HELPER_HPP_\n\n#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>\n#include <torch_npu/csrc/framework/utils/OpAdapter.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#include \"pytorch_npu_util.hpp\"\n\n#define NPU_NAME_SPACE at_npu::native\n\n#ifdef MMCV_WITH_XLA\n#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value)\n#else\n#define REGISTER_NPU_IMPL(key, value) \\\n  REGISTER_DEVICE_IMPL(key, PrivateUse1, value)\n#endif\n\n#ifdef MMCV_WITH_XLA\n#define CHECK_NPU(x) \\\n  TORCH_CHECK(x.device().type() == at::kXLA, #x \" must be a NPU tensor\")\n#else\n#define CHECK_NPU(x)                                    \\\n  TORCH_CHECK(x.device().type() == at::kPrivateUse1, #x \\\n              \" must be a NPU \"                         \\\n              \"tensor\")\n\n#endif\n#endif  // PYTORCH_NPU_HELPER_HPP_\n"
  },
  {
    "path": "mmcv/ops/csrc/common/pytorch_npu_util.hpp",
    "content": "/******************************************************************************\n * Copyright (c) 2022 Huawei Technologies Co., Ltd\n * All rights reserved.\n *\n * Licensed under the BSD 3-Clause License  (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * https://opensource.org/licenses/BSD-3-Clause\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_\n#define MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_\n\n#include <ATen/Tensor.h>\n#include <acl/acl_base.h>\n#include <acl/acl_rt.h>\n#include <c10/util/Exception.h>\n#include <dlfcn.h>\n#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>\n#include <torch_npu/csrc/framework/utils/OpAdapter.h>\n\n#include <functional>\n#include <type_traits>\n#include <vector>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#include \"torch_npu/csrc/aten/NPUNativeFunctions.h\"\n#include \"torch_npu/csrc/core/npu/NPUStream.h\"\n#include \"torch_npu/csrc/framework/OpCommand.h\"\n#include \"torch_npu/csrc/framework/interface/EnvVariables.h\"\n#include \"torch_npu/csrc/framework/utils/CalcuOpUtil.h\"\n#include \"torch_npu/csrc/framework/utils/OpPreparation.h\"\n\n#define NPU_NAME_SPACE at_npu::native\n\ntypedef struct aclOpExecutor aclOpExecutor;\ntypedef struct aclTensor aclTensor;\ntypedef struct aclScalar aclScalar;\ntypedef struct aclIntArray aclIntArray;\ntypedef struct aclFloatArray aclFloatArray;\ntypedef struct aclBoolArray aclBoolArray;\ntypedef struct aclTensorList 
aclTensorList;\n\ntypedef aclTensor *(*_aclCreateTensor)(\n    const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type,\n    const int64_t *stride, int64_t offset, aclFormat format,\n    const int64_t *storage_dims, uint64_t storage_dims_num, void *tensor_data);\ntypedef aclScalar *(*_aclCreateScalar)(void *value, aclDataType data_type);\ntypedef aclIntArray *(*_aclCreateIntArray)(const int64_t *value, uint64_t size);\ntypedef aclFloatArray *(*_aclCreateFloatArray)(const float *value,\n                                               uint64_t size);\ntypedef aclBoolArray *(*_aclCreateBoolArray)(const bool *value, uint64_t size);\ntypedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value,\n                                               uint64_t size);\n\ntypedef int (*_aclDestroyTensor)(const aclTensor *tensor);\ntypedef int (*_aclDestroyScalar)(const aclScalar *scalar);\ntypedef int (*_aclDestroyIntArray)(const aclIntArray *array);\ntypedef int (*_aclDestroyFloatArray)(const aclFloatArray *array);\ntypedef int (*_aclDestroyBoolArray)(const aclBoolArray *array);\ntypedef int (*_aclDestroyTensorList)(const aclTensorList *array);\n\nconstexpr int kHashBufSize = 8192;\nconstexpr int kHashBufMaxSize = kHashBufSize + 1024;\nextern thread_local char g_hashBuf[kHashBufSize];\nextern thread_local int g_hashOffset;\n\n#ifdef MMCV_WITH_XLA\n#define DEVICE_TYPE at_npu::key::NativeDeviceType\n#else\n#define DEVICE_TYPE c10::DeviceType::PrivateUse1\n#endif\n\n#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \\\n  _(at::ScalarType::Byte, ACL_UINT8)                \\\n  _(at::ScalarType::Char, ACL_INT8)                 \\\n  _(at::ScalarType::Short, ACL_INT16)               \\\n  _(at::ScalarType::Int, ACL_INT32)                 \\\n  _(at::ScalarType::Long, ACL_INT64)                \\\n  _(at::ScalarType::Half, ACL_FLOAT16)              \\\n  _(at::ScalarType::Float, ACL_FLOAT)               \\\n  _(at::ScalarType::Double, ACL_DOUBLE)          
   \\\n  _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED)  \\\n  _(at::ScalarType::ComplexFloat, ACL_COMPLEX64)    \\\n  _(at::ScalarType::ComplexDouble, ACL_COMPLEX128)  \\\n  _(at::ScalarType::Bool, ACL_BOOL)                 \\\n  _(at::ScalarType::QInt8, ACL_DT_UNDEFINED)        \\\n  _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED)       \\\n  _(at::ScalarType::QInt32, ACL_DT_UNDEFINED)       \\\n  _(at::ScalarType::BFloat16, ACL_BF16)             \\\n  _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED)     \\\n  _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED)     \\\n  _(at::ScalarType::Undefined, ACL_DT_UNDEFINED)    \\\n  _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED)\n\nconstexpr aclDataType kATenScalarTypeToAclDataTypeTable\n    [static_cast<int64_t>(at::ScalarType::NumOptions) + 1] = {\n#define DEFINE_ENUM(_1, n) n,\n        AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM)\n#undef DEFINE_ENUM\n};\n\n#define GET_OP_API_FUNC(apiName) \\\n  reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName))\n\n#define MEMCPY_TO_BUF(data_expression, size_expression)               \\\n  if (g_hashOffset + (size_expression) > kHashBufSize) {              \\\n    g_hashOffset = kHashBufMaxSize;                                   \\\n    return;                                                           \\\n  }                                                                   \\\n  memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression); \\\n  g_hashOffset += size_expression;\n\ninline const char *GetOpApiLibName(void) { return \"libopapi.so\"; }\n\ninline const char *GetCustOpApiLibName(void) { return \"libcust_opapi.so\"; }\n\ninline void *GetOpApiFuncAddrInLib(void *handler, const char *libName,\n                                   const char *apiName) {\n  auto funcAddr = dlsym(handler, apiName);\n  if (funcAddr == nullptr) {\n    ASCEND_LOGW(\"dlsym %s from %s failed, error:%s.\", apiName, libName,\n                dlerror());\n  }\n  return funcAddr;\n}\n\ninline 
void *GetOpApiLibHandler(const char *libName) {\n  auto handler = dlopen(libName, RTLD_LAZY);\n  if (handler == nullptr) {\n    ASCEND_LOGW(\"dlopen %s failed, error:%s.\", libName, dlerror());\n  }\n  return handler;\n}\n\ninline void *GetOpApiFuncAddr(const char *apiName) {\n  static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName());\n  if (custOpApiHandler != nullptr) {\n    auto funcAddr =\n        GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName);\n    if (funcAddr != nullptr) {\n      return funcAddr;\n    }\n  }\n\n  static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName());\n  if (opApiHandler == nullptr) {\n    return nullptr;\n  }\n  return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName);\n}\n\ninline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor) {\n  c10::Scalar expScalar;\n  const at::Tensor *aclInput = &tensor;\n  if (aclInput->scalar_type() == at::ScalarType::Double) {\n    double value = *(double *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::Long) {\n    int64_t value = *(int64_t *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::Float) {\n    float value = *(float *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::Int) {\n    int value = *(int *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::Half) {\n    c10::Half value = *(c10::Half *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::Bool) {\n    int8_t value = *(int8_t *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == 
at::ScalarType::ComplexDouble) {\n    c10::complex<double> value = *(c10::complex<double> *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::ComplexFloat) {\n    c10::complex<float> value = *(c10::complex<float> *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  } else if (aclInput->scalar_type() == at::ScalarType::BFloat16) {\n    c10::BFloat16 value = *(c10::BFloat16 *)aclInput->data_ptr();\n    c10::Scalar scalar(value);\n    expScalar = scalar;\n  }\n  return expScalar;\n}\n\ninline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor) {\n  at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory();\n  int deviceIndex = 0;\n  return cpuPinMemTensor.to(c10::Device(DEVICE_TYPE, deviceIndex),\n                            cpuPinMemTensor.scalar_type(), true, true);\n}\n\ninline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar,\n                                     at::ScalarType scalar_data_type) {\n  return CopyTensorHostToDevice(\n      scalar_to_tensor(cpu_scalar).to(scalar_data_type));\n}\n\ninline aclTensor *ConvertType(const at::Tensor &at_tensor) {\n  static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);\n  if (aclCreateTensor == nullptr) {\n    return nullptr;\n  }\n\n  if (!at_tensor.defined()) {\n    return nullptr;\n  }\n  at::ScalarType scalar_data_type = at_tensor.scalar_type();\n  aclDataType acl_data_type =\n      kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];\n  TORCH_CHECK(\n      acl_data_type != ACL_DT_UNDEFINED,\n      std::string(c10::toString(scalar_data_type)) + \" has not been supported\")\n  c10::SmallVector<int64_t, 5> storageDims;\n  // if acl_data_type is ACL_STRING, storageDims is empty.\n  auto itemsize = at_tensor.itemsize();\n  if (itemsize == 0) {\n    AT_ERROR(\"When ConvertType, tensor item size of cannot be zero.\");\n    return nullptr;\n  }\n  if 
(acl_data_type != ACL_STRING) {\n    storageDims.push_back(at_tensor.storage().nbytes() / itemsize);\n  }\n\n  const auto dimNum = at_tensor.sizes().size();\n  aclFormat format = ACL_FORMAT_ND;\n  switch (dimNum) {\n    case 3:\n      format = ACL_FORMAT_NCL;\n      break;\n    case 4:\n      format = ACL_FORMAT_NCHW;\n      break;\n    case 5:\n      format = ACL_FORMAT_NCDHW;\n      break;\n    default:\n      format = ACL_FORMAT_ND;\n  }\n\n  if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) {\n    c10::Scalar expScalar = ConvertTensorToScalar(at_tensor);\n    at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type);\n    return aclCreateTensor(aclInput.sizes().data(), aclInput.sizes().size(),\n                           acl_data_type, aclInput.strides().data(),\n                           aclInput.storage_offset(), format,\n                           storageDims.data(), storageDims.size(),\n                           const_cast<void *>(aclInput.storage().data()));\n  }\n\n  auto acl_tensor = aclCreateTensor(\n      at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type,\n      at_tensor.strides().data(), at_tensor.storage_offset(), format,\n      storageDims.data(), storageDims.size(),\n      const_cast<void *>(at_tensor.storage().data()));\n  return acl_tensor;\n}\n\ninline aclScalar *ConvertType(const at::Scalar &at_scalar) {\n  static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar);\n  if (aclCreateScalar == nullptr) {\n    return nullptr;\n  }\n\n  at::ScalarType scalar_data_type = at_scalar.type();\n  aclDataType acl_data_type =\n      kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];\n  TORCH_CHECK(\n      acl_data_type != ACL_DT_UNDEFINED,\n      std::string(c10::toString(scalar_data_type)) + \" has not been supported\")\n  aclScalar *acl_scalar = nullptr;\n  switch (scalar_data_type) {\n    case at::ScalarType::Double: {\n      double value = at_scalar.toDouble();\n      acl_scalar = 
aclCreateScalar(&value, acl_data_type);\n      break;\n    }\n    case at::ScalarType::Long: {\n      int64_t value = at_scalar.toLong();\n      acl_scalar = aclCreateScalar(&value, acl_data_type);\n      break;\n    }\n    case at::ScalarType::Bool: {\n      bool value = at_scalar.toBool();\n      acl_scalar = aclCreateScalar(&value, acl_data_type);\n      break;\n    }\n    case at::ScalarType::ComplexDouble: {\n      auto value = at_scalar.toComplexDouble();\n      acl_scalar = aclCreateScalar(&value, acl_data_type);\n      break;\n    }\n    default:\n      acl_scalar = nullptr;\n      break;\n  }\n  return acl_scalar;\n}\n\ninline aclIntArray *ConvertType(const at::IntArrayRef &at_array) {\n  static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray);\n  if (aclCreateIntArray == nullptr) {\n    return nullptr;\n  }\n  auto array = aclCreateIntArray(at_array.data(), at_array.size());\n  return array;\n}\n\ntemplate <std::size_t N>\ninline aclBoolArray *ConvertType(const std::array<bool, N> &value) {\n  static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);\n  if (aclCreateBoolArray == nullptr) {\n    return nullptr;\n  }\n\n  auto array = aclCreateBoolArray(value.data(), value.size());\n  return array;\n}\n\ninline aclBoolArray *ConvertType(const at::ArrayRef<bool> &value) {\n  static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);\n  if (aclCreateBoolArray == nullptr) {\n    return nullptr;\n  }\n\n  auto array = aclCreateBoolArray(value.data(), value.size());\n  return array;\n}\n\ninline aclTensorList *ConvertType(const at::TensorList &at_tensor_list) {\n  static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList);\n  if (aclCreateTensorList == nullptr) {\n    return nullptr;\n  }\n\n  std::vector<const aclTensor *> tensor_list(at_tensor_list.size());\n  for (size_t i = 0; i < at_tensor_list.size(); i++) {\n    tensor_list[i] = ConvertType(at_tensor_list[i]);\n  }\n  auto 
acl_tensor_list =\n      aclCreateTensorList(tensor_list.data(), tensor_list.size());\n  return acl_tensor_list;\n}\n\ninline aclTensor *ConvertType(const c10::optional<at::Tensor> &opt_tensor) {\n  if (opt_tensor.has_value() && opt_tensor.value().defined()) {\n    return ConvertType(opt_tensor.value());\n  }\n  return nullptr;\n}\n\ninline aclIntArray *ConvertType(\n    const c10::optional<at::IntArrayRef> &opt_array) {\n  if (opt_array.has_value()) {\n    return ConvertType(opt_array.value());\n  }\n  return nullptr;\n}\n\ninline aclScalar *ConvertType(const c10::optional<at::Scalar> &opt_scalar) {\n  if (opt_scalar.has_value()) {\n    return ConvertType(opt_scalar.value());\n  }\n  return nullptr;\n}\n\ninline aclDataType ConvertType(const at::ScalarType scalarType) {\n  return kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalarType)];\n}\n\ntemplate <typename T>\nT ConvertType(T value) {\n  return value;\n}\n\ntemplate <typename Tuple, size_t... I>\nauto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr,\n                        std::index_sequence<I...>) {\n  typedef int (*OpApiFunc)(\n      typename std::decay<decltype(std::get<I>(params))>::type...);\n  auto func = reinterpret_cast<OpApiFunc>(opApiAddr);\n  return func;\n}\n\ntemplate <typename Tuple>\nauto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr) {\n  static constexpr auto size = std::tuple_size<Tuple>::value;\n  return ConvertToOpApiFunc(params, opApiAddr,\n                            std::make_index_sequence<size>{});\n}\n\ninline void Release(aclTensor *p) {\n  static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor);\n  if (aclDestroyTensor == nullptr) {\n    return;\n  }\n  aclDestroyTensor(p);\n}\n\ninline void Release(aclScalar *p) {\n  static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar);\n  if (aclDestroyScalar == nullptr) {\n    return;\n  }\n  aclDestroyScalar(p);\n}\n\ninline void Release(aclIntArray *p) {\n  static const auto 
aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray);\n  if (aclDestroyIntArray == nullptr) {\n    return;\n  }\n\n  aclDestroyIntArray(p);\n}\n\ninline void Release(aclBoolArray *p) {\n  static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray);\n  if (aclDestroyBoolArray == nullptr) {\n    return;\n  }\n\n  aclDestroyBoolArray(p);\n}\n\ninline void Release(aclTensorList *p) {\n  static const auto aclDestroyTensorList =\n      GET_OP_API_FUNC(aclDestroyTensorList);\n  if (aclDestroyTensorList == nullptr) {\n    return;\n  }\n\n  aclDestroyTensorList(p);\n}\n\ntemplate <typename T>\nvoid Release(T value) {\n  (void)value;\n}\n\ntemplate <typename Tuple, size_t... I>\nvoid CallRelease(Tuple t, std::index_sequence<I...>) {\n  (void)std::initializer_list<int>{(Release(std::get<I>(t)), 0)...};\n}\n\ntemplate <typename Tuple>\nvoid ReleaseConvertTypes(Tuple &t) {\n  static constexpr auto size = std::tuple_size<Tuple>::value;\n  CallRelease(t, std::make_index_sequence<size>{});\n}\n\ntemplate <typename... Ts>\nconstexpr auto ConvertTypes(Ts &...args) {\n  return std::make_tuple(ConvertType(args)...);\n}\n\ntemplate <typename Function, typename Tuple, size_t... 
I>\nauto call(Function f, Tuple t, std::index_sequence<I...>) {\n  return f(std::get<I>(t)...);\n}\n\ntemplate <typename Function, typename Tuple>\nauto call(Function f, Tuple t) {\n  static constexpr auto size = std::tuple_size<Tuple>::value;\n  return call(f, t, std::make_index_sequence<size>{});\n}\n\ntemplate <std::size_t N>\nvoid AddParamToBuf(const std::array<bool, N> &value) {\n  MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool));\n}\n\ntemplate <typename T>\nvoid AddParamToBuf(const T &value) {\n  MEMCPY_TO_BUF(&value, sizeof(T));\n}\n\nvoid AddParamToBuf(const at::Tensor &);\nvoid AddParamToBuf(const at::Scalar &);\nvoid AddParamToBuf(const at::IntArrayRef &);\nvoid AddParamToBuf(const at::ArrayRef<bool> &);\nvoid AddParamToBuf(const at::TensorList &);\nvoid AddParamToBuf(const c10::optional<at::Tensor> &);\nvoid AddParamToBuf(const c10::optional<at::IntArrayRef> &);\nvoid AddParamToBuf(const c10::optional<at::Scalar> &);\nvoid AddParamToBuf(const at::ScalarType);\nvoid AddParamToBuf(const string &);\nvoid AddParamToBuf();\n\ntemplate <typename T, typename... Args>\nvoid AddParamToBuf(const T &arg, Args &...args) {\n  AddParamToBuf(arg);\n  AddParamToBuf(args...);\n}\n\nuint64_t CalcHashId();\ntypedef int (*InitHugeMemThreadLocal)(void *, bool);\ntypedef void (*UnInitHugeMemThreadLocal)(void *, bool);\ntypedef void (*ReleaseHugeMem)(void *, bool);\n\n#define EXEC_NPU_CMD(aclnn_api, ...)                                          
\\\n  do {                                                                        \\\n    static const auto getWorkspaceSizeFuncAddr =                              \\\n        GetOpApiFuncAddr(#aclnn_api \"GetWorkspaceSize\");                      \\\n    static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api);           \\\n    static const auto initMemAddr =                                           \\\n        GetOpApiFuncAddr(\"InitHugeMemThreadLocal\");                           \\\n    static const auto unInitMemAddr =                                         \\\n        GetOpApiFuncAddr(\"UnInitHugeMemThreadLocal\");                         \\\n    static const auto releaseMemAddr = GetOpApiFuncAddr(\"ReleaseHugeMem\");    \\\n    TORCH_CHECK(                                                              \\\n        getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr,      \\\n        #aclnn_api, \" or \", #aclnn_api \"GetWorkspaceSize\", \" not in \",        \\\n        GetOpApiLibName(), \", or \", GetOpApiLibName(), \"not found.\");         \\\n    auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);           \\\n    uint64_t workspace_size = 0;                                              \\\n    uint64_t *workspace_size_addr = &workspace_size;                          \\\n    aclOpExecutor *executor = nullptr;                                        \\\n    aclOpExecutor **executor_addr = &executor;                                \\\n    InitHugeMemThreadLocal initMemFunc =                                      \\\n        reinterpret_cast<InitHugeMemThreadLocal>(initMemAddr);                \\\n    UnInitHugeMemThreadLocal unInitMemFunc =                                  \\\n        reinterpret_cast<UnInitHugeMemThreadLocal>(unInitMemAddr);            \\\n    if (initMemFunc) {                                                        \\\n      initMemFunc(nullptr, false);                                            \\\n    }     
                                                                    \\\n    auto converted_params =                                                   \\\n        ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr);        \\\n    static auto getWorkspaceSizeFunc =                                        \\\n        ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr);       \\\n    auto workspace_status = call(getWorkspaceSizeFunc, converted_params);     \\\n    TORCH_CHECK(workspace_status == 0,                                        \\\n                \"call \" #aclnn_api \" failed, detail:\", aclGetRecentErrMsg()); \\\n    void *workspace_addr = nullptr;                                           \\\n    if (workspace_size != 0) {                                                \\\n      at::TensorOptions options =                                             \\\n          at::TensorOptions(torch_npu::utils::get_npu_device_type());         \\\n      auto workspace_tensor =                                                 \\\n          at::empty({workspace_size}, options.dtype(kByte));                  \\\n      workspace_addr = const_cast<void *>(workspace_tensor.storage().data()); \\\n    }                                                                         \\\n    auto acl_call = [converted_params, workspace_addr, workspace_size,        \\\n                     acl_stream, executor]() -> int {                         \\\n      typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *,             \\\n                               const aclrtStream);                            \\\n      OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr);       \\\n      auto api_ret =                                                          \\\n          opApiFunc(workspace_addr, workspace_size, executor, acl_stream);    \\\n      TORCH_CHECK(api_ret == 0, \"call \" #aclnn_api \" failed, detail:\",        \\\n                  
aclGetRecentErrMsg());                                      \\\n      ReleaseConvertTypes(converted_params);                                  \\\n      ReleaseHugeMem releaseMemFunc =                                         \\\n          reinterpret_cast<ReleaseHugeMem>(releaseMemAddr);                   \\\n      if (releaseMemFunc) {                                                   \\\n        releaseMemFunc(nullptr, false);                                       \\\n      }                                                                       \\\n      return api_ret;                                                         \\\n    };                                                                        \\\n    at_npu::native::OpCommand cmd;                                            \\\n    cmd.Name(#aclnn_api);                                                     \\\n    cmd.SetCustomHandler(acl_call);                                           \\\n    cmd.Run();                                                                \\\n    if (unInitMemFunc) {                                                      \\\n      unInitMemFunc(nullptr, false);                                          \\\n    }                                                                         \\\n  } while (false)\n\n#endif  // MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/paramsgrid.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef PARAMS_GRID_H_\n#define PARAMS_GRID_H_\n#include <tuple>\n#include <vector>\n\nnamespace detail {\ntemplate <class scalar_t>\nint getTotalSize(std::vector<scalar_t> arg) {\n  return arg.size();\n}\n\ntemplate <class scalar_t, class... TArgs>\nint getTotalSize(std::vector<scalar_t> arg, std::vector<TArgs>... args) {\n  return arg.size() * getTotalSize(args...);\n}\n\ntemplate <typename scalar_t>\nint getSize(std::vector<scalar_t> arg) {\n  return arg.size();\n}\n\ntemplate <int Idx, class TT, class scalar_t>\nvoid assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg) {\n  std::get<Idx>(src) = arg[counter[Idx]];\n}\n\ntemplate <int Idx, class TT, class scalar_t, class... TArgs>\nvoid assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg,\n              std::vector<TArgs> &...args) {\n  std::get<Idx>(src) = arg[counter[Idx]];\n  assigner<Idx + 1>(src, counter, args...);\n}\n}  // namespace detail\n\ntemplate <class... TArgs>\nstd::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... 
args) {\n  int length = detail::getTotalSize(args...);\n  std::vector<int> sizes = {detail::getSize(args)...};\n  int size = sizes.size();\n\n  std::vector<std::tuple<TArgs...>> params(length);\n  std::vector<int> counter(size);\n  for (int i = 0; i < length; ++i) {\n    detail::assigner<0>(params[i], counter, args...);\n    counter[size - 1] += 1;\n    for (int c = size - 1; c >= 0; --c) {\n      if (counter[c] == sizes[c] && c > 0) {\n        counter[c - 1] += 1;\n        counter[c] = 0;\n      }\n    }\n  }\n  return params;\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/prettyprint.h",
    "content": "//          Copyright Louis Delacroix 2010 - 2014.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE_1_0.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n//\n// A pretty printing library for C++\n//\n// Usage:\n// Include this header, and operator<< will \"just work\".\n\n#ifndef H_PRETTY_PRINT\n#define H_PRETTY_PRINT\n\n#include <cstddef>\n#include <iterator>\n#include <memory>\n#include <ostream>\n#include <set>\n#include <tuple>\n#include <type_traits>\n#include <unordered_set>\n#include <utility>\n#include <valarray>\n\nnamespace pretty_print {\nnamespace detail {\n// SFINAE type trait to detect whether T::const_iterator exists.\n\nstruct sfinae_base {\n  using yes = char;\n  using no = yes[2];\n};\n\ntemplate <typename T>\nstruct has_const_iterator : private sfinae_base {\n private:\n  template <typename C>\n  static yes &test(typename C::const_iterator *);\n  template <typename C>\n  static no &test(...);\n\n public:\n  static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);\n  using type = T;\n};\n\ntemplate <typename T>\nstruct has_begin_end : private sfinae_base {\n private:\n  template <typename C>\n  static yes &\n  f(typename std::enable_if<\n      std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()\n                                            const>(&C::begin)),\n                   typename C::const_iterator (C::*)() const>::value>::type *);\n\n  template <typename C>\n  static no &f(...);\n\n  template <typename C>\n  static yes &g(typename std::enable_if<\n                std::is_same<decltype(static_cast<typename C::const_iterator (\n                                          C::*)() const>(&C::end)),\n                             typename C::const_iterator (C::*)() const>::value,\n                void>::type *);\n\n  template <typename C>\n  static no &g(...);\n\n public:\n  static bool const beg_value = sizeof(f<T>(nullptr)) == 
sizeof(yes);\n  static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);\n};\n\n}  // namespace detail\n\n// Holds the delimiter values for a specific character type\n\ntemplate <typename TChar>\nstruct delimiters_values {\n  using char_type = TChar;\n  const char_type *prefix;\n  const char_type *delimiter;\n  const char_type *postfix;\n};\n\n// Defines the delimiter values for a specific container and character type\n\ntemplate <typename T, typename TChar>\nstruct delimiters {\n  using type = delimiters_values<TChar>;\n  static const type values;\n};\n\n// Functor to print containers. You can use this directly if you want\n// to specify a non-default delimiters type. The printing logic can\n// be customized by specializing the nested template.\n\ntemplate <typename T, typename TChar = char,\n          typename TCharTraits = ::std::char_traits<TChar>,\n          typename TDelimiters = delimiters<T, TChar>>\nstruct print_container_helper {\n  using delimiters_type = TDelimiters;\n  using ostream_type = std::basic_ostream<TChar, TCharTraits>;\n\n  template <typename U>\n  struct printer {\n    static void print_body(const U &c, ostream_type &stream) {\n      using std::begin;\n      using std::end;\n\n      auto it = begin(c);\n      const auto the_end = end(c);\n\n      if (it != the_end) {\n        for (;;) {\n          stream << *it;\n\n          if (++it == the_end) break;\n\n          if (delimiters_type::values.delimiter != NULL)\n            stream << delimiters_type::values.delimiter;\n        }\n      }\n    }\n  };\n\n  print_container_helper(const T &container) : container_(container) {}\n\n  inline void operator()(ostream_type &stream) const {\n    if (delimiters_type::values.prefix != NULL)\n      stream << delimiters_type::values.prefix;\n\n    printer<T>::print_body(container_, stream);\n\n    if (delimiters_type::values.postfix != NULL)\n      stream << delimiters_type::values.postfix;\n  }\n\n private:\n  const T &container_;\n};\n\n// 
Specialization for pairs\n\ntemplate <typename T, typename TChar, typename TCharTraits,\n          typename TDelimiters>\ntemplate <typename T1, typename T2>\nstruct print_container_helper<T, TChar, TCharTraits,\n                              TDelimiters>::printer<std::pair<T1, T2>> {\n  using ostream_type =\n      typename print_container_helper<T, TChar, TCharTraits,\n                                      TDelimiters>::ostream_type;\n\n  static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {\n    stream << c.first;\n    if (print_container_helper<T, TChar, TCharTraits,\n                               TDelimiters>::delimiters_type::values\n            .delimiter != NULL)\n      stream << print_container_helper<T, TChar, TCharTraits,\n                                       TDelimiters>::delimiters_type::values\n                    .delimiter;\n    stream << c.second;\n  }\n};\n\n// Specialization for tuples\n\ntemplate <typename T, typename TChar, typename TCharTraits,\n          typename TDelimiters>\ntemplate <typename... 
Args>\nstruct print_container_helper<T, TChar, TCharTraits,\n                              TDelimiters>::printer<std::tuple<Args...>> {\n  using ostream_type =\n      typename print_container_helper<T, TChar, TCharTraits,\n                                      TDelimiters>::ostream_type;\n  using element_type = std::tuple<Args...>;\n\n  template <std::size_t I>\n  struct Int {};\n\n  static void print_body(const element_type &c, ostream_type &stream) {\n    tuple_print(c, stream, Int<0>());\n  }\n\n  static void tuple_print(const element_type &, ostream_type &,\n                          Int<sizeof...(Args)>) {}\n\n  static void tuple_print(\n      const element_type &c, ostream_type &stream,\n      typename std::conditional<sizeof...(Args) != 0, Int<0>,\n                                std::nullptr_t>::type) {\n    stream << std::get<0>(c);\n    tuple_print(c, stream, Int<1>());\n  }\n\n  template <std::size_t N>\n  static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {\n    if (print_container_helper<T, TChar, TCharTraits,\n                               TDelimiters>::delimiters_type::values\n            .delimiter != NULL)\n      stream << print_container_helper<T, TChar, TCharTraits,\n                                       TDelimiters>::delimiters_type::values\n                    .delimiter;\n\n    stream << std::get<N>(c);\n\n    tuple_print(c, stream, Int<N + 1>());\n  }\n};\n\n// Prints a print_container_helper to the specified stream.\n\ntemplate <typename T, typename TChar, typename TCharTraits,\n          typename TDelimiters>\ninline std::basic_ostream<TChar, TCharTraits> &operator<<(\n    std::basic_ostream<TChar, TCharTraits> &stream,\n    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {\n  helper(stream);\n  return stream;\n}\n\n// Basic is_container template; specialize to derive from std::true_type for all\n// desired container types\n\ntemplate <typename T>\nstruct is_container\n    : public 
std::integral_constant<bool,\n                                    detail::has_const_iterator<T>::value &&\n                                        detail::has_begin_end<T>::beg_value &&\n                                        detail::has_begin_end<T>::end_value> {};\n\ntemplate <typename T, std::size_t N>\nstruct is_container<T[N]> : std::true_type {};\n\ntemplate <std::size_t N>\nstruct is_container<char[N]> : std::false_type {};\n\ntemplate <typename T>\nstruct is_container<std::valarray<T>> : std::true_type {};\n\ntemplate <typename T1, typename T2>\nstruct is_container<std::pair<T1, T2>> : std::true_type {};\n\ntemplate <typename... Args>\nstruct is_container<std::tuple<Args...>> : std::true_type {};\n\n// Default delimiters\n\ntemplate <typename T>\nstruct delimiters<T, char> {\n  static const delimiters_values<char> values;\n};\ntemplate <typename T>\nconst delimiters_values<char> delimiters<T, char>::values = {\"[\", \", \", \"]\"};\ntemplate <typename T>\nstruct delimiters<T, wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\ntemplate <typename T>\nconst delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L\"[\", L\", \",\n                                                                   L\"]\"};\n\n// Delimiters for (multi)set and unordered_(multi)set\n\ntemplate <typename T, typename TComp, typename TAllocator>\nstruct delimiters<::std::set<T, TComp, TAllocator>, char> {\n  static const delimiters_values<char> values;\n};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nconst delimiters_values<char>\n    delimiters<::std::set<T, TComp, TAllocator>, char>::values = {\"{\", \", \",\n                                                                  \"}\"};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nstruct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nconst 
delimiters_values<wchar_t>\n    delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {\n        L\"{\", L\", \", L\"}\"};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nstruct delimiters<::std::multiset<T, TComp, TAllocator>, char> {\n  static const delimiters_values<char> values;\n};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nconst delimiters_values<char>\n    delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {\n        \"{\", \", \", \"}\"};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nstruct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\n\ntemplate <typename T, typename TComp, typename TAllocator>\nconst delimiters_values<wchar_t>\n    delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {\n        L\"{\", L\", \", L\"}\"};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nstruct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {\n  static const delimiters_values<char> values;\n};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nconst delimiters_values<char> delimiters<\n    ::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {\n    \"{\", \", \", \"}\"};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nstruct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nconst delimiters_values<wchar_t> delimiters<\n    ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {\n    L\"{\", L\", \", L\"}\"};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nstruct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,\n                  char> {\n  static const 
delimiters_values<char> values;\n};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nconst delimiters_values<char> delimiters<\n    ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {\n    \"{\", \", \", \"}\"};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nstruct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,\n                  wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\n\ntemplate <typename T, typename THash, typename TEqual, typename TAllocator>\nconst delimiters_values<wchar_t>\n    delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,\n               wchar_t>::values = {L\"{\", L\", \", L\"}\"};\n\n// Delimiters for pair and tuple\n\ntemplate <typename T1, typename T2>\nstruct delimiters<std::pair<T1, T2>, char> {\n  static const delimiters_values<char> values;\n};\ntemplate <typename T1, typename T2>\nconst delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {\n    \"(\", \", \", \")\"};\ntemplate <typename T1, typename T2>\nstruct delimiters<::std::pair<T1, T2>, wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\ntemplate <typename T1, typename T2>\nconst delimiters_values<wchar_t>\n    delimiters<::std::pair<T1, T2>, wchar_t>::values = {L\"(\", L\", \", L\")\"};\n\ntemplate <typename... Args>\nstruct delimiters<std::tuple<Args...>, char> {\n  static const delimiters_values<char> values;\n};\ntemplate <typename... Args>\nconst delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {\n    \"(\", \", \", \")\"};\ntemplate <typename... Args>\nstruct delimiters<::std::tuple<Args...>, wchar_t> {\n  static const delimiters_values<wchar_t> values;\n};\ntemplate <typename... 
Args>\nconst delimiters_values<wchar_t>\n    delimiters<::std::tuple<Args...>, wchar_t>::values = {L\"(\", L\", \", L\")\"};\n\n// Type-erasing helper class for easy use of custom delimiters.\n// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,\n// and MyDelims needs to be defined for TChar. Usage: \"cout <<\n// pretty_print::custom_delims<MyDelims>(x)\".\n\nstruct custom_delims_base {\n  virtual ~custom_delims_base() {}\n  virtual std::ostream &stream(::std::ostream &) = 0;\n  virtual std::wostream &stream(::std::wostream &) = 0;\n};\n\ntemplate <typename T, typename Delims>\nstruct custom_delims_wrapper : custom_delims_base {\n  custom_delims_wrapper(const T &t_) : t(t_) {}\n\n  std::ostream &stream(std::ostream &s) {\n    return s << print_container_helper<T, char, std::char_traits<char>, Delims>(\n               t);\n  }\n\n  std::wostream &stream(std::wostream &s) {\n    return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,\n                                       Delims>(t);\n  }\n\n private:\n  const T &t;\n};\n\ntemplate <typename Delims>\nstruct custom_delims {\n  template <typename Container>\n  custom_delims(const Container &c)\n      : base(new custom_delims_wrapper<Container, Delims>(c)) {}\n\n  std::unique_ptr<custom_delims_base> base;\n};\n\ntemplate <typename TChar, typename TCharTraits, typename Delims>\ninline std::basic_ostream<TChar, TCharTraits> &operator<<(\n    std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {\n  return p.base->stream(s);\n}\n\n// A wrapper for a C-style array given as pointer-plus-size.\n// Usage: std::cout << pretty_print_array(arr, n) << std::endl;\n\ntemplate <typename T>\nstruct array_wrapper_n {\n  typedef const T *const_iterator;\n  typedef T value_type;\n\n  array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}\n  inline const_iterator begin() const { return _array; }\n  inline const_iterator end() const { return _array + _n; }\n\n 
private:\n  const T *const _array;\n  size_t _n;\n};\n\n// A wrapper for hash-table based containers that offer local iterators to each\n// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl;  (Prints bucket\n// 5 of container m.)\n\ntemplate <typename T>\nstruct bucket_print_wrapper {\n  typedef typename T::const_local_iterator const_iterator;\n  typedef typename T::size_type size_type;\n\n  const_iterator begin() const { return m_map.cbegin(n); }\n\n  const_iterator end() const { return m_map.cend(n); }\n\n  bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}\n\n private:\n  const T &m_map;\n  const size_type n;\n};\n\n}  // namespace pretty_print\n\n// Global accessor functions for the convenience wrappers\n\ntemplate <typename T>\ninline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,\n                                                           size_t n) {\n  return pretty_print::array_wrapper_n<T>(a, n);\n}\n\ntemplate <typename T>\npretty_print::bucket_print_wrapper<T> bucket_print(const T &m,\n                                                   typename T::size_type n) {\n  return pretty_print::bucket_print_wrapper<T>(m, n);\n}\n\n// Main magic entry point: An overload snuck into namespace std.\n// Can we do better?\n\nnamespace std {\n// Prints a container to the stream using default delimiters\n\ntemplate <typename T, typename TChar, typename TCharTraits>\ninline typename enable_if<::pretty_print::is_container<T>::value,\n                          basic_ostream<TChar, TCharTraits> &>::type\noperator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {\n  return stream\n         << ::pretty_print::print_container_helper<T, TChar, TCharTraits>(\n                container);\n}\n}  // namespace std\n\n#endif  // H_PRETTY_PRINT\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#pragma once\n#include <pybind11/embed.h>\n#include <pybind11/functional.h>\n#include <pybind11/numpy.h>\n#include <pybind11/pybind11.h>\n#include <pybind11/stl.h>\n#include <spconv/tensorview/tensorview.h>\n\n#include <algorithm>\n#include <iostream>\n\nnamespace py = pybind11;\n\ntemplate <typename scalar_t, typename TPyObject>\nstd::vector<scalar_t> array2Vector(TPyObject arr) {\n  py::array arr_np = arr;\n  size_t size = arr.attr(\"size\").template cast<size_t>();\n  py::array_t<scalar_t> arr_cc = arr_np;\n  std::vector<scalar_t> data(arr_cc.data(), arr_cc.data() + size);\n  return data;\n}\n\ntemplate <typename scalar_t>\nstd::vector<scalar_t> arrayT2Vector(py::array_t<scalar_t> arr) {\n  std::vector<scalar_t> data(arr.data(), arr.data() + arr.size());\n  return data;\n}\n\ntemplate <typename scalar_t, typename TPyObject>\ntv::TensorView<scalar_t> array2TensorView(TPyObject arr) {\n  py::array arr_np = arr;\n  py::array_t<scalar_t> arr_cc = arr_np;\n  tv::Shape shape;\n  for (int i = 0; i < arr_cc.ndim(); ++i) {\n    shape.push_back(arr_cc.shape(i));\n  }\n  return tv::TensorView<scalar_t>(arr_cc.mutable_data(), shape);\n}\ntemplate <typename scalar_t>\ntv::TensorView<scalar_t> arrayT2TensorView(py::array_t<scalar_t> arr) {\n  tv::Shape shape;\n  for (int i = 0; i < arr.ndim(); ++i) {\n    shape.push_back(arr.shape(i));\n  }\n  return 
tv::TensorView<scalar_t>(arr.mutable_data(), shape);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef SPCONV_GEOMETRY_H_\n#define SPCONV_GEOMETRY_H_\n\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <iostream>\n#include <limits>\n\ntemplate <typename Index, unsigned NDim>\nTV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,\n                                    const Index *kernelSize,\n                                    const Index *stride, const Index *padding,\n                                    const Index *dilation,\n                                    const Index *outSpatialShape, Index *out) {\n  Index lowers[NDim];\n  Index uppers[NDim];\n  Index counter[NDim];\n  Index counterSize[NDim];\n  Index pointCounter = 0;\n  Index val;\n  Index numPoints = 1;\n  Index m, offset;\n  bool valid = false;\n#pragma unroll\n  for (unsigned i = 0; i < NDim; ++i) {\n    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +\n                 stride[i] + padding[i]) /\n                stride[i];\n    uppers[i] = (input_pos[i] + padding[i]) / stride[i];\n  }\n\n#pragma unroll\n  for (unsigned i = 0; i < NDim; ++i) {\n    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);\n    numPoints *= counterSize[i];\n  }\n\n#pragma unroll\n  for (unsigned i = 0; i < NDim; ++i) {\n    counter[i] = 0;\n  }\n  for (int i = 0; i < numPoints; ++i) {\n    valid = true;\n    m = 1;\n    offset = 0;\n#pragma unroll\n    for (int j = 
NDim - 1; j >= 0; --j) {\n      val = uppers[j] - counter[j] * dilation[j];\n      out[pointCounter * (NDim + 1) + j] = val;\n      if (val < 0 || (val > outSpatialShape[j] - 1)) {\n        valid = false;\n        // break;\n      }\n      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];\n      m *= kernelSize[j];\n    }\n\n    out[pointCounter * (NDim + 1) + NDim] = offset;\n    if (valid) ++pointCounter;\n    counter[NDim - 1] += 1;\n#pragma unroll\n    for (int c = NDim - 1; c >= 0; --c) {\n      if (counter[c] == counterSize[c] && c > 0) {\n        counter[c - 1] += 1;\n        counter[c] = 0;\n      }\n    }\n  }\n  return pointCounter;\n}\n\ntemplate <typename Index, unsigned NDim>\nTV_HOST_DEVICE Index getValidOutPosTranspose(\n    const Index *input_pos, const Index *kernelSize, const Index *stride,\n    const Index *padding, const Index *dilation, const Index *outSpatialShape,\n    Index *out) {\n  Index lowers[NDim];\n  Index uppers[NDim];\n  Index counter[NDim];\n  Index counterSize[NDim];\n  Index pointCounter = 0;\n  Index val;\n  Index numPoints = 1;\n  Index m, offset;\n  bool valid = false;\n#pragma unroll\n  for (unsigned i = 0; i < NDim; ++i) {\n    lowers[i] = input_pos[i] * stride[i] - padding[i];\n    uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];\n  }\n#pragma unroll\n  for (unsigned i = 0; i < NDim; ++i) {\n    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);\n    numPoints *= counterSize[i];\n  }\n#pragma unroll\n  for (unsigned i = 0; i < NDim; ++i) {\n    counter[i] = 0;\n  }\n  for (int i = 0; i < numPoints; ++i) {\n    valid = true;\n    m = 1;\n    offset = 0;\n#pragma unroll\n    for (int j = NDim - 1; j >= 0; --j) {\n      val = uppers[j] - counter[j] * dilation[j];\n      out[pointCounter * (NDim + 1) + j] = val;\n      if (val < 0 || (val > outSpatialShape[j] - 1)) {\n        valid = false;\n      }\n      offset += m * (val - lowers[j]) / dilation[j];\n      m *= kernelSize[j];\n   
 }\n    out[pointCounter * (NDim + 1) + NDim] = offset;\n    if (valid) ++pointCounter;\n    counter[NDim - 1] += 1;\n#pragma unroll\n    for (int c = NDim - 1; c >= 0; --c) {\n      if (counter[c] == counterSize[c] && c > 0) {\n        counter[c - 1] += 1;\n        counter[c] = 0;\n      }\n    }\n  }\n  return pointCounter;\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nIndex getIndicePairsConv(tv::TensorView<const Index> indicesIn,\n                         tv::TensorView<Index> indicesOut,\n                         tv::TensorView<IndexGrid> gridsOut,\n                         tv::TensorView<Index> indicePairs,\n                         tv::TensorView<Index> indiceNum,\n                         const Index *kernelSize, const Index *stride,\n                         const Index *padding, const Index *dilation,\n                         const Index *outSpatialShape) {\n  // indicesOut: num_active * kernelVolume * (NDim + 1)\n  Index numAct = 0;\n  auto numActIn = indicesIn.dim(0);\n  Index batchIdx = 0;\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));\n  Index *validPoints = validPoints_.data();\n  Index *pointPtr = nullptr;\n  for (int j = 0; j < numActIn; ++j) {\n    batchIdx = indicesIn(j, 0);\n    numValidPoints = getValidOutPos<Index, NDim>(\n        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,\n        dilation, outSpatialShape, validPoints);\n    for (Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +\n                   spatialVolume * batchIdx;\n      if (gridsOut[index] == -1) 
{\n        for (unsigned k = 1; k < NDim + 1; ++k) {\n          indicesOut(numAct, k) = pointPtr[k - 1];\n        }\n        indicesOut(numAct, 0) = batchIdx;\n        gridsOut[index] = numAct++;\n      }\n      // indicePairs: [K, 2, L]\n      indicePairs(offset, 0, indiceNum[offset]) = j;\n      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];\n    }\n  }\n  return numAct;\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nIndex getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,\n                           tv::TensorView<Index> indicesOut,\n                           tv::TensorView<IndexGrid> gridsOut,\n                           tv::TensorView<Index> indicePairs,\n                           tv::TensorView<Index> indiceNum,\n                           const Index *kernelSize, const Index *stride,\n                           const Index *padding, const Index *dilation,\n                           const Index *outSpatialShape) {\n  Index numAct = 0;\n  auto numActIn = indicesIn.dim(0);\n  Index batchIdx = 0;\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));\n  Index *validPoints = validPoints_.data();\n  Index *pointPtr = nullptr;\n  for (int j = 0; j < numActIn; ++j) {\n    batchIdx = indicesIn(j, 0);\n    numValidPoints = getValidOutPosTranspose<Index, NDim>(\n        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,\n        dilation, outSpatialShape, validPoints);\n    for (Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +\n                   spatialVolume * batchIdx;\n  
    if (gridsOut[index] == -1) {\n        for (unsigned k = 1; k < NDim + 1; ++k) {\n          indicesOut(numAct, k) = pointPtr[k - 1];\n        }\n        indicesOut(numAct, 0) = batchIdx;\n        gridsOut[index] = numAct++;\n      }\n      // indicePairs: [K, 2, L]\n      indicePairs(offset, 0, indiceNum[offset]) = j;\n      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];\n    }\n  }\n  return numAct;\n}\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nIndex getIndicePairsSubM(tv::TensorView<const Index> indicesIn,\n                         tv::TensorView<IndexGrid> gridsOut,\n                         tv::TensorView<Index> indicePairs,\n                         tv::TensorView<Index> indiceNum,\n                         const Index *const kernelSize,\n                         const Index *const stride, const Index *const padding,\n                         const Index *dilation,\n                         const Index *const outSpatialShape) {\n  auto numActIn = indicesIn.dim(0);\n  Index spatialVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    spatialVolume *= outSpatialShape[i];\n  }\n  Index kernelVolume = 1;\n#pragma unroll\n  for (int i = 0; i < NDim; ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  Index numValidPoints = 0;\n  // Index validPoints[kernelVolume * (NDim + 1)];\n  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));\n  Index *validPoints = validPoints_.data();\n  Index *pointPtr = nullptr;\n  Index index = 0;\n  for (int j = 0; j < numActIn; ++j) {\n    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,\n                                         outSpatialShape) +\n            spatialVolume * indicesIn(j, 0);\n    gridsOut[index] = j;\n  }\n  for (int j = 0; j < numActIn; ++j) {\n    numValidPoints = getValidOutPos<Index, NDim>(\n        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,\n        dilation, outSpatialShape, validPoints);\n    for 
(Index i = 0; i < numValidPoints; ++i) {\n      pointPtr = validPoints + i * (NDim + 1);\n      auto offset = pointPtr[NDim];\n      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +\n              spatialVolume * indicesIn(j, 0);\n      if (gridsOut[index] > -1) {\n        indicePairs(offset, 0, indiceNum[offset]) = j;\n        indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];\n      }\n    }\n  }\n  return numActIn;\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/spconv/indice.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_\n#define SPARSE_CONV_INDICE_FUNCTOR_H_\n#include <utils/spconv/tensorview/tensorview.h>\n\nnamespace functor {\ntemplate <typename Device, typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctorP1 {\n  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   tv::TensorView<Index> indicePairUnique,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose);\n};\n\ntemplate <typename Device, typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctorP2 {\n  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   
tv::TensorView<Index> indiceNum,\n                   tv::TensorView<Index> indicePairUnique,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid = false);\n};\n\ntemplate <typename Device, typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctor {\n  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid = false);\n};\n\ntemplate <typename Device, typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateSubMIndicePairFunctor {\n  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid = false);\n};\n}  // namespace functor\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef SPARSE_MAXPOOL_FUNCTOR_H_\n#define SPARSE_MAXPOOL_FUNCTOR_H_\n#include <utils/spconv/tensorview/tensorview.h>\n\nnamespace functor {\ntemplate <typename Device, typename scalar_t, typename Index>\nstruct SparseMaxPoolForwardFunctor {\n  void operator()(const Device& d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const Index> indices, int size);\n};\n\ntemplate <typename Device, typename scalar_t, typename Index>\nstruct SparseMaxPoolBackwardFunctor {\n  void operator()(const Device& d, tv::TensorView<const scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const scalar_t> fout,\n                  tv::TensorView<scalar_t> fin,\n                  tv::TensorView<const Index> indices, int size);\n};\n}  // namespace functor\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h",
    "content": "#ifndef MP_HELPER_H_\n#define MP_HELPER_H_\n#include <type_traits>\n#include <utility>\n\ntemplate <class... T>\nstruct mp_list {};\n\ntemplate <class T, T... I>\nusing mp_list_c = mp_list<std::integral_constant<T, I>...>;\n\nnamespace detail {\n\ntemplate <class... T, class F>\nconstexpr F mp_for_each_impl(mp_list<T...>, F &&f) {\n  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);\n}\n\ntemplate <class F>\nconstexpr F mp_for_each_impl(mp_list<>, F &&f) {\n  return std::forward<F>(f);\n}\n\n}  // namespace detail\n\nnamespace detail {\n\ntemplate <class A, template <class...> class B>\nstruct mp_rename_impl {\n  // An error \"no type named 'type'\" here means that the first argument to\n  // mp_rename is not a list\n};\n\ntemplate <template <class...> class A, class... T, template <class...> class B>\nstruct mp_rename_impl<A<T...>, B> {\n  using type = B<T...>;\n};\n\n}  // namespace detail\n\ntemplate <class A, template <class...> class B>\nusing mp_rename = typename ::detail::mp_rename_impl<A, B>::type;\n\ntemplate <class L, class F>\nconstexpr F mp_for_each(F &&f) {\n  return ::detail::mp_for_each_impl(mp_rename<L, mp_list>(),\n                                    std::forward<F>(f));\n}\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/spconv/point2voxel.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#pragma once\n#include <math.h>\n#include <pybind11/numpy.h>\n#include <pybind11/pybind11.h>\n#include <pybind11/stl.h>\n\n#include <algorithm>\n#include <iostream>\n\nnamespace py = pybind11;\nusing namespace pybind11::literals;\n\ntemplate <typename DType, int NDim>\nint points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,\n                          py::array_t<int> coors,\n                          py::array_t<int> num_points_per_voxel,\n                          py::array_t<int> coor_to_voxelidx,\n                          std::vector<DType> voxel_size,\n                          std::vector<DType> coors_range, int max_points,\n                          int max_voxels) {\n  auto points_rw = points.template mutable_unchecked<2>();\n  auto voxels_rw = voxels.template mutable_unchecked<3>();\n  auto coors_rw = coors.mutable_unchecked<2>();\n  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();\n  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();\n  auto N = points_rw.shape(0);\n  auto num_features = points_rw.shape(1);\n  constexpr int ndim_minus_1 = NDim - 1;\n  int voxel_num = 0;\n  bool failed = false;\n  int coor[NDim];\n  int c;\n  int grid_size[NDim];\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  
}\n  int voxelidx, num;\n  for (int i = 0; i < N; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n    if (failed) continue;\n    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);\n    if (voxelidx == -1) {\n      voxelidx = voxel_num;\n      if (voxel_num >= max_voxels) continue;\n      voxel_num += 1;\n      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;\n      for (int k = 0; k < NDim; ++k) {\n        coors_rw(voxelidx, k) = coor[k];\n      }\n    }\n    num = num_points_per_voxel_rw(voxelidx);\n    if (num < max_points) {\n      for (int k = 0; k < num_features; ++k) {\n        voxels_rw(voxelidx, num, k) = points_rw(i, k);\n      }\n      num_points_per_voxel_rw(voxelidx) += 1;\n    }\n  }\n  for (int i = 0; i < voxel_num; ++i) {\n    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;\n  }\n  return voxel_num;\n}\n\ntemplate <typename DType, int NDim>\nint points_to_voxel_3d_np_mean(py::array_t<DType> points,\n                               py::array_t<DType> voxels,\n                               py::array_t<DType> means, py::array_t<int> coors,\n                               py::array_t<int> num_points_per_voxel,\n                               py::array_t<int> coor_to_voxelidx,\n                               std::vector<DType> voxel_size,\n                               std::vector<DType> coors_range, int max_points,\n                               int max_voxels) {\n  auto points_rw = points.template mutable_unchecked<2>();\n  auto means_rw = means.template mutable_unchecked<2>();\n  auto voxels_rw = voxels.template mutable_unchecked<3>();\n  auto coors_rw = coors.mutable_unchecked<2>();\n  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();\n  auto coor_to_voxelidx_rw 
= coor_to_voxelidx.mutable_unchecked<NDim>();\n  auto N = points_rw.shape(0);\n  auto num_features = points_rw.shape(1);\n  constexpr int ndim_minus_1 = NDim - 1;\n  int voxel_num = 0;\n  bool failed = false;\n  int coor[NDim];\n  int c;\n  int grid_size[NDim];\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  }\n  int voxelidx, num;\n  for (int i = 0; i < N; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n    if (failed) continue;\n    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);\n    if (voxelidx == -1) {\n      voxelidx = voxel_num;\n      if (voxel_num >= max_voxels) continue;\n      voxel_num += 1;\n      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;\n      for (int k = 0; k < NDim; ++k) {\n        coors_rw(voxelidx, k) = coor[k];\n      }\n    }\n    num = num_points_per_voxel_rw(voxelidx);\n    if (num < max_points) {\n      for (int k = 0; k < num_features; ++k) {\n        voxels_rw(voxelidx, num, k) = points_rw(i, k);\n      }\n      num_points_per_voxel_rw(voxelidx) += 1;\n      for (int k = 0; k < num_features; ++k) {\n        means_rw(voxelidx, k) +=\n            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);\n      }\n    }\n  }\n  for (int i = 0; i < voxel_num; ++i) {\n    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;\n    num = num_points_per_voxel_rw(i);\n    for (int j = num; j < max_points; ++j) {\n      for (int k = 0; k < num_features; ++k) {\n        voxels_rw(i, j, k) = means_rw(i, k);\n      }\n    }\n  }\n  return voxel_num;\n}\n\ntemplate <typename DType, int NDim>\nint points_to_voxel_3d_np_height(\n    py::array_t<DType> points, py::array_t<DType> voxels,\n    
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,\n    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,\n    std::vector<DType> voxel_size, std::vector<DType> coors_range,\n    int max_points, int max_voxels) {\n  auto points_rw = points.template mutable_unchecked<2>();\n  auto height_rw = height.template mutable_unchecked<2>();\n  auto maxs_rw = maxs.template mutable_unchecked<2>();\n  auto voxels_rw = voxels.template mutable_unchecked<3>();\n  auto coors_rw = coors.mutable_unchecked<2>();\n  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();\n  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();\n  auto N = points_rw.shape(0);\n  auto num_features = points_rw.shape(1);\n  constexpr int ndim_minus_1 = NDim - 1;\n  int voxel_num = 0;\n  bool failed = false;\n  int coor[NDim];\n  int c;\n  int grid_size[NDim];\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  }\n  int voxelidx, num;\n  for (int i = 0; i < N; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n    if (failed) continue;\n    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);\n    if (voxelidx == -1) {\n      voxelidx = voxel_num;\n      if (voxel_num >= max_voxels) continue;\n      voxel_num += 1;\n      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;\n      for (int k = 0; k < NDim; ++k) {\n        coors_rw(voxelidx, k) = coor[k];\n      }\n    }\n    num = num_points_per_voxel_rw(voxelidx);\n    if (num < max_points) {\n      for (int k = 0; k < num_features; ++k) {\n        voxels_rw(voxelidx, num, k) = points_rw(i, k);\n        height_rw(voxelidx, k) =\n            std::min(points_rw(i, 
k), height_rw(voxelidx, k));\n        maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));\n      }\n      num_points_per_voxel_rw(voxelidx) += 1;\n    }\n  }\n  for (int i = 0; i < voxel_num; ++i) {\n    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;\n    for (int k = 0; k < num_features; ++k) {\n      height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);\n    }\n  }\n  return voxel_num;\n}\n\ntemplate <typename DType, int NDim>\nint block_filtering(py::array_t<DType> points, py::array_t<int> mask,\n                    py::array_t<DType> height, py::array_t<DType> maxs,\n                    py::array_t<int> coor_to_voxelidx,\n                    std::vector<DType> voxel_size,\n                    std::vector<DType> coors_range, int max_voxels, DType eps) {\n  auto points_rw = points.template mutable_unchecked<2>();\n  auto height_rw = height.template mutable_unchecked<1>();\n  auto maxs_rw = maxs.template mutable_unchecked<1>();\n  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();\n  auto N = points_rw.shape(0);\n  auto num_features = points_rw.shape(1);\n  constexpr int ndim_minus_1 = NDim - 1;\n  int voxel_num = 0;\n  bool failed = false;\n  int coor[NDim];\n  int c;\n  int grid_size[NDim];\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  }\n  int voxelidx, num;\n  for (int i = 0; i < N; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n    if (failed) continue;\n    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);\n    if (voxelidx == -1) {\n      voxelidx = voxel_num;\n      voxel_num += 1;\n      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;\n    }\n    height_rw(voxelidx) = 
std::min(points_rw(i, 2), height_rw(voxelidx));\n    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));\n  }\n  for (int i = 0; i < N; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n    if (failed) continue;\n    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);\n    if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps) {\n      mask(i) = 0;\n    }\n  }\n}\n\ntemplate <typename DType, int NDim>\nint points_to_voxel_3d_with_filtering(\n    py::array_t<DType> points, py::array_t<DType> voxels,\n    py::array_t<int> voxel_mask, py::array_t<DType> mins,\n    py::array_t<DType> maxs, py::array_t<int> coors,\n    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,\n    std::vector<DType> voxel_size, std::vector<DType> coors_range,\n    int max_points, int max_voxels, int block_factor, int block_size,\n    DType height_threshold) {\n  auto points_rw = points.template mutable_unchecked<2>();\n  auto mins_rw = mins.template mutable_unchecked<2>();\n  auto maxs_rw = maxs.template mutable_unchecked<2>();\n  auto voxels_rw = voxels.template mutable_unchecked<3>();\n  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();\n  auto coors_rw = coors.mutable_unchecked<2>();\n  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();\n  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();\n  auto N = points_rw.shape(0);\n  auto num_features = points_rw.shape(1);\n  constexpr int ndim_minus_1 = NDim - 1;\n  int voxel_num = 0;\n  bool failed = false;\n  int coor[NDim];\n  int c;\n  int grid_size[NDim];\n\n  DType max_value, min_value;\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  }\n  
int block_shape_H = grid_size[1] / block_factor;\n  int block_shape_W = grid_size[0] / block_factor;\n  int voxelidx, num;\n  int block_coor[2];\n  int startx, stopx, starty, stopy;\n  for (int i = 0; i < N; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n    if (failed) continue;\n    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);\n    if (voxelidx == -1) {\n      voxelidx = voxel_num;\n      if (voxel_num >= max_voxels) continue;\n      voxel_num += 1;\n      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;\n      for (int k = 0; k < NDim; ++k) {\n        coors_rw(voxelidx, k) = coor[k];\n      }\n    }\n    num = num_points_per_voxel_rw(voxelidx);\n    if (num < max_points) {\n      for (int k = 0; k < num_features; ++k) {\n        voxels_rw(voxelidx, num, k) = points_rw(i, k);\n      }\n      block_coor[0] = coor[1] / block_factor;\n      block_coor[1] = coor[2] / block_factor;\n      mins_rw(block_coor[0], block_coor[1]) =\n          std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));\n      maxs_rw(block_coor[0], block_coor[1]) =\n          std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));\n      num_points_per_voxel_rw(voxelidx) += 1;\n    }\n  }\n  for (int i = 0; i < voxel_num; ++i) {\n    coor[1] = coors_rw(i, 1);\n    coor[2] = coors_rw(i, 2);\n    coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;\n    block_coor[0] = coor[1] / block_factor;\n    block_coor[1] = coor[2] / block_factor;\n    min_value = mins_rw(block_coor[0], block_coor[1]);\n    max_value = maxs_rw(block_coor[0], block_coor[1]);\n    startx = std::max(0, block_coor[0] - block_size / 2);\n    stopx =\n        std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);\n    starty = std::max(0, 
block_coor[1] - block_size / 2);\n    stopy =\n        std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);\n\n    for (int j = startx; j < stopx; ++j) {\n      for (int k = starty; k < stopy; ++k) {\n        min_value = std::min(min_value, mins_rw(j, k));\n        max_value = std::max(max_value, maxs_rw(j, k));\n      }\n    }\n    voxel_mask_rw(i) = (max_value - min_value) > height_threshold;\n  }\n  return voxel_num;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/spconv/reordering.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#ifndef SPARSE_REORDERING_FUNCTOR_H_\n#define SPARSE_REORDERING_FUNCTOR_H_\n#include <utils/spconv/tensorview/tensorview.h>\n\nnamespace functor {\ntemplate <typename Device, typename scalar_t, typename Index>\nstruct SparseGatherFunctor {\n  void operator()(const Device& d, tv::TensorView<scalar_t> buffer,\n                  tv::TensorView<const scalar_t> features,\n                  tv::TensorView<const Index> indices, int size);\n};\n\ntemplate <typename Device, typename scalar_t, typename Index>\nstruct SparseScatterAddFunctor {\n  void operator()(const Device& d, tv::TensorView<scalar_t> out_features,\n                  tv::TensorView<const scalar_t> buffer,\n                  tv::TensorView<const Index> indices, int size,\n                  bool stable = false);\n};\n}  // namespace functor\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/tensorview/helper_kernel.cuh",
    "content": "#pragma once\nnamespace tv {\nnamespace detail {\n\ntemplate <typename scalar_t>\nclass KernelLoop {\n  struct Iterator {\n    __forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)\n        : index_(index), delta_(delta) {}\n    __forceinline__ __device__ scalar_t operator*() const { return index_; }\n    __forceinline__ __device__ Iterator &operator++() {\n      index_ += delta_;\n      return *this;\n    }\n    __forceinline__ __device__ bool operator!=(const Iterator &other) const {\n      bool greater = index_ > other.index_;\n      bool less = index_ < other.index_;\n      if (!other.delta_) {\n        return less;\n      }\n      if (!delta_) {\n        return greater;\n      }\n      return less || greater;\n    }\n\n   private:\n    scalar_t index_;\n    const scalar_t delta_;\n  };\n\n public:\n  __forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,\n                                        scalar_t end)\n      : begin_(begin), delta_(delta), end_(end) {}\n\n  __forceinline__ __device__ Iterator begin() const {\n    return Iterator{begin_, delta_};\n  }\n  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }\n\n private:\n  scalar_t begin_;\n  scalar_t delta_;\n  scalar_t end_;\n};\n\n}  // namespace detail\n\ntemplate <typename scalar_t, int NumILP = 1>\n__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(\n    scalar_t count) {\n  return detail::KernelLoop<scalar_t>(blockIdx.x * blockDim.x + threadIdx.x,\n                                      gridDim.x * blockDim.x * NumILP, count);\n}\n\n// Helper to visit indices in the range 0 <= i < count using the y-coordinate.\n// Usage: for(int i : KernelLoopY(count)) { visit(i); }\ntemplate <typename scalar_t, int NumILP = 1>\n__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(\n    scalar_t count) {\n  return detail::KernelLoop<scalar_t>(blockIdx.y * blockDim.y + threadIdx.y,\n                                   
   gridDim.y * blockDim.y * NumILP, count);\n}\n\n// Helper to visit indices in the range 0 <= i < count using the z-coordinate.\n// Usage: for(int i : KernelLoopZ(count)) { visit(i); }\ntemplate <typename scalar_t, int NumILP = 1>\n__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(\n    scalar_t count) {\n  return detail::KernelLoop<scalar_t>(blockIdx.z * blockDim.z + threadIdx.z,\n                                      gridDim.z * blockDim.z * NumILP, count);\n}\n\n}  // namespace tv\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/tensorview/helper_kernel.muh",
    "content": "#pragma once\nnamespace tv {\nnamespace detail {\n\ntemplate <typename scalar_t>\nclass KernelLoop {\n  struct Iterator {\n    __forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)\n        : index_(index), delta_(delta) {}\n    __forceinline__ __device__ scalar_t operator*() const { return index_; }\n    __forceinline__ __device__ Iterator &operator++() {\n      index_ += delta_;\n      return *this;\n    }\n    __forceinline__ __device__ bool operator!=(const Iterator &other) const {\n      bool greater = index_ > other.index_;\n      bool less = index_ < other.index_;\n      if (!other.delta_) {\n        return less;\n      }\n      if (!delta_) {\n        return greater;\n      }\n      return less || greater;\n    }\n\n   private:\n    scalar_t index_;\n    const scalar_t delta_;\n  };\n\n public:\n  __forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,\n                                        scalar_t end)\n      : begin_(begin), delta_(delta), end_(end) {}\n\n  __forceinline__ __device__ Iterator begin() const {\n    return Iterator{begin_, delta_};\n  }\n  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }\n\n private:\n  scalar_t begin_;\n  scalar_t delta_;\n  scalar_t end_;\n};\n\n}  // namespace detail\n\ntemplate <typename scalar_t, int NumILP = 1>\n__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(\n    scalar_t count) {\n  return detail::KernelLoop<scalar_t>(blockIdx.x * blockDim.x + threadIdx.x,\n                                      gridDim.x * blockDim.x * NumILP, count);\n}\n\n// Helper to visit indices in the range 0 <= i < count using the y-coordinate.\n// Usage: for(int i : KernelLoopY(count)) { visit(i); }\ntemplate <typename scalar_t, int NumILP = 1>\n__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(\n    scalar_t count) {\n  return detail::KernelLoop<scalar_t>(blockIdx.y * blockDim.y + threadIdx.y,\n                                   
   gridDim.y * blockDim.y * NumILP, count);\n}\n\n// Helper to visit indices in the range 0 <= i < count using the z-coordinate.\n// Usage: for(int i : KernelLoopZ(count)) { visit(i); }\ntemplate <typename scalar_t, int NumILP = 1>\n__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(\n    scalar_t count) {\n  return detail::KernelLoop<scalar_t>(blockIdx.z * blockDim.z + threadIdx.z,\n                                      gridDim.z * blockDim.z * NumILP, count);\n}\n\n}  // namespace tv\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/tensorview/helper_launch.h",
    "content": "#pragma once\n// from pytorch.aten\n#include \"tensorview.h\"\nnamespace tv {\nnamespace launch {\n\ntemplate <typename T1, typename T2>\ninline int DivUp(const T1 a, const T2 b) {\n  return (a + b - 1) / b;\n}\n\nconstexpr int CUDA_NUM_THREADS = 1024;\nconstexpr int MUSA_NUM_THREADS = 1024;\ninline int getBlocks(const int N) {\n  TV_ASSERT_RT_ERR(N > 0,\n                   \"CUDA kernel launch blocks must be positive, but got N=\", N);\n  return DivUp(N, CUDA_NUM_THREADS);\n}\n}  // namespace launch\n}  // namespace tv\n"
  },
  {
    "path": "mmcv/ops/csrc/common/utils/spconv/tensorview/tensorview.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#pragma once\n\n#include <algorithm>\n#include <cassert>\n#include <cstdlib>\n#include <iostream>\n#include <memory>\n#include <sstream>\n#include <type_traits>\n#include <vector>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nnamespace tv {\n\n#if defined(__NVCC__) || defined(__HIP__) || defined(__MUSA__)\n#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__\n#define TV_DEVICE_INLINE __forceinline__ __device__\n#define TV_HOST_DEVICE __device__ __host__\n#define TV_ASSERT(expr) assert(expr)\n#elif defined(__CUDACC_RTC__)\n#define TV_ASSERT(expr) assert(expr)\n#define TV_HOST_DEVICE_INLINE __forceinline__ __device__\n#define TV_DEVICE_INLINE __forceinline__ __device__\n#define TV_HOST_DEVICE __device__ __host__\n#else\n#define TV_ASSERT(x) assert(x)\n#define TV_HOST_DEVICE_INLINE inline\n#define TV_HOST_DEVICE\n#endif\n\n#define TV_REQUIRE(expr, ...) \\\n  {                           \\\n    if (!(expr)) {            \\\n      printf(__VA_ARGS__);    \\\n      assert(expr);           \\\n    }                         \\\n  }\n\n#define TV_DEVICE_REQUIRE(expr, ...)                      
\\\n  {                                                       \\\n    if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \\\n    assert(expr);                                         \\\n  }\n\ntemplate <class SStream, class T>\nvoid sstream_print(SStream &ss, T val) {\n  ss << val;\n}\n\ntemplate <class SStream, class T, class... TArgs>\nvoid sstream_print(SStream &ss, T val, TArgs... args) {\n  ss << val << \" \";\n  sstream_print(ss, args...);\n}\n\n#define TV_ASSERT_RT_ERR(expr, ...)                     \\\n  {                                                     \\\n    if (!(expr)) {                                      \\\n      std::stringstream __macro_s;                      \\\n      __macro_s << __FILE__ << \" \" << __LINE__ << \"\\n\"; \\\n      __macro_s << #expr << \" assert failed. \";         \\\n      tv::sstream_print(__macro_s, __VA_ARGS__);        \\\n      throw std::runtime_error(__macro_s.str());        \\\n    }                                                   \\\n  }\n\n#define TV_ASSERT_INVALID_ARG(expr, ...)                \\\n  {                                                     \\\n    if (!(expr)) {                                      \\\n      std::stringstream __macro_s;                      \\\n      __macro_s << __FILE__ << \" \" << __LINE__ << \"\\n\"; \\\n      __macro_s << #expr << \" assert failed. 
\";         \\\n      tv::sstream_print(__macro_s, __VA_ARGS__);        \\\n      throw std::invalid_argument(__macro_s.str());     \\\n    }                                                   \\\n  }\n\n#define TV_CHECK_CUDA_ERR()                                    \\\n  {                                                            \\\n    auto err = cudaGetLastError();                             \\\n    if (err != cudaSuccess) {                                  \\\n      std::stringstream __macro_s;                             \\\n      __macro_s << __FILE__ << \" \" << __LINE__ << \"\\n\";        \\\n      __macro_s << \"cuda execution failed with error \" << err; \\\n      throw std::runtime_error(__macro_s.str());               \\\n    }                                                          \\\n  }\n\n#define TV_CHECK_MUSA_ERR()                                    \\\n  {                                                            \\\n    auto err = musaGetLastError();                             \\\n    if (err != musaSuccess) {                                  \\\n      std::stringstream __macro_s;                             \\\n      __macro_s << __FILE__ << \" \" << __LINE__ << \"\\n\";        \\\n      __macro_s << \"musa execution failed with error \" << err; \\\n      throw std::runtime_error(__macro_s.str());               \\\n    }                                                          \\\n  }\n\nstruct CPU {};\n\n#define TV_MAX_DIM 6\n\ntemplate <typename scalar_t, size_t MaxDim = TV_MAX_DIM>\nstruct SimpleVector {\n public:\n  TV_HOST_DEVICE_INLINE SimpleVector() {};\n  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<scalar_t> q) {\n    TV_ASSERT(q.size() <= MaxDim);\n    mSize = 0;\n    for (scalar_t s : q) {\n      mArray[mSize++] = s;\n    }\n    mSize = q.size();\n  }\n  SimpleVector(const std::vector<scalar_t> &arr) {\n    TV_ASSERT(arr.size() <= MaxDim);\n    for (size_t i = 0; i < arr.size(); ++i) {\n      mArray[i] = arr[i];\n    
}\n    mSize = arr.size();\n  }\n  TV_HOST_DEVICE_INLINE SimpleVector(\n      const SimpleVector<scalar_t, MaxDim> &arr) {\n    TV_ASSERT(arr.size() <= MaxDim);\n    for (size_t i = 0; i < arr.size(); ++i) {\n      mArray[i] = arr[i];\n    }\n    mSize = arr.size();\n  }\n  TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {\n#ifdef TV_DEBUG\n    TV_ASSERT(idx >= 0 && idx < mSize);\n#endif\n    return mArray[idx];\n  }\n  TV_HOST_DEVICE_INLINE const scalar_t &operator[](int idx) const {\n#ifdef TV_DEBUG\n    TV_ASSERT(idx >= 0 && idx < mSize);\n#endif\n    return mArray[idx];\n  }\n  TV_HOST_DEVICE_INLINE void push_back(scalar_t s) {\n#ifdef TV_DEBUG\n    TV_ASSERT(mSize < MaxDim);\n#endif\n    mArray[mSize] = s;\n    mSize++;\n  }\n  TV_HOST_DEVICE_INLINE void pop_back() {\n#ifdef TV_DEBUG\n    TV_ASSERT(mSize > 0);\n#endif\n    mSize--;\n  }\n\n  TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }\n  TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mArray; }\n  TV_HOST_DEVICE_INLINE size_t empty() const { return mSize == 0; }\n\n  typedef size_t size_type;\n\n  class iterator {\n   public:\n    typedef iterator self_type;\n    typedef scalar_t value_type;\n    typedef scalar_t &reference;\n    typedef scalar_t *pointer;\n    typedef std::forward_iterator_tag iterator_category;\n    typedef std::ptrdiff_t difference_type;\n    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}\n    TV_HOST_DEVICE_INLINE self_type operator++(int junk) {\n      self_type i = *this;\n      ptr_++;\n      return i;\n    }\n    TV_HOST_DEVICE_INLINE self_type operator++() {\n      ptr_++;\n      return *this;\n    }\n    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }\n    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }\n    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {\n      return ptr_ == rhs.ptr_;\n    }\n    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {\n      return ptr_ != rhs.ptr_;\n    }\n\n 
  private:\n    pointer ptr_;\n  };\n\n  class const_iterator {\n   public:\n    typedef const_iterator self_type;\n    typedef scalar_t value_type;\n    typedef const scalar_t &reference;\n    typedef const scalar_t *pointer;\n    typedef std::ptrdiff_t difference_type;\n    typedef std::forward_iterator_tag iterator_category;\n    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}\n    TV_HOST_DEVICE_INLINE self_type operator++(int junk) {\n      self_type i = *this;\n      ptr_++;\n      return i;\n    }\n    TV_HOST_DEVICE_INLINE self_type operator++() {\n      ptr_++;\n      return *this;\n    }\n    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }\n    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }\n    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {\n      return ptr_ == rhs.ptr_;\n    }\n    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {\n      return ptr_ != rhs.ptr_;\n    }\n\n   private:\n    pointer ptr_;\n  };\n\n  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }\n\n  TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }\n\n  TV_HOST_DEVICE_INLINE const_iterator begin() const {\n    return const_iterator(mArray);\n  }\n\n  TV_HOST_DEVICE_INLINE const_iterator end() const {\n    return const_iterator(mArray + mSize);\n  }\n  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {\n    return const_iterator(mArray);\n  }\n\n  TV_HOST_DEVICE_INLINE const_iterator cend() const {\n    return const_iterator(mArray + mSize);\n  }\n\n protected:\n  scalar_t mArray[MaxDim];\n  size_t mSize = 0;\n};\n\ntemplate <typename scalar_t, size_t MaxDim>\nbool operator==(const SimpleVector<scalar_t, MaxDim> &lfs,\n                const SimpleVector<scalar_t, MaxDim> &rfs) {\n  if (lfs.size() != rfs.size()) return false;\n  for (size_t i = 0; i < lfs.size(); ++i) {\n    if (lfs[i] != rfs[i]) return false;\n  }\n  return true;\n}\n\ntemplate <typename scalar_t, size_t 
MaxDim>\nbool operator!=(const SimpleVector<scalar_t, MaxDim> &lfs,\n                const SimpleVector<scalar_t, MaxDim> &rfs) {\n  return !(lfs == rfs);\n}\n\nstruct Slice {\n  template <class... Integers>\n  TV_HOST_DEVICE_INLINE Slice(Integers... ints) {\n    static_assert(sizeof...(ints) <= 3, \"slice init must smaller than 3\");\n    SimpleVector<int, 3> slices{int(ints)...};\n    mSlices[0] = -1;\n    mSlices[1] = -1;\n    mSlices[2] = -1;\n    for (size_t i = 0; i < slices.size(); ++i) {\n      mSlices[i] = slices[i];\n    }\n  }\n\n  TV_HOST_DEVICE_INLINE Slice() {\n    mSlices[0] = -1;\n    mSlices[1] = -1;\n    mSlices[2] = -1;\n  }\n  template <typename scalar_t>\n  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<scalar_t> slice) {\n    mSlices[0] = -1;\n    mSlices[1] = -1;\n    mSlices[2] = -1;\n    TV_ASSERT(slice.size() <= 3);\n    int idx = 0;\n    for (scalar_t s : slice) {\n      mSlices[idx] = int(s);\n      ++idx;\n    }\n  }\n  TV_HOST_DEVICE_INLINE int &operator[](int idx) {\n#ifdef TV_DEBUG\n    TV_ASSERT(idx >= 0 && idx < 3);\n#endif\n    return mSlices[idx];\n  }\n  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {\n#ifdef TV_DEBUG\n    TV_ASSERT(idx >= 0 && idx < 3);\n#endif\n    return mSlices[idx];\n  }\n\n protected:\n  int mSlices[3];\n};\n\ntemplate <size_t MaxDim = TV_MAX_DIM>\nstruct ShapeBase : public SimpleVector<int, MaxDim> {\n  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>() {};\n  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)\n      : SimpleVector<int, MaxDim>(shape) {}\n\n  // TODO: find out why this template can no be used on windows\n  // template <typename scalar_t, template <class...> class Container>\n  // ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}\n  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)\n      : SimpleVector<int, MaxDim>(shape) {}\n  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}\n\n  
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;\n  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {\n#ifdef TV_DEBUG\n    TV_ASSERT(start >= 0 && end < this->mSize && end > start);\n#endif\n    ShapeBase<MaxDim> shape;\n    for (int i = start; i < end; ++i) {\n      shape.push_back(this->mArray[i]);\n    }\n    return shape;\n  }\n  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {\n#ifdef TV_DEBUG\n    TV_ASSERT(start >= 0 && start <= this->mSize);\n#endif\n    ShapeBase<MaxDim> shape;\n    for (int i = start; i < this->mSize; ++i) {\n      shape.push_back(this->mArray[i]);\n    }\n    return shape;\n  }\n\n  TV_HOST_DEVICE_INLINE size_t size() const {\n    if (this->mSize == 0) return 0;\n    size_t s = 1;\n    for (int i = 0; i < int(this->mSize); ++i) {\n      s *= this->mArray[i];\n    }\n    return s;\n  }\n  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }\n  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {\n    ShapeBase<MaxDim> shape;\n    for (int i = 0; i < this->mSize; ++i) {\n      if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);\n    }\n    return shape;\n  }\n  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {\n    ShapeBase<MaxDim> shape;\n    for (int i = 0; i < this->mSize; ++i) {\n      if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);\n    }\n    return shape;\n  }\n};\n\nusing Shape = ShapeBase<TV_MAX_DIM>;\n\ntemplate <class... Inds>\nTV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,\n                                           Inds... 
indexes) {\n  unsigned offset = 0;\n  unsigned m = 1;\n  int indexes_vec[sizeof...(indexes)] = {indexes...};\n#ifdef TV_DEBUG\n  TV_ASSERT(sizeof...(indexes) == shape.size());\n#endif\n#pragma unroll\n  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {\n    offset += m * indexes_vec[i];\n    m *= shape[i];\n  }\n  return offset;\n}\n\nTV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,\n                                           std::vector<int> &indexes_vec) {\n  unsigned offset = 0;\n  unsigned m = 1;\n  for (int i = shape.size() - 1; i >= 0; --i) {\n    offset += m * indexes_vec[i];\n    m *= shape[i];\n  }\n  return offset;\n}\n\ntemplate <class... Inds>\nTV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,\n                                           Inds... indexes) {\n  unsigned offset = 0;\n  unsigned m = 1;\n  int indexes_vec[sizeof...(indexes)] = {indexes...};\n#pragma unroll\n  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {\n    offset += m * indexes_vec[i];\n    m *= shape[i];\n  }\n  return offset;\n}\n\nTV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,\n                                           const Shape &indexes_vec) {\n  unsigned offset = 0;\n  unsigned m = 1;\n  for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {\n    offset += m * indexes_vec[i];\n    m *= shape[i];\n  }\n  return offset;\n}\n\ntemplate <typename Index, unsigned NDim>\nTV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,\n                                           const Index *shape) {\n  unsigned offset = 0;\n  unsigned m = 1;\n#pragma unroll\n  for (int i = NDim - 1; i >= 0; --i) {\n    offset += m * indexes[i];\n    m *= shape[i];\n  }\n  return offset;\n}\n\ntemplate <typename Index, unsigned NDim>\nTV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,\n                                           const Index *shape) {\n#pragma unroll\n  for (int i = NDim - 1; i >= 0; --i) {\n    output[i] = index % 
shape[i];\n    index -= output[i];\n    index /= shape[i];\n  }\n  return index;\n}\n\ntemplate <int N>\nstruct ArrayIndexRowMajor {\n  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,\n                                            const Shape &indexes) {\n    return indexes[N - 1] +\n           shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);\n  }\n};\n\ntemplate <>\nstruct ArrayIndexRowMajor<0> {\n  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,\n                                            const Shape &indexes) {\n    return 0;\n  }\n};\n\nnamespace detail {\ntemplate <typename scalar_t>\nconstexpr const char *simpleTypeName(scalar_t val = scalar_t());\ntemplate <>\nconstexpr const char *simpleTypeName(float val) {\n  return \"float32\";\n}\ntemplate <>\nconstexpr const char *simpleTypeName(double val) {\n  return \"float64\";\n}\ntemplate <>\nconstexpr const char *simpleTypeName(int val) {\n  return \"int32\";\n}\ntemplate <>\nconstexpr const char *simpleTypeName(unsigned val) {\n  return \"uint32\";\n}\ntemplate <>\nconstexpr const char *simpleTypeName(long val) {\n  return \"int64\";\n}\ntemplate <>\nconstexpr const char *simpleTypeName(unsigned long val) {\n  return \"uint64\";\n}\n};  // namespace detail\n\ntemplate <typename scalar_t, int Rank = -1>\nstruct TensorView {\n  TV_HOST_DEVICE_INLINE TensorView() {}\n  explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Shape shape)\n      : mPtr(ptr), mShape(shape) {}\n  template <class... Integers>\n  explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Integers... 
shapes)\n      : mPtr(ptr) {\n    mShape = {int(shapes)...};\n  }\n\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(\n      const TensorView<scalar_t, Rank> &tensor) {\n    TV_REQUIRE(tensor.shape() == shape(), \"you must provide same input size%s\",\n               \"\\n\");\n    scalar_t *ptr = mPtr;\n    const scalar_t *other_ptr = tensor.data();\n    for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);\n    return *this;\n  }\n\n  template <typename T1>\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(\n      std::initializer_list<T1> seq) {\n    TV_REQUIRE(seq.size() == size(), \"you must provide same input size%s\",\n               \"\\n\");\n    scalar_t *ptr = mPtr;\n    for (const T1 &s : seq) *(ptr++) = scalar_t(s);\n    return *this;\n  }\n\n  template <class... Inds>\n  TV_HOST_DEVICE_INLINE scalar_t &operator()(Inds... inds) {\n#ifdef TV_DEBUG\n    int idxes[sizeof...(Inds)]{int(inds)...};\n    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),\n               \"you provide %d indexes, but dim is %d\\n\", sizeof...(inds),\n               mShape.ndim());\n    for (int i = 0; i < sizeof...(inds); ++i) {\n      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],\n                 \"index-%d(%d) out-of-range: [0, %d)\\n\", i, idxes[i],\n                 mShape[i]);\n    }\n#endif\n    return mPtr[rowArrayIdx(mShape, int(inds)...)];\n  }\n  template <class... Inds>\n  TV_HOST_DEVICE_INLINE const scalar_t &operator()(Inds... 
inds) const {\n#ifdef TV_DEBUG\n    int idxes[sizeof...(Inds)]{int(inds)...};\n    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),\n               \"you provide %d indexes, but dim is %d\\n\", sizeof...(inds),\n               mShape.ndim());\n    for (int i = 0; i < sizeof...(inds); ++i) {\n      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],\n                 \"index-%d(%d) out-of-range: [0, %d)\\n\", i, idxes[i],\n                 mShape[i]);\n    }\n#endif\n    return mPtr[rowArrayIdx(mShape, int(inds)...)];\n  }\n  TV_HOST_DEVICE_INLINE scalar_t &operator()() {\n#if defined TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mPtr != nullptr,\n                      \"you want get value but the view is empty.%s\", \"\\n\");\n    TV_DEVICE_REQUIRE(mShape.ndim() == 0,\n                      \"you provide 0 indexes, but dim is %ld\\n\", mShape.ndim());\n#else\n    TV_REQUIRE(mPtr != nullptr, \"you want get value but the view is empty.%s\",\n               \"\\n\");\n    TV_REQUIRE(mShape.ndim() == 0, \"you provide 0 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n#endif\n#endif\n    return mPtr[0];\n  }\n  TV_HOST_DEVICE_INLINE const scalar_t &operator()() const {\n#if defined TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mPtr != nullptr,\n                      \"you want get value but the view is empty.%s\", \"\\n\");\n    TV_DEVICE_REQUIRE(mShape.ndim() == 0,\n                      \"you provide 0 indexes, but dim is %ld\\n\", mShape.ndim());\n#else\n    TV_REQUIRE(mPtr != nullptr, \"you want get value but the view is empty.%s\",\n               \"\\n\");\n    TV_REQUIRE(mShape.ndim() == 0, \"you provide 0 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n#endif\n#endif\n    return mPtr[0];\n  }\n\n  template <class T1>\n  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1) {\n#if defined TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 1,\n                      \"you provide 1 indexes, 
but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, i1, mShape[0]);\n#else\n    TV_REQUIRE(mShape.ndim() == 1, \"you provide 1 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, i1, mShape[0]);\n#endif\n#endif\n    return mPtr[i1];\n  }\n  template <class T1, class T2>\n  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2) {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 2,\n                      \"you provide 2 indexes, but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2),\n                      mShape[1]);\n#else\n    TV_REQUIRE(mShape.ndim() == 2, \"you provide 2 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1), mShape[0]);\n    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2), mShape[1]);\n#endif\n#endif\n    return mPtr[i1 * mShape[1] + i2];\n  }\n  template <class T1, class T2, class T3>\n  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3) {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 3,\n                      \"you provide 3 indexes, but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],\n                 
     \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2),\n                      mShape[1]);\n    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3),\n                      mShape[2]);\n#else\n    TV_REQUIRE(mShape.ndim() == 3, \"you provide 3 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1), mShape[0]);\n    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2), mShape[1]);\n    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3), mShape[2]);\n#endif\n#endif\n    return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];\n  }\n  template <class T1, class T2, class T3, class T4>\n  TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 4,\n                      \"you provide 4 indexes, but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2),\n                      mShape[1]);\n    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3),\n                      mShape[2]);\n    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 3, int(i4),\n                      mShape[3]);\n#else\n    TV_REQUIRE(mShape.ndim() == 4, \"you provide 4 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: 
[0, %d)\\n\", 0, int(i1), mShape[0]);\n    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2), mShape[1]);\n    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3), mShape[2]);\n    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 3, int(i4), mShape[3]);\n#endif\n#endif\n    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];\n  }\n\n  template <class T1>\n  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1) const {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 1,\n                      \"you provide 1 indexes, but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n#else\n    TV_REQUIRE(mShape.ndim() == 1, \"you provide 1 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1), mShape[0]);\n#endif\n#endif\n    return mPtr[i1];\n  }\n  template <class T1, class T2>\n  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2) const {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 2,\n                      \"you provide 2 indexes, but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2),\n                      mShape[1]);\n#else\n    TV_REQUIRE(mShape.ndim() == 2, \"you provide 2 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < 
mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1), mShape[0]);\n    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2), mShape[1]);\n\n#endif\n#endif\n    return mPtr[i1 * mShape[1] + i2];\n  }\n  template <class T1, class T2, class T3>\n  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3) const {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 3,\n                      \"you provide 3 indexes, but dim is %ld\\n\", mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2),\n                      mShape[1]);\n    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3),\n                      mShape[2]);\n#else\n    TV_REQUIRE(mShape.ndim() == 3, \"you provide 3 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1), mShape[0]);\n    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2), mShape[1]);\n    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3), mShape[2]);\n#endif\n#endif\n    return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];\n  }\n  template <class T1, class T2, class T3, class T4>\n  TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3,\n                                                   T4 i4) const {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(mShape.ndim() == 4,\n                      \"you provide 4 indexes, but dim is %ld\\n\", 
mShape.ndim());\n    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1),\n                      mShape[0]);\n    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2),\n                      mShape[1]);\n    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3),\n                      mShape[2]);\n    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],\n                      \"index-%d(%d) out-of-range: [0, %d)\\n\", 3, int(i4),\n                      mShape[3]);\n#else\n    TV_REQUIRE(mShape.ndim() == 4, \"you provide 4 indexes, but dim is %ld\\n\",\n               mShape.ndim());\n    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 0, int(i1), mShape[0]);\n    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 1, int(i2), mShape[1]);\n    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 2, int(i3), mShape[2]);\n    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],\n               \"index-%d(%d) out-of-range: [0, %d)\\n\", 3, int(i4), mShape[3]);\n#endif\n#endif\n    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];\n  }\n\n  TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {\n#ifdef TV_DEBUG\n#if defined(__CUDA_ARCH__)\n    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),\n                      \"index(%d) out-of-range: [0, %ld)\\n\", int(idx), size());\n#else\n    TV_REQUIRE(idx >= 0 && idx < size(), \"index(%d) out-of-range: [0, %ld)\\n\",\n               int(idx), size());\n#endif\n#endif\n    return mPtr[idx];\n  }\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> operator[](\n      SimpleVector<Slice> slice_vec) {\n    return _subview(slice_vec);\n  }\n  TV_HOST_DEVICE_INLINE const TensorView<scalar_t, Rank> 
operator[](\n      SimpleVector<Slice> slice_vec) const {\n    return _subview(slice_vec);\n  }\n  TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }\n  TV_HOST_DEVICE_INLINE scalar_t *data() { return mPtr; }\n  TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mPtr; }\n  TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }\n  TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }\n  TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }\n  template <class... Inds>\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Inds... newShapes) {\n    Shape shapes{int(newShapes)...};\n    TV_ASSERT(shapes.size() == size());\n    mShape = shapes;\n    return *this;\n  }\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Shape shapes) {\n    TV_ASSERT(shapes.size() == size());\n    mShape = shapes;\n    return *this;\n  }\n  template <class... Inds>\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(\n      Inds... newShapes) const {\n    Shape shapes{int(newShapes)...};\n    for (size_t i = 0; i < shapes.ndim(); ++i) {\n      if (shapes[i] == -1) {\n        shapes[i] = 1;\n        shapes[i] = size() / shapes.size();\n        break;\n      }\n    }\n    TV_ASSERT(shapes.size() == size());\n    return TensorView<scalar_t, Rank>(mPtr, shapes);\n  }\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(Shape shapes) const {\n    TV_ASSERT(shapes.size() == size());\n    return TensorView<scalar_t, Rank>(mPtr, shapes);\n  }\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze() const {\n    return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze());\n  }\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze(int dim) const {\n    return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze(dim));\n  }\n  TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }\n\n  template <class... 
Slices>\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(\n      Slice slice, Slices... slices) const {\n    return subview<float, Slice, Slices...>(slice, slices...);\n  }\n  template <class T2 = float, class... Slices>\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(\n      Slices... slices) const {\n    Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};\n    Shape new_shape{to_slice(slices)[0]...};\n    Shape start{to_slice(slices)[0]...};\n    TV_ASSERT(new_shape.ndim() <= mShape.ndim());\n    TV_ASSERT(new_shape.ndim() != 0);\n    size_t idxsize = new_shape.ndim();\n    for (size_t i = idxsize; i < mShape.ndim(); ++i) {\n      new_shape.push_back(0);\n      start.push_back(0);\n    }\n#pragma unroll\n    for (size_t i = 0; i < sizeof...(Slices); ++i) {\n      if (slice_vec[i][1] != -1) {\n        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];\n        TV_ASSERT(new_shape[i] >= 0);\n      } else {\n        new_shape[i] = 1;\n      }\n    }\n    auto offset = rowArrayIdx(mShape, start);\n#pragma unroll\n    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {\n      new_shape[i] = mShape[i];\n      TV_ASSERT(new_shape[i] >= 0);\n    }\n    Shape reduced_shape;\n#pragma unroll\n    for (size_t i = 0; i < sizeof...(Slices); ++i) {\n      if (slice_vec[i][1] != -1) {\n        reduced_shape.push_back(new_shape[i]);\n      }\n    }\n#pragma unroll\n    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {\n      reduced_shape.push_back(new_shape[i]);\n    }\n    return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);\n  }\n\n  template <class... Integers>\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(int id,\n                                                           Integers... 
ints) {\n    Shape start = {id, ints...};\n    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {\n      start.push_back(0);\n    }\n    return TensorView<scalar_t, Rank>(mPtr + rowArrayIdx(mShape, start),\n                                      mShape.subshape(sizeof...(ints) + 1));\n  }\n\n  std::string repr() const {\n    std::ostringstream ss;\n    if (empty()) return \"\";\n    if (mShape.ndim() == 0) {\n      ss << *mPtr;\n      ss << \"Tensor: dtype=\" << detail::simpleTypeName<scalar_t>();\n      return ss.str();\n    }\n    Shape counter = mShape;\n    auto tensor_flat = this->view(-1);\n    for (int i = 0; i < counter.ndim(); ++i) {\n      counter[i] = 0;\n      ss << \"[\";\n    }\n    for (size_t i = 0; i < this->size(); ++i) {\n      ss << tensor_flat(rowArrayIdx(mShape, counter));\n      counter[counter.ndim() - 1] += 1;\n      int inc_count = 0;\n      bool print_comma = true;\n      for (int c = counter.ndim() - 1; c >= 0; --c) {\n        if (counter[c] == this->dim(c) && c > 0) {\n          ++inc_count;\n          counter[c - 1] += 1;\n          counter[c] = 0;\n          print_comma = false;\n        }\n      }\n      if (print_comma && i != this->size() - 1) ss << \", \";\n      for (int j = 0; j < inc_count; ++j) {\n        ss << \"]\";\n      }\n      if (i != this->size() - 1) {\n        if (inc_count != 0) ss << \"\\n\";\n        for (int j = 0; j < inc_count; ++j) {\n          ss << \"[\";\n        }\n      }\n    }\n    ss << \"]\";\n    ss << \"Tensor: dtype=\" << detail::simpleTypeName<scalar_t>();\n    return ss.str();\n  }\n\n protected:\n  // TODO: make this function public.\n  // currently this function is called unexpectedly when using subview({0, 0}).\n  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> _subview(\n      SimpleVector<Slice> slice_vec) {\n    Shape new_shape;\n    for (int i = 0; i < slice_vec.size(); ++i) {\n      new_shape.push_back(slice_vec[i][0]);\n    }\n    Shape start = new_shape;\n    
TV_ASSERT(new_shape.ndim() <= mShape.ndim());\n    TV_ASSERT(new_shape.ndim() != 0);\n    size_t idxsize = new_shape.ndim();\n    for (size_t i = idxsize; i < mShape.ndim(); ++i) {\n      new_shape.push_back(0);\n      start.push_back(0);\n    }\n    for (size_t i = 0; i < slice_vec.size(); ++i) {\n      if (slice_vec[i][1] != -1) {\n        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];\n        TV_ASSERT(new_shape[i] >= 0);\n      } else {\n        new_shape[i] = 1;  // reduce dim\n      }\n    }\n    auto offset = rowArrayIdx(mShape, start);\n    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {\n      new_shape[i] = mShape[i];\n      TV_ASSERT(new_shape[i] >= 0);\n    }\n    Shape reduced_shape;\n    for (size_t i = 0; i < slice_vec.size(); ++i) {\n      if (slice_vec[i][1] != -1) {\n        reduced_shape.push_back(new_shape[i]);\n      }\n    }\n    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {\n      reduced_shape.push_back(new_shape[i]);\n    }\n    return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);\n  }\n  template <typename T1>\n  TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {\n    return Slice{int(s), -1, -1};\n  }\n\n  TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }\n\n  scalar_t *mPtr = nullptr;\n  Shape mShape;\n};\n\ntemplate <typename Os, typename scalar_t, int Rank>\nOs &operator<<(Os &os, const TensorView<scalar_t, Rank> &dt) {\n  os << dt.repr();\n  return os;\n}\n\ntemplate <typename Os, typename scalar_t, int Rank>\nOs &operator<<(Os &os, const TensorView<const scalar_t, Rank> &dt) {\n  os << dt.repr();\n  return os;\n}\n\nnamespace detail {\ntemplate <typename scalar_t>\nconstexpr const char *printfTypeFormat(scalar_t val = scalar_t());\ntemplate <>\nconstexpr const char *printfTypeFormat(float val) {\n  return \"%.2f\";\n}\ntemplate <>\nconstexpr const char *printfTypeFormat(double val) {\n  return \"%.2f\";\n}\ntemplate <>\nconstexpr const char *printfTypeFormat(int val) 
{\n  return \"%d\";\n}\ntemplate <>\nconstexpr const char *printfTypeFormat(unsigned val) {\n  return \"%u\";\n}\ntemplate <>\nconstexpr const char *printfTypeFormat(long val) {\n  return \"%ld\";\n}\ntemplate <>\nconstexpr const char *printfTypeFormat(unsigned long val) {\n  return \"%lu\";\n}\n};  // namespace detail\n\ntemplate <typename scalar_t>\nTV_HOST_DEVICE void printTensorView(const TensorView<scalar_t> tensor,\n                                    const char *format) {\n  if (tensor.empty()) return;\n  if (tensor.ndim() == 0) {\n    printf(format, tensor());\n    printf(\"\\n\");\n    return;\n  }\n  Shape counter = tensor.shape();\n  auto tensor_flat = tensor.view(-1);\n  for (int i = 0; i < counter.ndim(); ++i) {\n    counter[i] = 0;\n    printf(\"[\");\n  }\n  for (size_t i = 0; i < tensor.size(); ++i) {\n    printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));\n    counter[counter.ndim() - 1] += 1;\n    int inc_count = 0;\n    bool print_comma = true;\n    for (int c = counter.ndim() - 1; c >= 0; --c) {\n      if (counter[c] == tensor.dim(c) && c > 0) {\n        ++inc_count;\n        counter[c - 1] += 1;\n        counter[c] = 0;\n        print_comma = false;\n      }\n    }\n    if (print_comma && i != tensor.size() - 1) printf(\", \");\n    for (int j = 0; j < inc_count; ++j) {\n      printf(\"]\");\n    }\n    if (i != tensor.size() - 1) {\n      if (inc_count != 0) printf(\"\\n\");\n      for (int j = 0; j < inc_count; ++j) {\n        printf(\"[\");\n      }\n    }\n  }\n  printf(\"]\\n\");\n}\n\ntemplate <typename scalar_t>\nTV_HOST_DEVICE void printTensorView(TensorView<scalar_t> tensor) {\n  using Traw = typename std::remove_const<scalar_t>::type;\n  return printTensorView(tensor, detail::printfTypeFormat<Traw>());\n}\ntemplate <typename scalar_t>\nTV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape) {\n  using Traw = typename std::remove_const<scalar_t>::type;\n  return printTensorView(TensorView<const 
scalar_t>(ptr, shape),\n                         detail::printfTypeFormat<Traw>());\n}\ntemplate <typename scalar_t>\nTV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape,\n                                    const char *format) {\n  return printTensorView(TensorView<const scalar_t>(ptr, shape), format);\n}\n\n}  // namespace tv\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/active_rotated_filter.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/ActiveRotatingFilter.h\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output) {\n  DISPATCH_DEVICE_IMPL(active_rotated_filter_forward_impl, input, indices,\n                       output);\n}\n\nvoid active_rotated_filter_backward_impl(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in) {\n  DISPATCH_DEVICE_IMPL(active_rotated_filter_backward_impl, grad_out, indices,\n                       grad_in);\n}\n\nvoid active_rotated_filter_forward(const Tensor input, const Tensor indices,\n                                   Tensor output) {\n  active_rotated_filter_forward_impl(input, indices, output);\n}\n\nvoid active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,\n                                    Tensor grad_in) {\n  active_rotated_filter_backward_impl(grad_out, indices, grad_in);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/active_rotated_filter_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"active_rotated_filter_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid active_rotated_filter_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto input = buildATensor(ctx, ins[0]);\n  auto indices = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  active_rotated_filter_forward(input, indices, output);\n}\n\nvoid active_rotated_filter_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto grad_out = buildATensor(ctx, ins[0]);\n  auto indices = buildATensor(ctx, ins[1]);\n  auto grad_in = buildATensor(ctx, outs[0]);\n  active_rotated_filter_backward(grad_out, indices, grad_in);\n}\n#endif\n\nvoid active_rotated_filter_forward_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto input = buildATensor(ctx, ins[0]);\n  auto indices = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  active_rotated_filter_forward(input, indices, output);\n}\n\nvoid active_rotated_filter_backward_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto grad_out = buildATensor(ctx, ins[0]);\n  auto indices = buildATensor(ctx, ins[1]);\n  auto grad_in = buildATensor(ctx, outs[0]);\n  active_rotated_filter_backward(grad_out, indices, grad_in);\n}\n\nPARROTS_EXTENSION_REGISTER(active_rotated_filter_forward)\n    .input(2)\n    .output(1)\n    .apply(active_rotated_filter_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    
.apply(active_rotated_filter_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(active_rotated_filter_backward)\n    .input(2)\n    .output(1)\n    .apply(active_rotated_filter_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(active_rotated_filter_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/active_rotated_filter_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ACTIVE_ROTATED_FILTER_PYTORCH_H\n#define ACTIVE_ROTATED_FILTER_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid active_rotated_filter_forward(const Tensor input, const Tensor indices,\n                                   Tensor output);\n\nvoid active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,\n                                    Tensor grad_in);\n\n#endif  // ACTIVE_ROTATED_FILTER_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/assign_score_withk.cpp",
    "content": "// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor& points,\n                                     const Tensor& centers,\n                                     const Tensor& scores,\n                                     const Tensor& knn_idx, Tensor& output) {\n  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,\n                       aggregate, points, centers, scores, knn_idx, output);\n}\n\nvoid assign_score_withk_backward_impl(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores) {\n  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,\n                       aggregate, grad_out, points, centers, scores, knn_idx,\n                       grad_points, grad_centers, grad_scores);\n}\n\nvoid assign_score_withk_forward(const Tensor& points, const Tensor& centers,\n                                const Tensor& scores, const Tensor& knn_idx,\n                                Tensor& output, int B, int N0, int N1, int M,\n                                int K, int O, int aggregate) {\n  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,\n                                  centers, scores, knn_idx, output);\n}\n\nvoid assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,\n                                 const Tensor& centers, const Tensor& scores,\n                                 const Tensor& knn_idx, Tensor& grad_points,\n                                 Tensor& grad_centers, Tensor& 
grad_scores,\n                                 int B, int N0, int N1, int M, int K, int O,\n                                 int aggregate) {\n  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,\n                                   points, centers, scores, knn_idx,\n                                   grad_points, grad_centers, grad_scores);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/assign_score_withk_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"assign_score_withk_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid assign_score_withk_forward_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  int B, N0, N1, M, K, O, aggregate;\n  SSAttrs(attr)\n      .get<int>(\"B\", B)\n      .get<int>(\"N0\", N0)\n      .get<int>(\"N1\", N1)\n      .get<int>(\"M\", M)\n      .get<int>(\"K\", K)\n      .get<int>(\"O\", O)\n      .get<int>(\"aggregate\", aggregate)\n      .done();\n\n  const auto& points = buildATensor(ctx, ins[0]);\n  const auto& centers = buildATensor(ctx, ins[1]);\n  const auto& scores = buildATensor(ctx, ins[2]);\n  const auto& knn_idx = buildATensor(ctx, ins[3]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  assign_score_withk_forward(points, centers, scores, knn_idx, output, B, N0,\n                             N1, M, K, O, aggregate);\n}\n\nvoid assign_score_withk_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int B, N0, N1, M, K, O, aggregate;\n  SSAttrs(attr)\n      .get<int>(\"B\", B)\n      .get<int>(\"N0\", N0)\n      .get<int>(\"N1\", N1)\n      .get<int>(\"M\", M)\n      .get<int>(\"K\", K)\n      .get<int>(\"O\", O)\n      .get<int>(\"aggregate\", aggregate)\n      .done();\n\n  const auto& grad_out = buildATensor(ctx, ins[0]);\n  const auto& points = buildATensor(ctx, ins[1]);\n  const auto& centers = buildATensor(ctx, ins[2]);\n  const auto& scores = buildATensor(ctx, ins[3]);\n  const auto& knn_idx = buildATensor(ctx, ins[4]);\n\n  auto grad_points = buildATensor(ctx, 
outs[0]);\n  auto grad_centers = buildATensor(ctx, outs[1]);\n  auto grad_scores = buildATensor(ctx, outs[2]);\n  assign_score_withk_backward(grad_out, points, centers, scores, knn_idx,\n                              grad_points, grad_centers, grad_scores, B, N0, N1,\n                              M, K, O, aggregate);\n}\n\nPARROTS_EXTENSION_REGISTER(assign_score_withk_forward)\n    .attr(\"B\")\n    .attr(\"N0\")\n    .attr(\"N1\")\n    .attr(\"M\")\n    .attr(\"K\")\n    .attr(\"O\")\n    .attr(\"aggregate\")\n    .input(4)\n    .output(1)\n    .apply(assign_score_withk_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(assign_score_withk_backward)\n    .attr(\"B\")\n    .attr(\"N0\")\n    .attr(\"N1\")\n    .attr(\"M\")\n    .attr(\"K\")\n    .attr(\"O\")\n    .attr(\"aggregate\")\n    .input(5)\n    .output(3)\n    .apply(assign_score_withk_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/assign_score_withk_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ASSIGN_SCORE_WITHK_PYTORCH_H\n#define ASSIGN_SCORE_WITHK_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid assign_score_withk_forward(const Tensor& points, const Tensor& centers,\n                                const Tensor& scores, const Tensor& knn_idx,\n                                Tensor& output, int B, int N0, int N1, int M,\n                                int K, int O, int aggregate);\n\nvoid assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,\n                                 const Tensor& centers, const Tensor& scores,\n                                 const Tensor& knn_idx, Tensor& grad_points,\n                                 Tensor& grad_centers, Tensor& grad_scores,\n                                 int B, int N0, int N1, int M, int K, int O,\n                                 int aggregate);\n\n#endif  // ASSIGN_SCORE_WITHK_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/ball_query._parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"ball_query_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid ball_query_parrots(CudaContext& ctx, const SSElement& attr,\n                        const OperatorBase::in_list_t& ins,\n                        OperatorBase::out_list_t& outs) {\n  int b, n, m, nsample;\n  float min_radius, max_radius;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"n\", n)\n      .get<int>(\"m\", m)\n      .get<int>(\"nsample\", nsample)\n      .get<float>(\"min_radius\", min_radius)\n      .get<float>(\"max_radius\", max_radius)\n      .done();\n\n  const auto& center_xyz = buildATensor(ctx, ins[0]);\n  const auto& xyz = buildATensor(ctx, ins[1]);\n  auto idx = buildATensor(ctx, outs[0]);\n  ball_query_forward(center_xyz, xyz, idx, b, n, m, min_radius, max_radius,\n                     nsample);\n}\n\nPARROTS_EXTENSION_REGISTER(ball_query_forward)\n    .attr(\"b\")\n    .attr(\"n\")\n    .attr(\"m\")\n    .attr(\"nsample\")\n    .attr(\"min_radius\")\n    .attr(\"max_radius\")\n    .input(2)\n    .output(1)\n    .apply(ball_query_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/ball_query.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx) {\n  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,\n                       nsample, new_xyz, xyz, idx);\n}\n\nvoid ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,\n                        Tensor idx_tensor, int b, int n, int m,\n                        float min_radius, float max_radius, int nsample) {\n  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,\n                          new_xyz_tensor, xyz_tensor, idx_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/ball_query_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef BALL_QUERY_PYTORCH_H\n#define BALL_QUERY_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid ball_query_forward(const Tensor new_xyz, const Tensor xyz, Tensor idx,\n                        int b, int n, int m, float min_radius, float max_radius,\n                        int nsample);\n\n#endif  // BALL_QUERY_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/bbox_overlaps.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset) {\n  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,\n                       aligned, offset);\n}\n\nvoid bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                   const int mode, const bool aligned, const int offset) {\n  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/bbox_overlaps_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"bbox_overlaps_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\n/*\n * void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor\n * ious, const int mode, const bool aligned, const int offset);\n */\nvoid bbox_overlaps_parrots(CudaContext& ctx, const SSElement& attr,\n                           const OperatorBase::in_list_t& ins,\n                           OperatorBase::out_list_t& outs) {\n  int mode, offset;\n  bool aligned;\n  SSAttrs(attr)\n      .get<int>(\"mode\", mode)\n      .get<bool>(\"aligned\", aligned)\n      .get<int>(\"offset\", offset)\n      .done();\n\n  const auto& bboxes1 = buildATensor(ctx, ins[0]);\n  const auto& bboxes2 = buildATensor(ctx, ins[1]);\n  auto ious = buildATensor(ctx, outs[0]);\n  bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);\n}\n\nPARROTS_EXTENSION_REGISTER(bbox_overlaps)\n    .attr(\"mode\")\n    .attr(\"aligned\")\n    .attr(\"offset\")\n    .input(2)\n    .output(1)\n    .apply(bbox_overlaps_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/bbox_overlaps_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef BBOX_OVERLAPS_PYTORCH_H\n#define BBOX_OVERLAPS_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\n\n#endif  // BBOX_OVERLAPS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/border_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid border_align_forward_impl(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size) {\n  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,\n                       argmax_idx, pool_size);\n}\n\nvoid border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size) {\n  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,\n                       argmax_idx, grad_input, pool_size);\n}\n\nvoid border_align_forward(const Tensor &input, const Tensor &boxes,\n                          Tensor output, Tensor argmax_idx,\n                          const int pool_size) {\n  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);\n}\n\nvoid border_align_backward(const Tensor &grad_output, const Tensor &boxes,\n                           const Tensor &argmax_idx, Tensor grad_input,\n                           const int pool_size) {\n  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,\n                             pool_size);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/border_align_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"border_align_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid border_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                       const OperatorBase::in_list_t& ins,\n                                       OperatorBase::out_list_t& outs) {\n  int pool_size;\n  SSAttrs(attr).get<int>(\"pool_size\", pool_size).done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& boxes = buildATensor(ctx, ins[1]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  auto argmax_idx = buildATensor(ctx, outs[1]);\n  border_align_forward_cuda(input, boxes, output, argmax_idx, pool_size);\n}\n\nvoid border_align_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  int pool_size;\n  SSAttrs(attr).get<int>(\"pool_size\", pool_size).done();\n\n  const auto& top_grad = buildATensor(ctx, ins[0]);\n  const auto& boxes = buildATensor(ctx, ins[1]);\n  const auto& argmax_idx = buildATensor(ctx, ins[2]);\n\n  auto bottom_grad = buildATensor(ctx, outs[0]);\n  border_align_backward_cuda(top_grad, boxes, argmax_idx, bottom_grad,\n                             pool_size);\n}\n\nPARROTS_EXTENSION_REGISTER(border_align_forward)\n    .attr(\"pool_size\")\n    .input(2)\n    .output(2)\n    .apply(border_align_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(border_align_backward)\n    .attr(\"pool_size\")\n    .input(3)\n    .output(1)\n    .apply(border_align_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/border_align_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef BORDER_ALIGN_PYTORCH_H\n#define BORDER_ALIGN_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\n#ifdef MMCV_WITH_CUDA\nvoid border_align_forward_cuda(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size);\n\nvoid border_align_backward_cuda(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size);\n#endif\n\n#endif  // BORDER_ALIGN_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/box_iou_rotated.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned) {\n  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,\n                       aligned);\n}\n\n// Interface for Python\n// NOTE: this function is defined in a .cpp file (not in a header), so it is\n// not declared inline.\nvoid box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                     const int mode_flag, const bool aligned) {\n  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/box_iou_rotated_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"box_iou_rotated_pytorch.h\"\n\nusing namespace parrots;\n\n/*\n * void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor\n * ious, const int mode_flag, const bool aligned);\n */\nvoid box_iou_rotated_cpu_parrots(HostContext& ctx, const SSElement& attr,\n                                 const OperatorBase::in_list_t& ins,\n                                 OperatorBase::out_list_t& outs) {\n  bool aligned;\n  int mode_flag;\n  SSAttrs(attr)\n      .get<bool>(\"aligned\", aligned)\n      .get<int>(\"mode_flag\", mode_flag)\n      .done();\n\n  const auto& boxes1 = buildATensor(ctx, ins[0]);\n  const auto& boxes2 = buildATensor(ctx, ins[1]);\n  auto ious = buildATensor(ctx, outs[0]);\n  box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);\n}\n\n#ifdef MMCV_WITH_CUDA\n/*\n * void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor\n * ious, const int mode_flag, const bool aligned);\n */\nvoid box_iou_rotated_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                  const OperatorBase::in_list_t& ins,\n                                  OperatorBase::out_list_t& outs) {\n  bool aligned;\n  int mode_flag;\n  SSAttrs(attr)\n      .get<bool>(\"aligned\", aligned)\n      .get<int>(\"mode_flag\", mode_flag)\n      .done();\n\n  const auto& boxes1 = buildATensor(ctx, ins[0]);\n  const auto& boxes2 = buildATensor(ctx, ins[1]);\n  auto ious = buildATensor(ctx, outs[0]);\n  box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);\n}\n#endif\n\nPARROTS_EXTENSION_REGISTER(box_iou_rotated)\n    .attr(\"aligned\")\n    .attr(\"mode_flag\")\n    .input(2)\n    .output(1)\n    .apply(box_iou_rotated_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(box_iou_rotated_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/box_iou_rotated_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef BOX_IOU_ROTATED_PYTORCH_H\n#define BOX_IOU_ROTATED_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned);\n\n#ifdef MMCV_WITH_CUDA\nvoid box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\n#endif\n\n#endif  // BOX_IOU_ROTATED_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/carafe.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,\n                       rmasks, output, kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,\n                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,\n                       bottom_grad, mask_grad, kernel_size, group_size,\n                       scale_factor);\n}\n\nvoid carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,\n                    Tensor routput, Tensor rmasks, Tensor output,\n                    int kernel_size, int group_size, int scale_factor) {\n  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,\n                      kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                     Tensor rtop_grad, Tensor rbottom_grad_hs,\n                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,\n                     Tensor mask_grad, int kernel_size, int group_size,\n                     int scale_factor) {\n  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,\n                       rbottom_grad, rmask_grad, bottom_grad, 
mask_grad,\n                       kernel_size, group_size, scale_factor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/carafe_naive.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,\n                       kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,\n                       bottom_grad, mask_grad, kernel_size, group_size,\n                       scale_factor);\n}\n\nvoid carafe_naive_forward(Tensor features, Tensor masks, Tensor output,\n                          int kernel_size, int group_size, int scale_factor) {\n  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,\n                            scale_factor);\n}\n\nvoid carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,\n                           Tensor bottom_grad, Tensor mask_grad,\n                           int kernel_size, int group_size, int scale_factor) {\n  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,\n                             kernel_size, group_size, scale_factor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/carafe_naive_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"carafe_naive_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\n/*void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,\n *                                int kernel_size, int group_size,\n *                                int scale_factor)\n */\nvoid carafe_naive_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                       const OperatorBase::in_list_t& ins,\n                                       OperatorBase::out_list_t& outs) {\n  int kernel_size, group_size, scale_factor;\n  SSAttrs(attr)\n      .get<int>(\"kernel_size\", kernel_size)\n      .get<int>(\"group_size\", group_size)\n      .get<int>(\"scale_factor\", scale_factor)\n      .done();\n\n  const auto& features = buildATensor(ctx, ins[0]);\n  const auto& masks = buildATensor(ctx, ins[1]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,\n                            scale_factor);\n}\n\n/*void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor\n * masks, Tensor bottom_grad, Tensor mask_grad, int kernel_size, int group_size,\n *                                int scale_factor);\n */\nvoid carafe_naive_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  int kernel_size, group_size, scale_factor;\n  SSAttrs(attr)\n      .get<int>(\"kernel_size\", kernel_size)\n      .get<int>(\"group_size\", group_size)\n      .get<int>(\"scale_factor\", scale_factor)\n      .done();\n\n  const auto& top_grad = buildATensor(ctx, ins[0]);\n  const auto& features = buildATensor(ctx, ins[1]);\n  const 
auto& masks = buildATensor(ctx, ins[2]);\n\n  auto bottom_grad = buildATensor(ctx, outs[0]);\n  auto mask_grad = buildATensor(ctx, outs[1]);\n  carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad, mask_grad,\n                             kernel_size, group_size, scale_factor);\n}\n\nPARROTS_EXTENSION_REGISTER(carafe_naive_forward)\n    .attr(\"kernel_size\")\n    .attr(\"group_size\")\n    .attr(\"scale_factor\")\n    .input(2)\n    .output(1)\n    .apply(carafe_naive_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(carafe_naive_backward)\n    .attr(\"kernel_size\")\n    .attr(\"group_size\")\n    .attr(\"scale_factor\")\n    .input(3)\n    .output(2)\n    .apply(carafe_naive_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/carafe_naive_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CARAFE_NAIVE_PYTORCH_H\n#define CARAFE_NAIVE_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor);\n\nvoid carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor);\n#endif  // CARAFE_NAIVE_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/carafe_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"carafe_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\n/*\n * void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,\n *                          Tensor routput, Tensor rmasks, Tensor output,\n *                          int kernel_size, int group_size, int scale_factor);\n */\nvoid carafe_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                 const OperatorBase::in_list_t& ins,\n                                 OperatorBase::out_list_t& outs) {\n  int kernel_size, group_size, scale_factor;\n  SSAttrs(attr)\n      .get<int>(\"kernel_size\", kernel_size)\n      .get<int>(\"group_size\", group_size)\n      .get<int>(\"scale_factor\", scale_factor)\n      .done();\n\n  const auto& features = buildATensor(ctx, ins[0]);\n  const auto& masks = buildATensor(ctx, ins[1]);\n\n  auto rfeatures = buildATensor(ctx, outs[0]);\n  auto routput = buildATensor(ctx, outs[1]);\n  auto rmasks = buildATensor(ctx, outs[2]);\n  auto output = buildATensor(ctx, outs[3]);\n\n  carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,\n                      kernel_size, group_size, scale_factor);\n}\n\n/*\n * void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,\n *                           Tensor rtop_grad, Tensor rbottom_grad_hs,\n *                           Tensor rbottom_grad, Tensor rmask_grad,\n *                           Tensor bottom_grad, Tensor mask_grad, int\n * kernel_size, int group_size, int scale_factor);\n */\nvoid carafe_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                  const OperatorBase::in_list_t& ins,\n                                  OperatorBase::out_list_t& outs) {\n  int kernel_size, group_size, scale_factor;\n  
SSAttrs(attr)\n      .get<int>(\"kernel_size\", kernel_size)\n      .get<int>(\"group_size\", group_size)\n      .get<int>(\"scale_factor\", scale_factor)\n      .done();\n\n  const auto& top_grad = buildATensor(ctx, ins[0]);\n  const auto& rfeatures = buildATensor(ctx, ins[1]);\n  const auto& masks = buildATensor(ctx, ins[2]);\n\n  auto rtop_grad = buildATensor(ctx, outs[0]);\n  auto rbottom_grad_hs = buildATensor(ctx, outs[1]);\n  auto rbottom_grad = buildATensor(ctx, outs[2]);\n  auto rmask_grad = buildATensor(ctx, outs[3]);\n  auto bottom_grad = buildATensor(ctx, outs[4]);\n  auto mask_grad = buildATensor(ctx, outs[5]);\n\n  carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,\n                       rbottom_grad, rmask_grad, bottom_grad, mask_grad,\n                       kernel_size, group_size, scale_factor);\n}\n\nPARROTS_EXTENSION_REGISTER(carafe_forward)\n    .attr(\"kernel_size\")\n    .attr(\"group_size\")\n    .attr(\"scale_factor\")\n    .input(2)\n    .output(4)\n    .apply(carafe_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(carafe_backward)\n    .attr(\"kernel_size\")\n    .attr(\"group_size\")\n    .attr(\"scale_factor\")\n    .input(3)\n    .output(6)\n    .apply(carafe_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/carafe_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CARAFE_PYTORCH_H\n#define CARAFE_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor);\n#endif  // CARAFE_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/chamfer_distance.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2) {\n  DISPATCH_DEVICE_IMPL(chamfer_distance_forward_impl, xyz1, xyz2, dist1, dist2,\n                       idx1, idx2);\n}\n\nvoid chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2) {\n  DISPATCH_DEVICE_IMPL(chamfer_distance_backward_impl, xyz1, xyz2, idx1, idx2,\n                       graddist1, graddist2, gradxyz1, gradxyz2);\n}\n\nvoid chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,\n                              const Tensor dist1, const Tensor dist2,\n                              const Tensor idx1, const Tensor idx2) {\n  chamfer_distance_forward_impl(xyz1, xyz2, dist1, dist2, idx1, idx2);\n}\n\nvoid chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,\n                               Tensor idx1, Tensor idx2, Tensor graddist1,\n                               Tensor graddist2, Tensor gradxyz1,\n                               Tensor gradxyz2) {\n  chamfer_distance_backward_impl(xyz1, xyz2, idx1, idx2, graddist1, graddist2,\n                                 gradxyz1, gradxyz2);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/chamfer_distance_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"chamfer_distance_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid chamfer_distance_forward_cuda_parrots(CudaContext& ctx,\n                                           const SSElement& attr,\n                                           const OperatorBase::in_list_t& ins,\n                                           OperatorBase::out_list_t& outs) {\n  auto xyz1 = buildATensor(ctx, ins[0]);\n  auto xyz2 = buildATensor(ctx, ins[1]);\n  auto dist1 = buildATensor(ctx, outs[0]);\n  auto dist2 = buildATensor(ctx, outs[1]);\n  auto idx1 = buildATensor(ctx, outs[2]);\n  auto idx2 = buildATensor(ctx, outs[3]);\n  chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);\n}\n\nvoid chamfer_distance_backward_cuda_parrots(CudaContext& ctx,\n                                            const SSElement& attr,\n                                            const OperatorBase::in_list_t& ins,\n                                            OperatorBase::out_list_t& outs) {\n  auto xyz1 = buildATensor(ctx, ins[0]);\n  auto xyz2 = buildATensor(ctx, ins[1]);\n  auto idx1 = buildATensor(ctx, ins[2]);\n  auto idx2 = buildATensor(ctx, ins[3]);\n  auto graddist1 = buildATensor(ctx, ins[4]);\n  auto graddist2 = buildATensor(ctx, ins[5]);\n  auto gradxyz1 = buildATensor(ctx, outs[0]);\n  auto gradxyz2 = buildATensor(ctx, outs[1]);\n  chamfer_distance_backward(xyz1, xyz2, idx1, idx2, graddist1, graddist2,\n                            gradxyz1, gradxyz2);\n}\n\nPARROTS_EXTENSION_REGISTER(chamfer_distance_forward)\n    .input(2)\n    .output(4)\n    .apply(chamfer_distance_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(chamfer_distance_backward)\n    .input(6)\n    .output(2)\n    .apply(chamfer_distance_backward_cuda_parrots)\n    .done();\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/chamfer_distance_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ACTIVE_CHAMFER_DISTANCE_PYTORCH_H\n#define ACTIVE_CHAMFER_DISTANCE_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,\n                              const Tensor dist1, const Tensor dist2,\n                              const Tensor idx1, const Tensor idx2);\n\nvoid chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,\n                               Tensor idx1, Tensor idx2, Tensor graddist1,\n                               Tensor graddist2, Tensor gradxyz1,\n                               Tensor gradxyz2);\n\n#endif  // ACTIVE_CHAMFER_DISTANCE_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/contour_expand.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// It is modified from https://github.com/whai362/PSENet\n#include <iostream>\n#include <queue>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nusing namespace std;\n\nclass Point2d {\n public:\n  int x;\n  int y;\n\n  Point2d() : x(0), y(0) {}\n  Point2d(int _x, int _y) : x(_x), y(_y) {}\n};\n\nvoid kernel_dilate(const uint8_t *data, IntArrayRef data_shape,\n                   const int *label_map, int &label_num, int &min_area,\n                   vector<vector<int>> &text_line) {\n  std::vector<int> area(label_num + 1);\n  int kernel_num = data_shape[0];\n  int height = data_shape[1];\n  int width = data_shape[2];\n\n  for (int x = 0; x < height; ++x) {\n    for (int y = 0; y < width; ++y) {\n      int label = label_map[x * width + y];\n      if (label == 0) continue;\n      area[label] += 1;\n    }\n  }\n\n  queue<Point2d> queue, next_queue;\n  for (int x = 0; x < height; ++x) {\n    vector<int> row(width);\n    for (int y = 0; y < width; ++y) {\n      int label = label_map[x * width + y];\n      if (label == 0) continue;\n      if (area[label] < min_area) continue;\n\n      Point2d point(x, y);\n      queue.push(point);\n      row[y] = label;\n    }\n    text_line.emplace_back(row);\n  }\n\n  int dx[] = {-1, 1, 0, 0};\n  int dy[] = {0, 0, -1, 1};\n  vector<int> kernel_step(kernel_num);\n  std::for_each(kernel_step.begin(), kernel_step.end(),\n                [=](int &k) { return k * height * width; });\n\n  for (int kernel_id = kernel_num - 2; kernel_id >= 0; --kernel_id) {\n    while (!queue.empty()) {\n      Point2d point = queue.front();\n      queue.pop();\n      int x = point.x;\n      int y = point.y;\n      int label = text_line[x][y];\n\n      bool is_edge = true;\n      for (int d = 0; d < 4; ++d) {\n        int tmp_x = x + dx[d];\n        int tmp_y = y + dy[d];\n\n        if (tmp_x < 0 || tmp_x >= height) continue;\n        if (tmp_y < 0 || tmp_y >= width) continue;\n        int kernel_value 
= data[kernel_step[kernel_id] + tmp_x * width + tmp_y];\n        if (kernel_value == 0) continue;\n        if (text_line[tmp_x][tmp_y] > 0) continue;\n\n        Point2d point(tmp_x, tmp_y);\n        queue.push(point);\n        text_line[tmp_x][tmp_y] = label;\n        is_edge = false;\n      }\n\n      if (is_edge) {\n        next_queue.push(point);\n      }\n    }\n    swap(queue, next_queue);\n  }\n}\n\nstd::vector<std::vector<int>> contour_expand(Tensor kernel_mask,\n                                             Tensor internal_kernel_label,\n                                             int min_kernel_area,\n                                             int kernel_num) {\n  kernel_mask = kernel_mask.contiguous();\n  internal_kernel_label = internal_kernel_label.contiguous();\n  assert(kernel_mask.dim() == 3);\n  assert(internal_kernel_label.dim() == 2);\n  assert(kernel_mask.size(1) == internal_kernel_label.size(0));\n  assert(kernel_mask.size(2) == internal_kernel_label.size(1));\n  CHECK_CPU_INPUT(kernel_mask);\n  CHECK_CPU_INPUT(internal_kernel_label);\n  auto ptr_data = kernel_mask.data_ptr<uint8_t>();\n  IntArrayRef data_shape = kernel_mask.sizes();\n\n  auto data_label_map = internal_kernel_label.data_ptr<int32_t>();\n  vector<vector<int>> text_line;\n\n  kernel_dilate(ptr_data, data_shape, data_label_map, kernel_num,\n                min_kernel_area, text_line);\n\n  return text_line;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/contour_expand_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"contour_expand_pytorch.h\"\n\nusing namespace parrots;\nusing namespace std;\n\ntemplate <typename T>\nvoid contour_expand_parrots(T& ctx, const SSElement& attr,\n                            const OperatorBase::in_list_t& ins,\n                            OperatorBase::out_list_t& outs) {\n  int min_kernel_area, kernel_num;\n  SSAttrs(attr)\n      .get<int>(\"min_kernel_area\", min_kernel_area)\n      .get<int>(\"kernel_num\", kernel_num)\n      .done();\n  at::Tensor kernel_mask;\n  at::Tensor internal_kernel_label;\n  kernel_mask = buildATensor(ctx, ins[0]);\n  internal_kernel_label = buildATensor(ctx, ins[1]);\n  auto out = contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,\n                            kernel_num);\n  int n = out.size(), m = 0;\n  for (int i = 0; i < n; ++i)\n    if (m < out[i].size()) m = out[i].size();\n  auto options = torch::TensorOptions().dtype(at::kInt);\n  auto tensor = torch::zeros({n, m}, options);\n  for (int i = 0; i < n; i++)\n    tensor.slice(0, i, i + 1) =\n        torch::from_blob(out[i].data(), {out[i].size()}, options);\n  updateDArray(ctx, tensor, outs[0]);\n}\n\nPARROTS_EXTENSION_REGISTER(contour_expand)\n    .attr(\"min_kernel_area\")\n    .attr(\"kernel_num\")\n    .input(2)\n    .output(1)\n    .apply(contour_expand_parrots<HostContext>)\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/contour_expand_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CONTOUR_EXPAND_PYTORCH_H\n#define CONTOUR_EXPAND_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nstd::vector<std::vector<int>> contour_expand(Tensor kernel_mask,\n                                             Tensor internal_kernel_label,\n                                             int min_kernel_area,\n                                             int kernel_num);\n\n#endif  // CONTOUR_EXPAND_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/convex_iou.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/tree/main/mmdet/ops/iou/src\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid convex_iou_impl(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious) {\n  DISPATCH_DEVICE_IMPL(convex_iou_impl, pointsets, polygons, ious);\n}\n\nvoid convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious) {\n  convex_iou_impl(pointsets, polygons, ious);\n}\n\nvoid convex_giou_impl(const Tensor pointsets, const Tensor polygons,\n                      Tensor output) {\n  DISPATCH_DEVICE_IMPL(convex_giou_impl, pointsets, polygons, output);\n}\n\nvoid convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output) {\n  convex_giou_impl(pointsets, polygons, output);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/convex_iou_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"convex_iou_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid convex_iou_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                     const OperatorBase::in_list_t& ins,\n                                     OperatorBase::out_list_t& outs) {\n  auto pointsets = buildATensor(ctx, ins[0]);\n  auto polygons = buildATensor(ctx, ins[1]);\n  auto ious = buildATensor(ctx, outs[0]);\n  convex_iou(pointsets, polygons, ious);\n}\n\nvoid convex_giou_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                      const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  auto pointsets = buildATensor(ctx, ins[0]);\n  auto polygons = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  convex_giou(pointsets, polygons, output);\n}\n\nPARROTS_EXTENSION_REGISTER(convex_iou)\n    .input(2)\n    .output(1)\n    .apply(convex_iou_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(convex_giou)\n    .input(2)\n    .output(1)\n    .apply(convex_giou_forward_cuda_parrots)\n    .done();\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/convex_iou_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CONVEX_IOU_PYTORCH_H\n#define CONVEX_IOU_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious);\n\nvoid convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output);\n\n#endif  // CONVEX_IOU_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/correlation.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include <iostream>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW) {\n  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,\n                       patchH, patchW, padH, padW, dilationH, dilationW,\n                       dilation_patchH, dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW) {\n  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,\n                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,\n                       padW, dilationH, dilationW, dilation_patchH,\n                       dilation_patchW, dH, dW);\n}\n\nvoid correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,\n                         int kW, int patchH, int patchW, int padH, int padW,\n                         int dilationH, int dilationW, int dilation_patchH,\n                         int dilation_patchW, int dH, int dW) {\n  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,\n                           padW, dilationH, dilationW, dilation_patchH,\n                           dilation_patchW, dH, dW);\n}\n\nvoid 
correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,\n                          Tensor grad_input1, Tensor grad_input2, int kH,\n                          int kW, int patchH, int patchW, int padH, int padW,\n                          int dilationH, int dilationW, int dilation_patchH,\n                          int dilation_patchW, int dH, int dW) {\n  correlation_backward_impl(grad_output, input1, input2, grad_input1,\n                            grad_input2, kH, kW, patchH, patchW, padH, padW,\n                            dilationH, dilationW, dilation_patchH,\n                            dilation_patchW, dH, dW);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/correlation_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"correlation_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid correlation_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                      const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  int kH, kW, patchH, patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW;\n  SSAttrs(attr)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"patchH\", patchH)\n      .get<int>(\"patchW\", patchW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilation_patchH\", dilation_patchH)\n      .get<int>(\"dilation_patchW\", dilation_patchW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"dW\", dW)\n      .done();\n\n  auto input1 = buildATensor(ctx, ins[0]);\n  auto input2 = buildATensor(ctx, ins[1]);\n\n  auto output = buildATensor(ctx, outs[0]);\n\n  correlation_forward(input1, input2, output, kH, kW, patchH, patchW, padH,\n                      padW, dilationH, dilationW, dilation_patchH,\n                      dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                       const OperatorBase::in_list_t& ins,\n                                       OperatorBase::out_list_t& outs) {\n  int kH, kW, patchH, patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW;\n  SSAttrs(attr)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"patchH\", patchH)\n      .get<int>(\"patchW\", patchW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"padW\", padW)\n     
 .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilation_patchH\", dilation_patchH)\n      .get<int>(\"dilation_patchW\", dilation_patchW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"dW\", dW)\n      .done();\n\n  auto grad_output = buildATensor(ctx, ins[0]);\n  auto input1 = buildATensor(ctx, ins[1]);\n  auto input2 = buildATensor(ctx, ins[2]);\n\n  auto grad_input1 = buildATensor(ctx, outs[0]);\n  auto grad_input2 = buildATensor(ctx, outs[1]);\n\n  correlation_backward(grad_output, input1, input2, grad_input1, grad_input2,\n                       kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,\n                       dilation_patchH, dilation_patchW, dH, dW);\n}\n#endif\n\nvoid correlation_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,\n                                     const OperatorBase::in_list_t& ins,\n                                     OperatorBase::out_list_t& outs) {\n  int kH, kW, patchH, patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW;\n  SSAttrs(attr)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"patchH\", patchH)\n      .get<int>(\"patchW\", patchW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilation_patchH\", dilation_patchH)\n      .get<int>(\"dilation_patchW\", dilation_patchW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"dW\", dW)\n      .done();\n\n  auto input1 = buildATensor(ctx, ins[0]);\n  auto input2 = buildATensor(ctx, ins[1]);\n\n  auto output = buildATensor(ctx, outs[0]);\n\n  correlation_forward(input1, input2, output, kH, kW, patchH, patchW, padH,\n                      padW, dilationH, dilationW, dilation_patchH,\n                      dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_cpu_parrots(HostContext& ctx, const SSElement& attr,\n         
                             const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  int kH, kW, patchH, patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW;\n  SSAttrs(attr)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"patchH\", patchH)\n      .get<int>(\"patchW\", patchW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilation_patchH\", dilation_patchH)\n      .get<int>(\"dilation_patchW\", dilation_patchW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"dW\", dW)\n      .done();\n\n  auto grad_output = buildATensor(ctx, ins[0]);\n  auto input1 = buildATensor(ctx, ins[1]);\n  auto input2 = buildATensor(ctx, ins[2]);\n\n  auto grad_input1 = buildATensor(ctx, outs[0]);\n  auto grad_input2 = buildATensor(ctx, outs[1]);\n\n  correlation_backward(grad_output, input1, input2, grad_input1, grad_input2,\n                       kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,\n                       dilation_patchH, dilation_patchW, dH, dW);\n}\n\nPARROTS_EXTENSION_REGISTER(correlation_forward)\n    .attr(\"kH\")\n    .attr(\"kW\")\n    .attr(\"patchH\")\n    .attr(\"patchW\")\n    .attr(\"padH\")\n    .attr(\"padW\")\n    .attr(\"dilationH\")\n    .attr(\"dilationW\")\n    .attr(\"dilation_patchH\")\n    .attr(\"dilation_patchW\")\n    .attr(\"dH\")\n    .attr(\"dW\")\n    .input(2)\n    .output(1)\n    .apply(correlation_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(correlation_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(correlation_backward)\n    .attr(\"kH\")\n    .attr(\"kW\")\n    .attr(\"patchH\")\n    .attr(\"patchW\")\n    .attr(\"padH\")\n    .attr(\"padW\")\n    .attr(\"dilationH\")\n    .attr(\"dilationW\")\n    .attr(\"dilation_patchH\")\n    
.attr(\"dilation_patchW\")\n    .attr(\"dH\")\n    .attr(\"dW\")\n    .input(3)\n    .output(2)\n    .apply(correlation_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(correlation_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/correlation_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef CORRELATION_PYTORCH_H\n#define CORRELATION_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,\n                         int kW, int patchH, int patchW, int padH, int padW,\n                         int dilationH, int dilationW, int dilation_patchH,\n                         int dilation_patchW, int dH, int dW);\n\nvoid correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,\n                          Tensor grad_input1, Tensor grad_input2, int kH,\n                          int kW, int patchH, int patchW, int padH, int padW,\n                          int dilationH, int dilationW, int dilation_patchH,\n                          int dilation_patchW, int dH, int dW);\n\n#endif  // CORRELATION_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/cudabind.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid AssignScoreWithKForwardCUDAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& points, const Tensor& centers, const Tensor& scores,\n    const Tensor& knn_idx, Tensor& output);\n\nvoid AssignScoreWithKBackwardCUDAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores);\n\nvoid assign_score_withk_forward_cuda(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor& points,\n                                     const Tensor& centers,\n                                     const Tensor& scores,\n                                     const Tensor& knn_idx, Tensor& output) {\n  AssignScoreWithKForwardCUDAKernelLauncher(\n      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);\n};\n\nvoid assign_score_withk_backward_cuda(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores) {\n  AssignScoreWithKBackwardCUDAKernelLauncher(\n      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,\n      grad_points, grad_centers, grad_scores);\n};\n\nvoid assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor& points,\n                                     const Tensor& centers,\n                                     const Tensor& scores,\n                                     const Tensor& knn_idx, Tensor& output);\n\nvoid 
assign_score_withk_backward_impl(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores);\n\nREGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, CUDA,\n                     assign_score_withk_forward_cuda);\nREGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, CUDA,\n                     assign_score_withk_backward_cuda);\n\nvoid BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,\n                                        float max_radius, int nsample,\n                                        const Tensor new_xyz, const Tensor xyz,\n                                        Tensor idx);\n\nvoid ball_query_forward_cuda(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx) {\n  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,\n                                     new_xyz, xyz, idx);\n};\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx);\nREGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda);\n\nvoid BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,\n                                    Tensor ious, const int mode,\n                                    const bool aligned, const int offset);\n\nvoid bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset) {\n  BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, 
offset);\n}\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\nREGISTER_DEVICE_IMPL(bbox_overlaps_impl, CUDA, bbox_overlaps_cuda);\n\nvoid BorderAlignForwardCUDAKernelLauncher(const Tensor& input,\n                                          const Tensor& boxes, Tensor output,\n                                          Tensor argmax_idx,\n                                          const int pool_size);\n\nvoid BorderAlignBackwardCUDAKernelLauncher(const Tensor& grad_output,\n                                           const Tensor& boxes,\n                                           const Tensor& argmax_idx,\n                                           Tensor grad_input,\n                                           const int pool_size);\n\nvoid border_align_forward_cuda(const Tensor& input, const Tensor& boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size) {\n  BorderAlignForwardCUDAKernelLauncher(input, boxes, output, argmax_idx,\n                                       pool_size);\n}\n\nvoid border_align_backward_cuda(const Tensor& grad_output, const Tensor& boxes,\n                                const Tensor& argmax_idx, Tensor grad_input,\n                                const int pool_size) {\n  BorderAlignBackwardCUDAKernelLauncher(grad_output, boxes, argmax_idx,\n                                        grad_input, pool_size);\n}\n\nvoid border_align_forward_impl(const Tensor& input, const Tensor& boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size);\n\nvoid border_align_backward_impl(const Tensor& grad_output, const Tensor& boxes,\n                                const Tensor& argmax_idx, Tensor grad_input,\n                                const int 
pool_size);\n\nREGISTER_DEVICE_IMPL(border_align_forward_impl, CUDA,\n                     border_align_forward_cuda);\nREGISTER_DEVICE_IMPL(border_align_backward_impl, CUDA,\n                     border_align_backward_cuda);\n\nvoid box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_rotated_impl, CUDA, box_iou_rotated_cuda);\n\nvoid CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,\n                                     Tensor rfeatures, Tensor routput,\n                                     Tensor rmasks, Tensor output,\n                                     const int kernel_size,\n                                     const int group_size,\n                                     const int scale_factor);\n\nvoid CARAFEBackwardCUDAKernelLauncher(\n    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,\n    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,\n    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,\n    const int kernel_size, const int group_size, const int scale_factor);\n\nvoid carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor) {\n  CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,\n                                  output, kernel_size, group_size,\n                                  scale_factor);\n}\n\nvoid carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                         
 Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor) {\n  CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,\n                                   rbottom_grad_hs, rbottom_grad, rmask_grad,\n                                   bottom_grad, mask_grad, kernel_size,\n                                   group_size, scale_factor);\n}\n\nvoid carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_forward_impl, CUDA, carafe_forward_cuda);\nREGISTER_DEVICE_IMPL(carafe_backward_impl, CUDA, carafe_backward_cuda);\n\nvoid CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,\n                                          const Tensor masks, Tensor output,\n                                          const int kernel_size,\n                                          const int group_size,\n                                          const int scale_factor);\n\nvoid CARAFENAIVEBackwardCUDAKernelLauncher(\n    const Tensor top_grad, const Tensor features, const Tensor masks,\n    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,\n    const int group_size, const int scale_factor);\n\nvoid carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor) {\n  CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, 
kernel_size,\n                                       group_size, scale_factor);\n}\n\nvoid carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor) {\n  CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,\n                                        mask_grad, kernel_size, group_size,\n                                        scale_factor);\n}\nvoid carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor);\n\nvoid carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_naive_forward_impl, CUDA,\n                     carafe_naive_forward_cuda);\nREGISTER_DEVICE_IMPL(carafe_naive_backward_impl, CUDA,\n                     carafe_naive_backward_cuda);\n\nvoid CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,\n                                          Tensor output, int kH, int kW,\n                                          int patchH, int patchW, int padH,\n                                          int padW, int dilationH,\n                                          int dilationW, int dilation_patchH,\n                                          int dilation_patchW, int dH, int dW);\n\nvoid CorrelationBackwardCUDAKernelLauncher(Tensor grad_output, Tensor input1,\n                                           Tensor input2, Tensor grad_input1,\n                                           Tensor grad_input2, int kH, int kW,\n                                           int 
patchH, int patchW, int padH,\n                                           int padW, int dilationH,\n                                           int dilationW, int dilation_patchH,\n                                           int dilation_patchW, int dH, int dW);\n\nvoid correlation_forward_cuda(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW) {\n  CorrelationForwardCUDAKernelLauncher(\n      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,\n      dilationW, dilation_patchH, dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_cuda(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW) {\n  CorrelationBackwardCUDAKernelLauncher(\n      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,\n      patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW);\n}\n\nvoid correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW);\n\nvoid correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               
int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW);\n\nREGISTER_DEVICE_IMPL(correlation_forward_impl, CUDA, correlation_forward_cuda);\nREGISTER_DEVICE_IMPL(correlation_backward_impl, CUDA,\n                     correlation_backward_cuda);\n\nvoid deformable_im2col_cuda(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_cuda(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_cuda(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nvoid deformable_im2col_impl(Tensor data_im, Tensor data_offset,\n  
                          const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_impl(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_impl(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nREGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);\nREGISTER_DEVICE_IMPL(deformable_col2im_impl, CUDA, deformable_col2im_cuda);\nREGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CUDA,\n                     deformable_col2im_coord_cuda);\n\nvoid DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                            Tensor offset, Tensor output,\n                                            int pooled_height, int pooled_width,\n                                            float 
spatial_scale,\n                                            int sampling_ratio, float gamma);\n\nvoid DeformRoIPoolBackwardCUDAKernelLauncher(\n    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,\n    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,\n    float spatial_scale, int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,\n                                         pooled_height, pooled_width,\n                                         spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma) {\n  DeformRoIPoolBackwardCUDAKernelLauncher(\n      grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,\n      pooled_width, spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                            
       int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma);\n\nREGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, CUDA,\n                     deform_roi_pool_forward_cuda);\nREGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, CUDA,\n                     deform_roi_pool_backward_cuda);\n\nvoid SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha);\n\nvoid SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,\n                                                Tensor weight,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha);\n\nvoid SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha);\n\nvoid SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,\n                                                Tensor weight, Tensor buff,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha);\n\nvoid sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,\n                                            gamma, 
alpha);\n}\n\nvoid sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha) {\n  SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,\n                                             gamma, alpha);\n}\n\nvoid softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,\n                                            gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward_cuda(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha) {\n  SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,\n                                             grad_input, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha);\n\nvoid softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid softmax_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha);\n\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA,\n                     
sigmoid_focal_loss_forward_cuda);\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, CUDA,\n                     sigmoid_focal_loss_backward_cuda);\nREGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, CUDA,\n                     softmax_focal_loss_forward_cuda);\nREGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, CUDA,\n                     softmax_focal_loss_backward_cuda);\n\nvoid FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,\n                                                    const float* dataset,\n                                                    float* temp, int* idxs);\n\nvoid FurthestPointSamplingWithDistForwardCUDAKernelLauncher(\n    int b, int n, int m, const float* dataset, float* temp, int* idxs);\n\nvoid furthest_point_sampling_forward_cuda(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m) {\n  const float* dataset = points_tensor.data_ptr<float>();\n  float* temp = temp_tensor.data_ptr<float>();\n  int* idxs = idx_tensor.data_ptr<int>();\n  FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);\n}\n\nvoid furthest_point_sampling_with_dist_forward_cuda(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m) {\n  const float* dataset = points_tensor.data_ptr<float>();\n  float* temp = temp_tensor.data_ptr<float>();\n  int* idxs = idx_tensor.data_ptr<int>();\n  FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp,\n                                                         idxs);\n}\n\nvoid furthest_point_sampling_forward_impl(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int 
b, int n, int m);\n\nvoid furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m);\n\nREGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA,\n                     furthest_point_sampling_forward_cuda);\nREGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, CUDA,\n                     furthest_point_sampling_with_dist_forward_cuda);\n\ntorch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,\n                                      const torch::Tensor& bias,\n                                      const torch::Tensor& refer, int act,\n                                      int grad, float alpha, float scale);\n\ntorch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,\n                                           const torch::Tensor& bias,\n                                           const torch::Tensor& refer, int act,\n                                           int grad, float alpha, float scale);\nREGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,\n                     fused_bias_leakyrelu_op);\n\nvoid GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                           const Tensor points,\n                                           const Tensor idx, Tensor out);\n\nvoid GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                            const Tensor grad_out,\n                                            const Tensor idx,\n                                            Tensor grad_points);\n\nvoid gather_points_forward_cuda(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out) {\n  
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);\n};\n\nvoid gather_points_backward_cuda(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points) {\n  GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,\n                                         grad_points);\n};\n\nvoid gather_points_forward_impl(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out);\n\nvoid gather_points_backward_impl(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points);\n\nREGISTER_DEVICE_IMPL(gather_points_forward_impl, CUDA,\n                     gather_points_forward_cuda);\nREGISTER_DEVICE_IMPL(gather_points_backward_impl, CUDA,\n                     gather_points_backward_cuda);\n\nvoid GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                          int nsample, const Tensor points,\n                                          const Tensor idx, Tensor out);\n\nvoid GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                           int nsample, const Tensor grad_out,\n                                           const Tensor idx,\n                                           Tensor grad_points);\n\nvoid group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out) {\n  GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,\n                                       out);\n};\n\nvoid group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,\n                                const Tensor 
grad_out, const Tensor idx,\n                                Tensor grad_points) {\n  GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,\n                                        idx, grad_points);\n};\n\nvoid group_points_forward_impl(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out);\n\nvoid group_points_backward_impl(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points);\n\nREGISTER_DEVICE_IMPL(group_points_forward_impl, CUDA,\n                     group_points_forward_cuda);\nREGISTER_DEVICE_IMPL(group_points_backward_impl, CUDA,\n                     group_points_backward_cuda);\n\nvoid KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,\n                                  const Tensor xyz, const Tensor new_xyz,\n                                  Tensor idx, Tensor dist2);\n\nvoid knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2) {\n  KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);\n}\n\nvoid knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2);\nREGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);\n\nvoid MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int kernel_h,\n                                           const int kernel_w, const int pad_h,\n                                           const int pad_w);\n\nvoid MaskedCol2imForwardCUDAKernelLauncher(const Tensor 
bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int height,\n                                           const int width, const int channels);\n\nvoid masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,\n                                        kernel_h, kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,\n                                        width, channels);\n}\n\nvoid masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w);\n\nvoid masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels);\n\nREGISTER_DEVICE_IMPL(masked_im2col_forward_impl, CUDA,\n                     masked_im2col_forward_cuda);\nREGISTER_DEVICE_IMPL(masked_col2im_forward_impl, 
CUDA,\n                     masked_col2im_forward_cuda);\n\nvoid modulated_deformable_im2col_cuda(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col);\n\nvoid modulated_deformable_col2im_cuda(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_cuda(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nvoid modulated_deformable_im2col_impl(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor 
data_col);\n\nvoid modulated_deformable_col2im_impl(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_impl(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nREGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CUDA,\n                     modulated_deformable_im2col_cuda);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CUDA,\n                     modulated_deformable_col2im_cuda);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CUDA,\n                     modulated_deformable_col2im_coord_cuda);\n\nTensor ms_deform_attn_cuda_forward(const Tensor& value,\n                                   const Tensor& spatial_shapes,\n                                   const Tensor& level_start_index,\n                                   const Tensor& sampling_loc,\n                                   const Tensor& attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_cuda_backward(\n    const Tensor& value, const Tensor& spatial_shapes,\n    const Tensor& level_start_index, const Tensor& sampling_loc,\n    const Tensor& attn_weight, const Tensor& grad_output, Tensor& 
grad_value,\n    Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step);\n\nTensor ms_deform_attn_impl_forward(const Tensor& value,\n                                   const Tensor& spatial_shapes,\n                                   const Tensor& level_start_index,\n                                   const Tensor& sampling_loc,\n                                   const Tensor& attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor& value, const Tensor& spatial_shapes,\n    const Tensor& level_start_index, const Tensor& sampling_loc,\n    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,\n    Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step);\n\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, CUDA,\n                     ms_deform_attn_cuda_forward);\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, CUDA,\n                     ms_deform_attn_cuda_backward);\n\nTensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,\n                             int offset);\n\nTensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);\n}\n\nTensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);\nREGISTER_DEVICE_IMPL(nms_impl, CUDA, nms_cuda);\n\nvoid PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,\n                                                int pts_num, const Tensor boxes,\n                                                const Tensor pts,\n                                                Tensor box_idx_of_points);\n\nvoid PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,\n                                               int pts_num, const Tensor boxes,\n                                               const Tensor pts,\n                                 
              Tensor box_idx_of_points);\n\nvoid points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points) {\n  PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,\n                                             boxes, pts, box_idx_of_points);\n};\n\nvoid points_in_boxes_all_forward_cuda(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points) {\n  PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,\n                                            boxes, pts, box_idx_of_points);\n};\n\nvoid points_in_boxes_part_forward_impl(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points);\n\nvoid points_in_boxes_all_forward_impl(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points);\nREGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, CUDA,\n                     points_in_boxes_part_forward_cuda);\nREGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, CUDA,\n                     points_in_boxes_all_forward_cuda);\n\nvoid PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,\n                                      Tensor output, const int num_,\n                                      const int h_feature, const int w_feature,\n                                      const int h_mask, const int w_mask,\n               
                       const int half_h_mask,\n                                      const int half_w_mask);\n\nvoid PSAMaskBackwardCUDAKernelLauncher(\n    const int psa_type, const Tensor grad_output, Tensor grad_input,\n    const int num_, const int h_feature, const int w_feature, const int h_mask,\n    const int w_mask, const int half_h_mask, const int half_w_mask);\n\nvoid psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask) {\n  PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,\n                                   w_feature, h_mask, w_mask, half_h_mask,\n                                   half_w_mask);\n}\n\nvoid psamask_backward_cuda(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask) {\n  PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,\n                                    h_feature, w_feature, h_mask, w_mask,\n                                    half_h_mask, half_w_mask);\n}\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int 
h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask);\nREGISTER_DEVICE_IMPL(psamask_forward_impl, CUDA, psamask_forward_cuda);\nREGISTER_DEVICE_IMPL(psamask_backward_impl, CUDA, psamask_backward_cuda);\n\nvoid ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned);\n\nvoid ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                        Tensor argmax_y, Tensor argmax_x,\n                                        Tensor grad_input, int aligned_height,\n                                        int aligned_width, float spatial_scale,\n                                        int sampling_ratio, int pool_mode,\n                                        bool aligned);\n\nvoid roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignForwardCUDAKernelLauncher(\n      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,\n      spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, 
bool aligned) {\n  ROIAlignBackwardCUDAKernelLauncher(\n      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,\n      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);\nREGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);\n\nvoid ROIAlignRotatedForwardCUDAKernelLauncher(\n    const at::Tensor input, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor output);\n\nvoid ROIAlignRotatedBackwardCUDAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);\n\nvoid roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int 
sampling_ratio,\n                                    bool aligned, bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n\n  int num_channels = input.size(1);\n  int data_height = input.size(2);\n  int data_width = input.size(3);\n  ROIAlignRotatedForwardCUDAKernelLauncher(\n      input, rois, spatial_scale, sampling_ratio, aligned, clockwise,\n      num_channels, data_height, data_width, num_rois, aligned_height,\n      aligned_width, output);\n}\n\nvoid roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n\n  int num_channels = bottom_grad.size(1);\n  int data_height = bottom_grad.size(2);\n  int data_width = bottom_grad.size(3);\n  ROIAlignRotatedBackwardCUDAKernelLauncher(\n      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,\n      num_channels, data_height, data_width, num_rois, aligned_height,\n      aligned_width, bottom_grad);\n}\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                    
 int sampling_ratio, bool aligned,\n                                     bool clockwise);\nREGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,\n                     roi_align_rotated_forward_cuda);\nREGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,\n                     roi_align_rotated_backward_cuda);\n\nvoid RiROIAlignRotatedForwardCUDAKernelLauncher(\n    const at::Tensor features, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor output);\n\nvoid RiROIAlignRotatedBackwardCUDAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor bottom_grad);\n\nvoid riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int num_orientations,\n                                      bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  CHECK_CONTIGUOUS(features);\n  CHECK_CONTIGUOUS(rois);\n  int num_channels = features.size(1) / num_orientations;\n  int data_height = features.size(2);\n  int data_width = features.size(3);\n  RiROIAlignRotatedForwardCUDAKernelLauncher(\n      features, rois, spatial_scale, num_samples, clockwise, num_channels,\n      data_height, data_width, num_rois, pooled_height, pooled_width,\n      num_orientations, 
output);\n}\n\nvoid riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  CHECK_CONTIGUOUS(top_grad);\n  CHECK_CONTIGUOUS(rois);\n  int num_channels = bottom_grad.size(1) / num_orientations;\n  int data_height = bottom_grad.size(2);\n  int data_width = bottom_grad.size(3);\n  RiROIAlignRotatedBackwardCUDAKernelLauncher(\n      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,\n      data_height, data_width, num_rois, pooled_height, pooled_width,\n      num_orientations, bottom_grad);\n}\n\nvoid riroi_align_rotated_forward_impl(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int num_orientations,\n                                      bool clockwise);\n\nvoid riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise);\n\nREGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,\n                     riroi_align_rotated_forward_cuda);\nREGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,\n                     riroi_align_rotated_backward_cuda);\n\nvoid RoiawarePool3dForwardCUDAKernelLauncher(\n    int 
boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,\n    int out_y, int out_z, const Tensor rois, const Tensor pts,\n    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,\n    Tensor pooled_features, int pool_method);\n\nvoid RoiawarePool3dBackwardCUDAKernelLauncher(\n    int boxes_num, int out_x, int out_y, int out_z, int channels,\n    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,\n    const Tensor grad_out, Tensor grad_in, int pool_method);\n\nvoid roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method) {\n  RoiawarePool3dForwardCUDAKernelLauncher(\n      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,\n      rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features,\n      pool_method);\n};\n\nvoid roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method) {\n  RoiawarePool3dBackwardCUDAKernelLauncher(\n      boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,\n      pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method);\n};\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int 
out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method);\n\nvoid roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method);\n\nREGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, CUDA,\n                     roiaware_pool3d_forward_cuda);\nREGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, CUDA,\n                     roiaware_pool3d_backward_cuda);\n\nvoid RoIPointPool3dForwardCUDAKernelLauncher(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,\n    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);\n\nvoid roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag) {\n  RoIPointPool3dForwardCUDAKernelLauncher(\n      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,\n      boxes3d, pts_feature, pooled_features, pooled_empty_flag);\n};\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor 
boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag);\nREGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, CUDA,\n                     roipoint_pool3d_forward_cuda);\n\nvoid ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                      Tensor argmax, int pooled_height,\n                                      int pooled_width, float spatial_scale);\n\nvoid ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                       Tensor argmax, Tensor grad_input,\n                                       int pooled_height, int pooled_width,\n                                       float spatial_scale);\n\nvoid roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale) {\n  ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,\n                                   pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale) {\n  ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,\n                                    pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale);\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float 
spatial_scale);\nREGISTER_DEVICE_IMPL(roi_pool_forward_impl, CUDA, roi_pool_forward_cuda);\nREGISTER_DEVICE_IMPL(roi_pool_backward_impl, CUDA, roi_pool_backward_cuda);\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\n\nstd::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(\n    const at::Tensor& feats, const at::Tensor& coors,\n    const reduce_t reduce_type);\n\nvoid DynamicPointToVoxelBackwardCUDAKernelLauncher(\n    at::Tensor& grad_feats, const at::Tensor& grad_reduced_feats,\n    const at::Tensor& feats, const at::Tensor& reduced_feats,\n    const at::Tensor& coors_map, const at::Tensor& reduce_count,\n    const reduce_t reduce_type);\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_cuda(\n    const torch::Tensor& feats, const torch::Tensor& coors,\n    const reduce_t reduce_type) {\n  return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors,\n                                                      reduce_type);\n};\n\nvoid dynamic_point_to_voxel_backward_cuda(\n    torch::Tensor& grad_feats, const torch::Tensor& grad_reduced_feats,\n    const torch::Tensor& feats, const torch::Tensor& reduced_feats,\n    const torch::Tensor& coors_idx, const torch::Tensor& reduce_count,\n    const reduce_t reduce_type) {\n  DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats,\n                                                feats, reduced_feats, coors_idx,\n                                                reduce_count, reduce_type);\n};\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(\n    const torch::Tensor& feats, const torch::Tensor& coors,\n    const reduce_t reduce_type);\n\nvoid dynamic_point_to_voxel_backward_impl(\n    torch::Tensor& grad_feats, const torch::Tensor& grad_reduced_feats,\n    const torch::Tensor& feats, const torch::Tensor& reduced_feats,\n    const torch::Tensor& coors_idx, const torch::Tensor& reduce_count,\n    const reduce_t 
reduce_type);\n\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA,\n                     dynamic_point_to_voxel_forward_cuda);\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA,\n                     dynamic_point_to_voxel_backward_cuda);\n\nvoid SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);\n\nvoid SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,\n                                        Tensor var);\n\nvoid SyncBNForwardOutputCUDAKernelLauncher(\n    const Tensor input, const Tensor mean, const Tensor var,\n    Tensor running_mean, Tensor running_var, const Tensor weight,\n    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,\n    float momentum, int group_size);\n\nvoid SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,\n                                           const Tensor norm,\n                                           Tensor grad_weight,\n                                           Tensor grad_bias);\n\nvoid SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,\n                                          const Tensor weight,\n                                          const Tensor grad_weight,\n                                          const Tensor grad_bias,\n                                          const Tensor norm, const Tensor std,\n                                          Tensor grad_input);\n\nvoid sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {\n  SyncBNForwardMeanCUDAKernelLauncher(input, mean);\n}\n\nvoid sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,\n                              Tensor var) {\n  SyncBNForwardVarCUDAKernelLauncher(input, mean, var);\n}\n\nvoid sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                           
      const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size) {\n  SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,\n                                        running_var, weight, bias, norm, std,\n                                        output, eps, momentum, group_size);\n}\n\nvoid sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias) {\n  SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,\n                                        grad_bias);\n}\n\nvoid sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input) {\n  SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,\n                                       grad_bias, norm, std, grad_input);\n}\n\nvoid sync_bn_forward_mean_impl(const Tensor input, Tensor mean);\n\nvoid sync_bn_forward_var_impl(const Tensor input, const Tensor mean,\n                              Tensor var);\n\nvoid sync_bn_forward_output_impl(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size);\n\nvoid sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias);\n\nvoid sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,\n                
                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input);\n\nREGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, CUDA,\n                     sync_bn_forward_mean_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, CUDA, sync_bn_forward_var_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, CUDA,\n                     sync_bn_forward_output_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, CUDA,\n                     sync_bn_backward_param_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, CUDA,\n                     sync_bn_backward_data_cuda);\n\nvoid ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,\n                                               const Tensor points,\n                                               const Tensor idx,\n                                               const Tensor weight, Tensor out);\n\nvoid ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,\n                                                const Tensor grad_out,\n                                                const Tensor idx,\n                                                const Tensor weight,\n                                                Tensor grad_points);\n\nvoid three_interpolate_forward_cuda(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out) {\n  ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight,\n                                            out);\n};\n\nvoid three_interpolate_backward_cuda(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points) {\n  ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, 
idx, weight,\n                                             grad_points);\n};\n\nvoid three_interpolate_forward_impl(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out);\n\nvoid three_interpolate_backward_impl(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points);\nREGISTER_DEVICE_IMPL(three_interpolate_forward_impl, CUDA,\n                     three_interpolate_forward_cuda);\nREGISTER_DEVICE_IMPL(three_interpolate_backward_impl, CUDA,\n                     three_interpolate_backward_cuda);\n\nvoid ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,\n                                      const Tensor known, Tensor dist2,\n                                      Tensor idx);\n\nvoid three_nn_forward_cuda(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx) {\n  ThreeNNForwardCUDAKernelLauncher(b, n, m, unknown, known, dist2, idx);\n};\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx);\nREGISTER_DEVICE_IMPL(three_nn_forward_impl, CUDA, three_nn_forward_cuda);\n\nvoid TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,\n                                       Tensor output);\n\nvoid TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,\n                                        Tensor grad_input);\n\nvoid tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) {\n  TINShiftForwardCUDAKernelLauncher(input, shift, output);\n}\n\nvoid tin_shift_backward_cuda(Tensor grad_output, Tensor shift,\n                             Tensor grad_input) {\n  TINShiftBackwardCUDAKernelLauncher(grad_output, shift, 
grad_input);\n}\n\nvoid tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);\nvoid tin_shift_backward_impl(Tensor grad_output, Tensor shift,\n                             Tensor grad_input);\nREGISTER_DEVICE_IMPL(tin_shift_forward_impl, CUDA, tin_shift_forward_cuda);\nREGISTER_DEVICE_IMPL(tin_shift_backward_impl, CUDA, tin_shift_backward_cuda);\n\ntorch::Tensor upfirdn2d_op(const torch::Tensor& input,\n                           const torch::Tensor& kernel, int up_x, int up_y,\n                           int down_x, int down_y, int pad_x0, int pad_x1,\n                           int pad_y0, int pad_y1);\n\ntorch::Tensor upfirdn2d_op_impl(const torch::Tensor& input,\n                                const torch::Tensor& kernel, int up_x, int up_y,\n                                int down_x, int down_y, int pad_x0, int pad_x1,\n                                int pad_y0, int pad_y1);\nREGISTER_DEVICE_IMPL(upfirdn2d_op_impl, CUDA, upfirdn2d_op);\n\nint HardVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,\n    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3);\n\nint NondeterministicHardVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,\n    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3);\n\nvoid DynamicVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor& points, at::Tensor& coors,\n    const std::vector<float> voxel_size, const std::vector<float> coors_range,\n    const int NDim = 3);\n\nint hard_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& voxels,\n                               at::Tensor& coors,\n                               at::Tensor& 
num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim) {\n  return HardVoxelizeForwardCUDAKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n};\n\nint nondeterministic_hard_voxelize_forward_cuda(\n    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,\n    at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim) {\n  return NondeterministicHardVoxelizeForwardCUDAKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n};\n\nvoid dynamic_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim) {\n  DynamicVoxelizeForwardCUDAKernelLauncher(points, coors, voxel_size,\n                                           coors_range, NDim);\n};\n\nint hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,\n                               at::Tensor& coors,\n                               at::Tensor& num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim);\n\nint nondeterministic_hard_voxelize_forward_impl(\n    const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,\n    at::Tensor& num_points_per_voxel, const 
std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim);\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim);\n\nREGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,\n                     hard_voxelize_forward_cuda);\nREGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,\n                     nondeterministic_hard_voxelize_forward_cuda);\nREGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,\n                     dynamic_voxelize_forward_cuda);\n\nvoid RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,\n                                                  const Tensor best_bboxes,\n                                                  const float spatial_scale,\n                                                  const int points,\n                                                  Tensor output);\n\nvoid RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,\n                                                   const Tensor best_bboxes,\n                                                   const float spatial_scale,\n                                                   const int points,\n                                                   Tensor bottom_grad);\n\nvoid rotated_feature_align_forward_cuda(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output) {\n  RotatedFeatureAlignForwardCUDAKernelLauncher(features, best_bboxes,\n                                               spatial_scale, points, output);\n};\n\nvoid 
rotated_feature_align_backward_cuda(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad) {\n  RotatedFeatureAlignBackwardCUDAKernelLauncher(\n      top_grad, best_bboxes, spatial_scale, points, bottom_grad);\n};\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output);\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad);\n\nREGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,\n                     rotated_feature_align_forward_cuda);\nREGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,\n                     rotated_feature_align_backward_cuda);\n\nvoid PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,\n                                               const at::Tensor polygons,\n                                               const int rows, const int cols,\n                                               at::Tensor output);\n\nvoid points_in_polygons_forward_cuda(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols) {\n  PointsInPolygonsForwardCUDAKernelLauncher(points, polygons, rows, cols,\n                                            output);\n};\n\nvoid points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int 
rows,\n                                     const int cols);\n\nREGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,\n                     points_in_polygons_forward_cuda);\n\nvoid MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);\n\nvoid min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {\n  MinAreaPolygonsCUDAKernelLauncher(pointsets, polygons);\n}\n\nvoid min_area_polygons_impl(const Tensor pointsets, Tensor polygons);\n\nREGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);\n\nvoid ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,\n                                                  const Tensor indices,\n                                                  Tensor output);\n\nvoid ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,\n                                                   const Tensor indices,\n                                                   Tensor grad_in);\n\nvoid active_rotated_filter_forward_cuda(const Tensor input,\n                                        const Tensor indices, Tensor output) {\n  ActiveRotatedFilterForwardCUDAKernelLauncher(input, indices, output);\n};\n\nvoid active_rotated_filter_backward_cuda(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in) {\n  ActiveRotatedFilterBackwardCUDAKernelLauncher(grad_out, indices, grad_in);\n};\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output);\n\nvoid active_rotated_filter_backward_impl(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in);\n\nREGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,\n                     active_rotated_filter_forward_cuda);\nREGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,\n                     active_rotated_filter_backward_cuda);\n\nvoid 
ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                 Tensor ious);\n\nvoid ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                  Tensor output);\n\nvoid convex_iou_cuda(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious) {\n  ConvexIoUCUDAKernelLauncher(pointsets, polygons, ious);\n}\n\nvoid convex_giou_cuda(const Tensor pointsets, const Tensor polygons,\n                      Tensor output) {\n  ConvexGIoUCUDAKernelLauncher(pointsets, polygons, output);\n}\n\nvoid convex_iou_impl(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious);\n\nvoid convex_giou_impl(const Tensor pointsets, const Tensor polygons,\n                      Tensor output);\n\nREGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);\nREGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);\n\nTensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,\n                                                    Tensor mask,\n                                                    Tensor num_valid);\n\nTensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid) {\n  return DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask,\n                                                      num_valid);\n}\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid);\n\nREGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,\n                     diff_iou_rotated_sort_vertices_forward_cuda);\n\nvoid ChamferDistanceForwardCUDAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,\n    const Tensor dist2, const Tensor idx1, const Tensor idx2);\n\nvoid ChamferDistanceBackwardCUDAKernelLauncher(\n    const 
Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,\n    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2);\n\nvoid chamfer_distance_forward_cuda(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2) {\n  ChamferDistanceForwardCUDAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,\n                                           idx2);\n};\n\nvoid chamfer_distance_backward_cuda(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2) {\n  ChamferDistanceBackwardCUDAKernelLauncher(xyz1, xyz2, idx1, idx2, graddist1,\n                                            graddist2, gradxyz1, gradxyz2);\n};\n\nvoid chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2);\n\nvoid chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2);\n\nREGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,\n                     chamfer_distance_forward_cuda);\nREGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,\n                     chamfer_distance_backward_cuda);\n\nvoid PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                        Tensor output, int pooled_height,\n                                        int pooled_width, float spatial_scale);\n\nvoid PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n          
                               Tensor grad_input, int pooled_height,\n                                         int pooled_width, float spatial_scale);\n\nvoid PrROIPoolCoorBackwardCUDAKernelLauncher(\n    Tensor output, Tensor grad_output, Tensor input, Tensor rois,\n    Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);\n\nvoid prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale) {\n  PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,\n                                     pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,\n                                      pooled_height, pooled_width,\n                                      spatial_scale);\n}\n\nvoid prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale) {\n  PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,\n                                          grad_rois, pooled_height,\n                                          pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale);\nvoid prroi_pool_backward_impl(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float 
spatial_scale);\nvoid prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale);\nREGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);\nREGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);\nREGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,\n                     prroi_pool_coor_backward_cuda);\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/deform_conv.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid deformable_im2col_impl(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col) {\n  DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,\n                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,\n                       stride_w, dilation_h, dilation_w, parallel_imgs,\n                       deformable_group, data_col);\n}\n\nvoid deformable_col2im_impl(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im) {\n  DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,\n                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,\n                       stride_w, dilation_h, dilation_w, parallel_imgs,\n                       deformable_group, grad_im);\n}\n\nvoid deformable_col2im_coord_impl(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int 
ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset) {\n  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,\n                       data_offset, channels, height, width, ksize_h, ksize_w,\n                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,\n                       parallel_imgs, deformable_group, grad_offset);\n}\n\nvoid deform_conv_shape_check(at::Tensor input, at::Tensor offset,\n                             at::Tensor *gradOutput, at::Tensor weight, int kH,\n                             int kW, int dH, int dW, int padH, int padW,\n                             int dilationH, int dilationW, int group,\n                             int deformable_group) {\n  TORCH_CHECK(\n      weight.ndimension() == 4,\n      \"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s\",\n      weight.ndimension());\n\n  TORCH_CHECK(weight.is_contiguous(), \"weight tensor has to be contiguous\");\n\n  TORCH_CHECK(kW > 0 && kH > 0,\n              \"kernel size should be greater than zero, but got kH: %d kW: %d\",\n              kH, kW);\n\n  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),\n              \"kernel size should be consistent with weight, \",\n              \"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d\",\n              kH, kW, weight.size(2), weight.size(3));\n\n  TORCH_CHECK(dW > 0 && dH > 0,\n              \"stride should be greater than zero, but got dH: %d dW: %d\", dH,\n              dW);\n\n  TORCH_CHECK(\n      dilationW > 0 && dilationH > 0,\n      \"dilation should be greater than 0, but got dilationH: %d dilationW: %d\",\n      dilationH, dilationW);\n\n  int ndim = input.ndimension();\n  int dimf = 0;\n  int dimh = 1;\n  int dimw = 2;\n\n  if (ndim == 4) {\n    dimf++;\n    dimh++;\n    
dimw++;\n  }\n\n  TORCH_CHECK(ndim == 3 || ndim == 4,\n              \"3D or 4D input tensor expected but got: %s\", ndim);\n\n  long nInputPlane = weight.size(1) * group;\n  long inputHeight = input.size(dimh);\n  long inputWidth = input.size(dimw);\n  long nOutputPlane = weight.size(0);\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n\n  TORCH_CHECK(nInputPlane % deformable_group == 0,\n              \"input channels must divide deformable group size\");\n\n  if (outputWidth < 1 || outputHeight < 1)\n    AT_ERROR(\n        \"Given input size: (%ld x %ld x %ld). \"\n        \"Calculated output size: (%ld x %ld x %ld). Output size is too small\",\n        nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,\n        outputWidth);\n\n  TORCH_CHECK(input.size(1) == nInputPlane,\n              \"invalid number of input planes, expected: %d, but got: %d\",\n              nInputPlane, input.size(1));\n\n  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),\n              \"input image is smaller than kernel\");\n\n  TORCH_CHECK(\n      (offset.size(2) == outputHeight && offset.size(3) == outputWidth),\n      \"invalid spatial size of offset, expected height: %d width: %d, but \"\n      \"got height: %d width: %d\",\n      outputHeight, outputWidth, offset.size(2), offset.size(3));\n\n  TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),\n              \"invalid number of channels of offset\");\n\n  if (gradOutput != NULL) {\n    TORCH_CHECK(\n        gradOutput->size(dimf) == nOutputPlane,\n        \"invalid number of gradOutput planes, expected: %d, but got: %d\",\n        nOutputPlane, gradOutput->size(dimf));\n\n    TORCH_CHECK(\n        (gradOutput->size(dimh) == outputHeight &&\n         gradOutput->size(dimw) == outputWidth),\n        \"invalid size of gradOutput, expected height: %d width: %d , but 
\"\n        \"got height: %d width: %d\",\n        outputHeight, outputWidth, gradOutput->size(dimh),\n        gradOutput->size(dimw));\n  }\n}\n\nvoid deform_conv_forward(Tensor input, Tensor weight, Tensor offset,\n                         Tensor output, Tensor columns, Tensor ones, int kW,\n                         int kH, int dW, int dH, int padW, int padH,\n                         int dilationW, int dilationH, int group,\n                         int deformable_group, int im2col_step) {\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(offset);\n    CHECK_CUDA_INPUT(weight);\n    CHECK_CUDA_INPUT(output);\n    CHECK_CUDA_INPUT(columns);\n    CHECK_CUDA_INPUT(ones);\n#else\n    AT_ERROR(\"DeformConv is not compiled with GPU support\");\n#endif\n  } else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(offset);\n    CHECK_CPU_INPUT(weight);\n    CHECK_CPU_INPUT(output);\n    CHECK_CPU_INPUT(columns);\n    CHECK_CPU_INPUT(ones);\n  }\n\n  deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,\n                          padW, dilationH, dilationW, group, deformable_group);\n  at::DeviceGuard guard(input.device());\n\n  int batch = 1;\n  if (input.ndimension() == 3) {\n    // Force batch\n    batch = 0;\n    input.unsqueeze_(0);\n    offset.unsqueeze_(0);\n  }\n\n  // todo: assert batchsize dividable by im2col_step\n\n  long batchSize = input.size(0);\n  long nInputPlane = input.size(1);\n  long inputHeight = input.size(2);\n  long inputWidth = input.size(3);\n\n  long nOutputPlane = weight.size(0);\n\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n\n  TORCH_CHECK((offset.size(0) == batchSize), \"invalid batch size of offset\");\n\n  output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,\n                        outputHeight, 
outputWidth});\n  columns = at::zeros(\n      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},\n      input.options());\n\n  if (ones.ndimension() != 2 ||\n      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {\n    ones = at::ones({outputHeight, outputWidth}, input.options());\n  }\n\n  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                      inputHeight, inputWidth});\n  offset =\n      offset.view({batchSize / im2col_step, im2col_step,\n                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,\n                                    im2col_step * outputHeight, outputWidth},\n                                   output.options());\n\n  output_buffer = output_buffer.view(\n      {output_buffer.size(0), group, output_buffer.size(1) / group,\n       output_buffer.size(2), output_buffer.size(3)});\n\n  for (int elt = 0; elt < batchSize / im2col_step; elt++) {\n    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,\n                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,\n                           dilationW, im2col_step, deformable_group, columns);\n\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n\n    for (int g = 0; g < group; g++) {\n      output_buffer[elt][g] = output_buffer[elt][g]\n                                  .flatten(1)\n                                  .addmm_(weight[g].flatten(1), columns[g])\n                                  .view_as(output_buffer[elt][g]);\n    }\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                          weight.size(3), 
weight.size(4)});\n  }\n\n  output_buffer = output_buffer.view(\n      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),\n       output_buffer.size(3), output_buffer.size(4)});\n\n  output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,\n                                      im2col_step, outputHeight, outputWidth});\n  output_buffer.transpose_(1, 2);\n  output.copy_(output_buffer);\n  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});\n\n  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  offset = offset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  if (batch == 0) {\n    output = output.view({nOutputPlane, outputHeight, outputWidth});\n    input = input.view({nInputPlane, inputHeight, inputWidth});\n    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});\n  }\n}\n\nvoid deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,\n                                Tensor gradInput, Tensor gradOffset,\n                                Tensor weight, Tensor columns, int kW, int kH,\n                                int dW, int dH, int padW, int padH,\n                                int dilationW, int dilationH, int group,\n                                int deformable_group, int im2col_step) {\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(offset);\n    CHECK_CUDA_INPUT(gradOutput);\n    CHECK_CUDA_INPUT(gradInput);\n    CHECK_CUDA_INPUT(gradOffset);\n    CHECK_CUDA_INPUT(weight);\n    CHECK_CUDA_INPUT(columns);\n#else\n    AT_ERROR(\"DeformConv is not compiled with GPU support\");\n#endif\n  } else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(offset);\n    CHECK_CPU_INPUT(gradOutput);\n    CHECK_CPU_INPUT(gradInput);\n    CHECK_CPU_INPUT(gradOffset);\n    CHECK_CPU_INPUT(weight);\n    CHECK_CPU_INPUT(columns);\n  }\n  
deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,\n                          padH, padW, dilationH, dilationW, group,\n                          deformable_group);\n\n  at::DeviceGuard guard(input.device());\n\n  int batch = 1;\n  if (input.ndimension() == 3) {\n    // Force batch\n    batch = 0;\n    input = input.view({1, input.size(0), input.size(1), input.size(2)});\n    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});\n    gradOutput = gradOutput.view(\n        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});\n  }\n\n  long batchSize = input.size(0);\n  long nInputPlane = input.size(1);\n  long inputHeight = input.size(2);\n  long inputWidth = input.size(3);\n\n  long nOutputPlane = weight.size(0);\n\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n\n  TORCH_CHECK((offset.size(0) == batchSize), 3, \"invalid batch size of offset\");\n  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  columns = at::zeros(\n      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},\n      input.options());\n\n  // change order of grad output\n  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,\n                                nOutputPlane, outputHeight, outputWidth});\n  gradOutput.transpose_(1, 2);\n\n  gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                              inputHeight, inputWidth});\n  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                      inputHeight, inputWidth});\n  gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,\n                                deformable_group * 2 * kH * kW, outputHeight,\n                                outputWidth});\n  offset =\n      offset.view({batchSize / im2col_step, 
im2col_step,\n                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  for (int elt = 0; elt < batchSize / im2col_step; elt++) {\n    // divide into groups\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n    gradOutput = gradOutput.view(\n        {gradOutput.size(0), group, gradOutput.size(1) / group,\n         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});\n\n    for (int g = 0; g < group; g++) {\n      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),\n                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);\n    }\n\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    gradOutput = gradOutput.view(\n        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),\n         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});\n\n    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,\n                                 inputHeight, inputWidth, kH, kW, padH, padW,\n                                 dH, dW, dilationH, dilationW, im2col_step,\n                                 deformable_group, gradOffset[elt]);\n\n    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,\n                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,\n                           dilationW, im2col_step, deformable_group,\n                           gradInput[elt]);\n\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                          weight.size(3), weight.size(4)});\n  }\n\n  gradOutput.transpose_(1, 2);\n  gradOutput =\n      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});\n\n  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  input = 
input.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  gradOffset = gradOffset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n  offset = offset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  if (batch == 0) {\n    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});\n    input = input.view({nInputPlane, inputHeight, inputWidth});\n    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});\n    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});\n    gradOffset =\n        gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});\n  }\n}\n\nvoid deform_conv_backward_parameters(Tensor input, Tensor offset,\n                                     Tensor gradOutput, Tensor gradWeight,\n                                     Tensor columns, Tensor ones, int kW,\n                                     int kH, int dW, int dH, int padW, int padH,\n                                     int dilationW, int dilationH, int group,\n                                     int deformable_group, float scale,\n                                     int im2col_step) {\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(offset);\n    CHECK_CUDA_INPUT(gradOutput);\n    CHECK_CUDA_INPUT(gradWeight);\n    CHECK_CUDA_INPUT(columns);\n    CHECK_CUDA_INPUT(ones);\n#else\n    AT_ERROR(\"DeformConv is not compiled with GPU support\");\n#endif\n  } else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(offset);\n    CHECK_CPU_INPUT(gradOutput);\n    CHECK_CPU_INPUT(gradWeight);\n    CHECK_CPU_INPUT(columns);\n    CHECK_CPU_INPUT(ones);\n  }\n\n  deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,\n                          dW, padH, padW, dilationH, dilationW, group,\n                          deformable_group);\n  at::DeviceGuard guard(input.device());\n\n  int batch = 
1;\n\n  if (input.ndimension() == 3) {\n    // Force batch\n    batch = 0;\n    input = input.view(\n        at::IntList({1, input.size(0), input.size(1), input.size(2)}));\n    gradOutput = gradOutput.view(\n        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});\n  }\n\n  long batchSize = input.size(0);\n  long nInputPlane = input.size(1);\n  long inputHeight = input.size(2);\n  long inputWidth = input.size(3);\n\n  long nOutputPlane = gradWeight.size(0);\n\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n\n  TORCH_CHECK((offset.size(0) == batchSize), \"invalid batch size of offset\");\n\n  columns = at::zeros(\n      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},\n      input.options());\n\n  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,\n                                nOutputPlane, outputHeight, outputWidth});\n  gradOutput.transpose_(1, 2);\n\n  Tensor gradOutputBuffer = at::zeros_like(gradOutput);\n  gradOutputBuffer =\n      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,\n                             outputHeight, outputWidth});\n  gradOutputBuffer = gradOutputBuffer.contiguous();\n  gradOutputBuffer.copy_(gradOutput);\n  gradOutputBuffer =\n      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,\n                             im2col_step * outputHeight, outputWidth});\n\n  gradOutput.transpose_(1, 2);\n  gradOutput =\n      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});\n\n  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                      inputHeight, inputWidth});\n  offset =\n      offset.view({batchSize / im2col_step, im2col_step,\n                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  for (int elt = 0; elt < batchSize / im2col_step; 
elt++) {\n    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,\n                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,\n                           dilationW, im2col_step, deformable_group, columns);\n\n    // divide into group\n    gradOutputBuffer = gradOutputBuffer.view(\n        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,\n         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    gradWeight =\n        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),\n                         gradWeight.size(2), gradWeight.size(3)});\n\n    for (int g = 0; g < group; g++) {\n      gradWeight[g] = gradWeight[g]\n                          .flatten(1)\n                          .addmm_(gradOutputBuffer[elt][g].flatten(1),\n                                  columns[g].transpose(1, 0), 1.0, scale)\n                          .view_as(gradWeight[g]);\n    }\n    gradOutputBuffer = gradOutputBuffer.view(\n        {gradOutputBuffer.size(0),\n         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),\n         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),\n                                  gradWeight.size(2), gradWeight.size(3),\n                                  gradWeight.size(4)});\n  }\n\n  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  offset = offset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  if (batch == 0) {\n    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});\n    input = input.view({nInputPlane, inputHeight, inputWidth});\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/deform_conv_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"deform_conv_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid deform_conv_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                      const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,\n      im2col_step;\n  SSAttrs(attr)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"dW\", dW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"im2col_step\", im2col_step)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& offset = buildATensor(ctx, ins[2]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  auto columns = buildATensor(ctx, outs[1]);\n  auto ones = buildATensor(ctx, outs[2]);\n\n  deform_conv_forward(input, weight, offset, output, columns, ones, kW, kH, dW,\n                      dH, padW, padH, dilationW, dilationH, group,\n                      deformable_group, im2col_step);\n}\n\nvoid deform_conv_backward_input_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,\n      im2col_step;\n  SSAttrs(attr)\n      .get<int>(\"kW\", kW)\n      
.get<int>(\"kH\", kH)\n      .get<int>(\"dW\", dW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"im2col_step\", im2col_step)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& offset = buildATensor(ctx, ins[1]);\n  const auto& gradOutput = buildATensor(ctx, ins[2]);\n\n  auto gradInput = buildATensor(ctx, outs[0]);\n  auto gradOffset = buildATensor(ctx, outs[1]);\n  auto weight = buildATensor(ctx, outs[2]);\n  auto columns = buildATensor(ctx, outs[3]);\n\n  deform_conv_backward_input(input, offset, gradOutput, gradInput, gradOffset,\n                             weight, columns, kW, kH, dW, dH, padW, padH,\n                             dilationW, dilationH, group, deformable_group,\n                             im2col_step);\n}\n\nvoid deform_conv_backward_parameters_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,\n      im2col_step;\n  float scale;\n  SSAttrs(attr)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"dW\", dW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<float>(\"scale\", scale)\n      .get<int>(\"im2col_step\", im2col_step)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& offset = buildATensor(ctx, ins[1]);\n  const auto& gradOutput = buildATensor(ctx, ins[2]);\n\n  auto gradWeight = buildATensor(ctx, outs[0]);\n  
auto columns = buildATensor(ctx, outs[1]);\n  auto ones = buildATensor(ctx, outs[2]);\n  deform_conv_backward_parameters(input, offset, gradOutput, gradWeight,\n                                  columns, ones, kW, kH, dW, dH, padW, padH,\n                                  dilationW, dilationH, group, deformable_group,\n                                  scale, im2col_step);\n}\n#endif\n\nvoid deform_conv_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,\n                                     const OperatorBase::in_list_t& ins,\n                                     OperatorBase::out_list_t& outs) {\n  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,\n      im2col_step;\n  SSAttrs(attr)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"dW\", dW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"im2col_step\", im2col_step)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& offset = buildATensor(ctx, ins[2]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  auto columns = buildATensor(ctx, outs[1]);\n  auto ones = buildATensor(ctx, outs[2]);\n\n  deform_conv_forward(input, weight, offset, output, columns, ones, kW, kH, dW,\n                      dH, padW, padH, dilationW, dilationH, group,\n                      deformable_group, im2col_step);\n}\n\nvoid deform_conv_backward_input_cpu_parrots(HostContext& ctx,\n                                            const SSElement& attr,\n                                            const OperatorBase::in_list_t& ins,\n                                            OperatorBase::out_list_t& outs) {\n  int kW, kH, dW, dH, padW, padH, dilationW, 
dilationH, group, deformable_group,\n      im2col_step;\n  SSAttrs(attr)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"dW\", dW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"im2col_step\", im2col_step)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& offset = buildATensor(ctx, ins[1]);\n  const auto& gradOutput = buildATensor(ctx, ins[2]);\n\n  auto gradInput = buildATensor(ctx, outs[0]);\n  auto gradOffset = buildATensor(ctx, outs[1]);\n  auto weight = buildATensor(ctx, outs[2]);\n  auto columns = buildATensor(ctx, outs[3]);\n\n  deform_conv_backward_input(input, offset, gradOutput, gradInput, gradOffset,\n                             weight, columns, kW, kH, dW, dH, padW, padH,\n                             dilationW, dilationH, group, deformable_group,\n                             im2col_step);\n}\n\nvoid deform_conv_backward_parameters_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,\n      im2col_step;\n  float scale;\n  SSAttrs(attr)\n      .get<int>(\"kW\", kW)\n      .get<int>(\"kH\", kH)\n      .get<int>(\"dW\", dW)\n      .get<int>(\"dH\", dH)\n      .get<int>(\"padW\", padW)\n      .get<int>(\"padH\", padH)\n      .get<int>(\"dilationW\", dilationW)\n      .get<int>(\"dilationH\", dilationH)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<float>(\"scale\", scale)\n      .get<int>(\"im2col_step\", im2col_step)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& offset = buildATensor(ctx, ins[1]);\n  
const auto& gradOutput = buildATensor(ctx, ins[2]);\n\n  auto gradWeight = buildATensor(ctx, outs[0]);\n  auto columns = buildATensor(ctx, outs[1]);\n  auto ones = buildATensor(ctx, outs[2]);\n  deform_conv_backward_parameters(input, offset, gradOutput, gradWeight,\n                                  columns, ones, kW, kH, dW, dH, padW, padH,\n                                  dilationW, dilationH, group, deformable_group,\n                                  scale, im2col_step);\n}\n\nPARROTS_EXTENSION_REGISTER(deform_conv_forward)\n    .attr(\"kW\")\n    .attr(\"kH\")\n    .attr(\"dW\")\n    .attr(\"dH\")\n    .attr(\"padW\")\n    .attr(\"padH\")\n    .attr(\"dilationW\")\n    .attr(\"dilationH\")\n    .attr(\"group\")\n    .attr(\"deformable_group\")\n    .attr(\"im2col_step\")\n    .input(3)\n    .output(3)\n    .apply(deform_conv_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(deform_conv_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(deform_conv_backward_input)\n    .attr(\"kW\")\n    .attr(\"kH\")\n    .attr(\"dW\")\n    .attr(\"dH\")\n    .attr(\"padW\")\n    .attr(\"padH\")\n    .attr(\"dilationW\")\n    .attr(\"dilationH\")\n    .attr(\"group\")\n    .attr(\"deformable_group\")\n    .attr(\"im2col_step\")\n    .input(3)\n    .output(4)\n    .apply(deform_conv_backward_input_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(deform_conv_backward_input_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)\n    .attr(\"kW\")\n    .attr(\"kH\")\n    .attr(\"dW\")\n    .attr(\"dH\")\n    .attr(\"padW\")\n    .attr(\"padH\")\n    .attr(\"dilationW\")\n    .attr(\"dilationH\")\n    .attr(\"group\")\n    .attr(\"deformable_group\")\n    .attr(\"scale\")\n    .attr(\"im2col_step\")\n    .input(3)\n    .output(3)\n    .apply(deform_conv_backward_parameters_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(deform_conv_backward_parameters_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/deform_conv_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef DEFORM_CONV_PYTORCH_H\n#define DEFORM_CONV_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid deform_conv_forward(Tensor input, Tensor weight, Tensor offset,\n                         Tensor output, Tensor columns, Tensor ones, int kW,\n                         int kH, int dW, int dH, int padW, int padH,\n                         int dilationW, int dilationH, int group,\n                         int deformable_group, int im2col_step);\n\nvoid deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,\n                                Tensor gradInput, Tensor gradOffset,\n                                Tensor weight, Tensor columns, int kW, int kH,\n                                int dW, int dH, int padW, int padH,\n                                int dilationW, int dilationH, int group,\n                                int deformable_group, int im2col_step);\n\nvoid deform_conv_backward_parameters(Tensor input, Tensor offset,\n                                     Tensor gradOutput, Tensor gradWeight,\n                                     Tensor columns, Tensor ones, int kW,\n                                     int kH, int dW, int dH, int padW, int padH,\n                                     int dilationW, int dilationH, int group,\n                                     int deformable_group, float scale,\n                                     int im2col_step);\n\n#endif  // DEFORM_CONV_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/deform_roi_pool.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,\n                       output, pooled_height, pooled_width, spatial_scale,\n                       sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma) {\n  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,\n                       offset, grad_input, grad_offset, pooled_height,\n                       pooled_width, spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,\n                             Tensor output, int pooled_height, int pooled_width,\n                             float spatial_scale, int sampling_ratio,\n                             float gamma) {\n  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,\n                               pooled_width, spatial_scale, sampling_ratio,\n                               gamma);\n}\n\nvoid deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,\n                              Tensor offset, Tensor grad_input,\n                              Tensor grad_offset, int pooled_height,\n                              int 
pooled_width, float spatial_scale,\n                              int sampling_ratio, float gamma) {\n  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,\n                                grad_offset, pooled_height, pooled_width,\n                                spatial_scale, sampling_ratio, gamma);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/deform_roi_pool_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"deform_roi_pool_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\n/*void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,\n *                                  Tensor output, int pooled_height,\n *                                  int pooled_width, float spatial_scale,\n *                                  int sampling_ratio, float gamma);\n */\nvoid deform_roi_pool_forward_cuda_parrots(CudaContext& ctx,\n                                          const SSElement& attr,\n                                          const OperatorBase::in_list_t& ins,\n                                          OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sampling_ratio;\n  float gamma;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<float>(\"gamma\", gamma)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  const auto& offset = buildATensor(ctx, ins[2]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,\n                               pooled_width, spatial_scale, sampling_ratio,\n                               gamma);\n}\n\n/*void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,\n *                                   Tensor rois, Tensor offset,\n *                                   Tensor grad_input, Tensor grad_offset,\n *                                   int pooled_height, int pooled_width,\n *                                   float spatial_scale, int 
sampling_ratio,\n *                                   float gamma);\n */\nvoid deform_roi_pool_backward_cuda_parrots(CudaContext& ctx,\n                                           const SSElement& attr,\n                                           const OperatorBase::in_list_t& ins,\n                                           OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sampling_ratio;\n  float gamma;\n\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<float>(\"gamma\", gamma)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& input = buildATensor(ctx, ins[1]);\n  const auto& rois = buildATensor(ctx, ins[2]);\n  const auto& offset = buildATensor(ctx, ins[3]);\n\n  auto grad_input = buildATensor(ctx, outs[0]);\n  auto grad_offset = buildATensor(ctx, outs[1]);\n\n  deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,\n                                grad_offset, pooled_height, pooled_width,\n                                spatial_scale, sampling_ratio, gamma);\n}\n\nPARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"sampling_ratio\")\n    .attr(\"gamma\")\n    .input(3)\n    .output(1)\n    .apply(deform_roi_pool_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"sampling_ratio\")\n    .attr(\"gamma\")\n    .input(4)\n    .output(2)\n    .apply(deform_roi_pool_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/deform_roi_pool_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef DEFORM_ROI_POOL_PYTORCH_H\n#define DEFORM_ROI_POOL_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma);\n#endif  // DEFORM_ROI_POOL_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/diff_iou_rotated.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid) {\n  return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,\n                              vertices, mask, num_valid);\n}\n\nTensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,\n                                              Tensor num_valid) {\n  return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/diff_iou_rotated_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"diff_iou_rotated_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid diff_iou_rotated_sort_vertices_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  at::Tensor boxes, scores, dets;\n  auto vertices = buildATensor(ctx, ins[0]);\n  auto mask = buildATensor(ctx, ins[1]);\n  auto num_valid = buildATensor(ctx, ins[2]);\n  auto out =\n      diff_iou_rotated_sort_vertices_forward_cuda(vertices, mask, num_valid);\n  updateDArray(ctx, out, outs[0]);\n}\n\nPARROTS_EXTENSION_REGISTER(diff_iou_rotated_sort_vertices_forward)\n    .input(3)\n    .output(1)\n    .apply(diff_iou_rotated_sort_vertices_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/diff_iou_rotated_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef DIFF_IOU_ROTATED_PYTORCH_H\n#define DIFF_IOU_ROTATED_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nTensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid);\n\n#endif  // DIFF_IOU_ROTATED_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/focal_loss.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,\n                       output, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha) {\n  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,\n                       grad_input, gamma, alpha);\n}\n\nvoid softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,\n                       output, gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha) {\n  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,\n                       buff, grad_input, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,\n                                Tensor output, float gamma, float alpha) {\n  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,\n                                 Tensor grad_input, float gamma, float alpha) {\n  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,\n                                   
alpha);\n}\n\nvoid softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,\n                                Tensor output, float gamma, float alpha) {\n  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,\n                                 Tensor buff, Tensor grad_input, float gamma,\n                                 float alpha) {\n  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,\n                                   gamma, alpha);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/focal_loss_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"focal_loss_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid sigmoid_focal_loss_forward_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  float gamma;\n  float alpha;\n  SSAttrs(attr).get<float>(\"gamma\", gamma).get<float>(\"alpha\", alpha).done();\n\n  // get inputs and outputs\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& target = buildATensor(ctx, ins[1]);\n  const auto& weight = buildATensor(ctx, ins[2]);\n\n  auto output = buildATensor(ctx, outs[0]);\n\n  sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  float gamma;\n  float alpha;\n  SSAttrs(attr).get<float>(\"gamma\", gamma).get<float>(\"alpha\", alpha).done();\n\n  // get inputs and outputs\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& target = buildATensor(ctx, ins[1]);\n  const auto& weight = buildATensor(ctx, ins[2]);\n\n  auto grad_input = buildATensor(ctx, outs[0]);\n\n  sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,\n                                   alpha);\n}\n\nvoid softmax_focal_loss_forward_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  float gamma;\n  float alpha;\n  
SSAttrs(attr).get<float>(\"gamma\", gamma).get<float>(\"alpha\", alpha).done();\n\n  // get inputs and outputs\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& target = buildATensor(ctx, ins[1]);\n  const auto& weight = buildATensor(ctx, ins[2]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  softmax_focal_loss_forward_cuda(input, target, weight, output, gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  float gamma;\n  float alpha;\n  SSAttrs(attr).get<float>(\"gamma\", gamma).get<float>(\"alpha\", alpha).done();\n\n  // get inputs and outputs\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& target = buildATensor(ctx, ins[1]);\n  const auto& weight = buildATensor(ctx, ins[2]);\n\n  auto buff = buildATensor(ctx, outs[0]);\n  auto grad_input = buildATensor(ctx, outs[1]);\n  softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,\n                                   gamma, alpha);\n}\n\nPARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)\n    .attr(\"gamma\")\n    .attr(\"alpha\")\n    .input(3)\n    .output(1)\n    .apply(sigmoid_focal_loss_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)\n    .attr(\"gamma\")\n    .attr(\"alpha\")\n    .input(3)\n    .output(1)\n    .apply(sigmoid_focal_loss_backward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)\n    .attr(\"gamma\")\n    .attr(\"alpha\")\n    .input(3)\n    .output(1)\n    .apply(softmax_focal_loss_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)\n    .attr(\"gamma\")\n    .attr(\"alpha\")\n    .input(3)\n    .output(2)\n    .apply(softmax_focal_loss_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/focal_loss_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef FOCAL_LOSS_PYTORCH_H\n#define FOCAL_LOSS_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha);\n\nvoid softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid softmax_focal_loss_backward_cuda(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha);\n#endif  // FOCAL_LOSS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/furthest_point_sample.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid furthest_point_sampling_forward_impl(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m) {\n  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,\n                       temp_tensor, idx_tensor, b, n, m);\n}\n\nvoid furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m) {\n  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,\n                       points_tensor, temp_tensor, idx_tensor, b, n, m);\n}\n\nvoid furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,\n                                     Tensor idx_tensor, int b, int n, int m) {\n  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,\n                                       b, n, m);\n}\n\nvoid furthest_point_sampling_with_dist_forward(Tensor points_tensor,\n                                               Tensor temp_tensor,\n                                               Tensor idx_tensor, int b, int n,\n                                               int m) {\n  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,\n                                                 idx_tensor, b, n, m);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/furthest_point_sample_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"furthest_point_sample_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid furthest_point_sample_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int b, n, m;\n  SSAttrs(attr).get<int>(\"b\", b).get<int>(\"n\", n).get<int>(\"m\", m).done();\n\n  auto points_tensor = buildATensor(ctx, ins[0]);\n  auto temp_tensor = buildATensor(ctx, ins[1]);\n\n  auto idx_tensor = buildATensor(ctx, outs[0]);\n\n  furthest_point_sampling_forward(points_tensor, temp_tensor, idx_tensor, b, n,\n                                  m);\n}\n\nvoid furthest_point_sampling_with_dist_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int b, n, m;\n  SSAttrs(attr).get<int>(\"b\", b).get<int>(\"n\", n).get<int>(\"m\", m).done();\n\n  auto points_tensor = buildATensor(ctx, ins[0]);\n  auto temp_tensor = buildATensor(ctx, ins[1]);\n\n  auto idx_tensor = buildATensor(ctx, outs[0]);\n\n  furthest_point_sampling_with_dist_forward(points_tensor, temp_tensor,\n                                            idx_tensor, b, n, m);\n}\nPARROTS_EXTENSION_REGISTER(furthest_point_sampling_forward)\n    .attr(\"b\")\n    .attr(\"n\")\n    .attr(\"m\")\n    .input(2)\n    .output(1)\n    .apply(furthest_point_sample_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(furthest_point_sampling_with_dist_forward)\n    .attr(\"b\")\n    .attr(\"n\")\n    .attr(\"m\")\n    .input(2)\n    .output(1)\n    .apply(furthest_point_sampling_with_dist_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/furthest_point_sample_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef FURTHEST_POINT_SAMPLE_PYTORCH_H\n#define FURTHEST_POINT_SAMPLE_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,\n                                     Tensor idx_tensor, int b, int n, int m);\n\nvoid furthest_point_sampling_with_dist_forward(Tensor points_tensor,\n                                               Tensor temp_tensor,\n                                               Tensor idx_tensor, int b, int n,\n                                               int m);\n#endif  // FURTHEST_POINT_SAMPLE_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/fused_bias_leakyrelu.cpp",
    "content": "// Modified from\n// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp\n\n/*\nCopyright (c) 2021, NVIDIA Corporation. All rights reserved.\n\nNVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator\nAugmentation (ADA)\n=======================================================================\n\n1. Definitions\n\n\"Licensor\" means any person or entity that distributes its Work.\n\n\"Software\" means the original work of authorship made available under\nthis License.\n\n\"Work\" means the Software and any additions to or derivative works of\nthe Software that are made available under this License.\n\nThe terms \"reproduce,\" \"reproduction,\" \"derivative works,\" and\n\"distribution\" have the meaning as provided under U.S. copyright law;\nprovided, however, that for the purposes of this License, derivative\nworks shall not include works that remain separable from, or merely\nlink (or bind by name) to the interfaces of, the Work.\n\nWorks, including the Software, are \"made available\" under this License\nby including in or with the Work either (a) a copyright notice\nreferencing the applicability of this License to the Work, or (b) a\ncopy of this License.\n\n2. License Grants\n\n    2.1 Copyright Grant. Subject to the terms and conditions of this\n    License, each Licensor grants to you a perpetual, worldwide,\n    non-exclusive, royalty-free, copyright license to reproduce,\n    prepare derivative works of, publicly display, publicly perform,\n    sublicense and distribute its Work and any resulting derivative\n    works in any form.\n\n3. Limitations\n\n    3.1 Redistribution. You may reproduce or distribute the Work only\n    if (a) you do so under this License, (b) you include a complete\n    copy of this License with your distribution, and (c) you retain\n    without modification any copyright, patent, trademark, or\n    attribution notices that are present in the Work.\n\n    3.2 Derivative Works. 
You may specify that additional or different\n    terms apply to the use, reproduction, and distribution of your\n    derivative works of the Work (\"Your Terms\") only if (a) Your Terms\n    provide that the use limitation in Section 3.3 applies to your\n    derivative works, and (b) you identify the specific derivative\n    works that are subject to Your Terms. Notwithstanding Your Terms,\n    this License (including the redistribution requirements in Section\n    3.1) will continue to apply to the Work itself.\n\n    3.3 Use Limitation. The Work and any derivative works thereof only\n    may be used or intended for use non-commercially. Notwithstanding\n    the foregoing, NVIDIA and its affiliates may use the Work and any\n    derivative works commercially. As used herein, \"non-commercially\"\n    means for research or evaluation purposes only.\n\n    3.4 Patent Claims. If you bring or threaten to bring a patent claim\n    against any Licensor (including any claim, cross-claim or\n    counterclaim in a lawsuit) to enforce any patents that you allege\n    are infringed by any Work, then your rights under this License from\n    such Licensor (including the grant in Section 2.1) will terminate\n    immediately.\n\n    3.5 Trademarks. This License does not grant any rights to use any\n    Licensor’s or its affiliates’ names, logos, or trademarks, except\n    as necessary to reproduce the notices described in this License.\n\n    3.6 Termination. If you violate any term of this License, then your\n    rights under this License (including the grant in Section 2.1) will\n    terminate immediately.\n\n4. Disclaimer of Warranty.\n\nTHE WORK IS PROVIDED \"AS IS\" WITHOUT WARRANTIES OR CONDITIONS OF ANY\nKIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR\nNON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER\nTHIS LICENSE.\n\n5. 
Limitation of Liability.\n\nEXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL\nTHEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE\nSHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,\nINDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF\nOR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK\n(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,\nLOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER\nCOMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF\nTHE POSSIBILITY OF SUCH DAMAGES.\n\n=======================================================================\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,\n                                           const torch::Tensor& bias,\n                                           const torch::Tensor& refer, int act,\n                                           int grad, float alpha, float scale) {\n  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,\n                              act, grad, alpha, scale);\n}\n\ntorch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,\n                                   const torch::Tensor& bias,\n                                   const torch::Tensor& refer, int act,\n                                   int grad, float alpha, float scale) {\n  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,\n                                      scale);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/fused_bias_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <torch/extension.h>\n\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\nusing namespace at;\nusing namespace parrots;\n\ntorch::Tensor fused_bias_leakyrelu(const torch::Tensor &input,\n                                   const torch::Tensor &bias,\n                                   const torch::Tensor &refer, int act,\n                                   int grad, float alpha, float scale);\n\nvoid fused_bias_leakyrelu_parrots(CudaContext &ctx, const SSElement &attr,\n                                  const OperatorBase::in_list_t &ins,\n                                  OperatorBase::out_list_t &outs) {\n  int act, grad;\n  float alpha, scale;\n  SSAttrs(attr)\n      .get<int>(\"act\", act)\n      .get<int>(\"grad\", grad)\n      .get<float>(\"alpha\", alpha)\n      .get<float>(\"scale\", scale)\n      .done();\n  const auto &input = buildATensor(ctx, ins[0]);\n  const auto &bias = buildATensor(ctx, ins[1]);\n  const auto &refer = buildATensor(ctx, ins[2]);\n  auto out = fused_bias_leakyrelu(input, bias, refer, act, grad, alpha, scale);\n  updateDArray(ctx, out, outs[0]);\n}\n\nPARROTS_EXTENSION_REGISTER(fused_bias_leakyrelu)\n    .attr(\"act\")\n    .attr(\"grad\")\n    .attr(\"alpha\")\n    .attr(\"scale\")\n    .input(3)\n    .output(1)\n    .apply(fused_bias_leakyrelu_parrots)\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/gather_points.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid gather_points_forward_impl(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out) {\n  DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,\n                       idx, out);\n}\n\nvoid gather_points_backward_impl(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points) {\n  DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,\n                       idx, grad_points);\n}\n\nvoid gather_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                           Tensor out_tensor, int b, int c, int n,\n                           int npoints) {\n  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,\n                             out_tensor);\n}\n\nvoid gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                            Tensor grad_points_tensor, int b, int c, int n,\n                            int npoints) {\n  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,\n                              grad_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/gather_points_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"gather_points_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid gather_points_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  int b, c, n, npoints;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"c\", c)\n      .get<int>(\"n\", n)\n      .get<int>(\"npoints\", npoints)\n      .done();\n\n  auto points_tensor = buildATensor(ctx, ins[0]);\n  auto idx_tensor = buildATensor(ctx, ins[1]);\n\n  auto out_tensor = buildATensor(ctx, outs[0]);\n\n  gather_points_forward(points_tensor, idx_tensor, out_tensor, b, c, n,\n                        npoints);\n}\n\nvoid gather_points_backward_cuda_parrots(CudaContext& ctx,\n                                         const SSElement& attr,\n                                         const OperatorBase::in_list_t& ins,\n                                         OperatorBase::out_list_t& outs) {\n  int b, c, n, npoints;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"c\", c)\n      .get<int>(\"n\", n)\n      .get<int>(\"npoints\", npoints)\n      .done();\n\n  auto grad_out_tensor = buildATensor(ctx, ins[0]);\n  auto idx_tensor = buildATensor(ctx, ins[1]);\n\n  auto grad_points_tensor = buildATensor(ctx, outs[0]);\n\n  gather_points_backward(grad_out_tensor, idx_tensor, grad_points_tensor, b, c,\n                         n, npoints);\n}\n\nPARROTS_EXTENSION_REGISTER(gather_points_forward)\n    .attr(\"b\")\n    .attr(\"c\")\n    .attr(\"n\")\n    .attr(\"npoints\")\n    .input(2)\n    .output(1)\n    .apply(gather_points_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(gather_points_backward)\n    .attr(\"b\")\n    
.attr(\"c\")\n    .attr(\"n\")\n    .attr(\"npoints\")\n    .input(2)\n    .output(1)\n    .apply(gather_points_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/gather_points_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef GATHER_POINTS_PYTORCH_H\n#define GATHER_POINTS_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid gather_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                           Tensor out_tensor, int b, int c, int n, int npoints);\n\nvoid gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                            Tensor grad_points_tensor, int b, int c, int n,\n                            int npoints);\n#endif  // GATHER_POINTS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/group_points.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid group_points_forward_impl(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out) {\n  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,\n                       points, idx, out);\n}\n\nvoid group_points_backward_impl(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points) {\n  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,\n                       grad_out, idx, grad_points);\n}\n\nvoid group_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                          Tensor out_tensor, int b, int c, int n, int npoints,\n                          int nsample) {\n  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,\n                       points_tensor, idx_tensor, out_tensor);\n}\n\nvoid group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                           Tensor grad_points_tensor, int b, int c, int n,\n                           int npoints, int nsample) {\n  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,\n                             idx_tensor, grad_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/group_points_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"group_points_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid group_points_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                       const OperatorBase::in_list_t& ins,\n                                       OperatorBase::out_list_t& outs) {\n  int b, c, n, npoints, nsample;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"c\", c)\n      .get<int>(\"n\", n)\n      .get<int>(\"npoints\", npoints)\n      .get<int>(\"nsample\", nsample)\n      .done();\n  auto points_tensor = buildATensor(ctx, ins[0]);\n  auto idx_tensor = buildATensor(ctx, ins[1]);\n\n  auto out_tensor = buildATensor(ctx, outs[0]);\n\n  group_points_forward(points_tensor, idx_tensor, out_tensor, b, c, n, npoints,\n                       nsample);\n}\n\nvoid group_points_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  int b, c, n, npoints, nsample;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"c\", c)\n      .get<int>(\"n\", n)\n      .get<int>(\"npoints\", npoints)\n      .get<int>(\"nsample\", nsample)\n      .done();\n  auto grad_out_tensor = buildATensor(ctx, ins[0]);\n  auto idx_tensor = buildATensor(ctx, ins[1]);\n\n  auto grad_points_tensor = buildATensor(ctx, outs[0]);\n\n  group_points_backward(grad_out_tensor, idx_tensor, grad_points_tensor, b, c,\n                        n, npoints, nsample);\n}\n\nPARROTS_EXTENSION_REGISTER(group_points_forward)\n    .attr(\"b\")\n    .attr(\"c\")\n    .attr(\"n\")\n    .attr(\"npoints\")\n    .attr(\"nsample\")\n    .input(2)\n    .output(1)\n    .apply(group_points_forward_cuda_parrots)\n    
.done();\n\nPARROTS_EXTENSION_REGISTER(group_points_backward)\n    .attr(\"b\")\n    .attr(\"c\")\n    .attr(\"n\")\n    .attr(\"npoints\")\n    .attr(\"nsample\")\n    .input(2)\n    .output(1)\n    .apply(group_points_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/group_points_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef GROUP_POINTS_PYTORCH_H\n#define GROUP_POINTS_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid group_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                          Tensor out_tensor, int b, int c, int n, int npoints,\n                          int nsample);\n\nvoid group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                           Tensor grad_points_tensor, int b, int c, int n,\n                           int npoints, int nsample);\n\n#endif  // GROUP_POINTS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/info.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp\n#include \"pytorch_cpp_helper.hpp\"\n\n#ifdef MMCV_WITH_CUDA\n#ifdef MMCV_WITH_HIP\n#include <hip/hip_runtime_api.h>\nint get_hiprt_version() {\n  int runtimeVersion;\n  hipRuntimeGetVersion(&runtimeVersion);\n  return runtimeVersion;\n}\n#else\n#include <cuda_runtime_api.h>\nint get_cudart_version() { return CUDART_VERSION; }\n#endif\n#endif\n\nstd::string get_compiling_cuda_version() {\n#ifdef MMCV_WITH_CUDA\n#ifndef MMCV_WITH_HIP\n  std::ostringstream oss;\n  // copied from\n  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231\n  auto printCudaStyleVersion = [&](int v) {\n    oss << (v / 1000) << \".\" << (v / 10 % 100);\n    if (v % 10 != 0) {\n      oss << \".\" << (v % 10);\n    }\n  };\n  printCudaStyleVersion(get_cudart_version());\n  return oss.str();\n#else\n  std::ostringstream oss;\n  oss << get_hiprt_version();\n  return oss.str();\n#endif\n#else\n  return std::string(\"not available\");\n#endif\n}\n\n// similar to\n// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp\nstd::string get_compiler_version() {\n  std::ostringstream ss;\n#if defined(__GNUC__)\n#ifndef __clang__\n  { ss << \"GCC \" << __GNUC__ << \".\" << __GNUC_MINOR__; }\n#endif\n#endif\n\n#if defined(__clang_major__)\n  {\n    ss << \"clang \" << __clang_major__ << \".\" << __clang_minor__ << \".\"\n       << __clang_patchlevel__;\n  }\n#endif\n\n#if defined(_MSC_VER)\n  { ss << \"MSVC \" << _MSC_FULL_VER; }\n#endif\n  return ss.str();\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/iou3d.cpp",
    "content": "// Modified from\n// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp\n\n/*\n3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)\nWritten by Shaoshuai Shi\nAll Rights Reserved 2019-2020.\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nconst int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;\n\nvoid iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap) {\n  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,\n                       num_b, boxes_b, ans_overlap);\n}\n\nvoid iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh) {\n  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,\n                       nms_overlap_thresh);\n}\n\nvoid iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,\n                                     Tensor &keep_num,\n                                     float nms_overlap_thresh) {\n  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, keep, keep_num,\n                       nms_overlap_thresh);\n}\n\nvoid iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,\n                                     Tensor ans_overlap) {\n  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params boxes_b: (M, 5)\n  // params ans_overlap: (N, M)\n  int num_a = boxes_a.size(0);\n  int num_b = boxes_b.size(0);\n\n  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,\n                                       ans_overlap);\n}\n\nvoid iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                         float nms_overlap_thresh) {\n  // params boxes: (N, 7) [x, y, z, dx, dy, dz, 
heading]\n  // params keep: (N)\n  CHECK_CONTIGUOUS(boxes);\n  CHECK_CONTIGUOUS(keep);\n\n  iou3d_nms3d_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);\n}\n\nvoid iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                                float nms_overlap_thresh) {\n  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params keep: (N)\n\n  CHECK_CONTIGUOUS(boxes);\n  CHECK_CONTIGUOUS(keep);\n\n  iou3d_nms3d_normal_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/iou3d_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"iou3d_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid iou3d_boxes_overlap_bev_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto boxes_a = buildATensor(ctx, ins[0]);\n  auto boxes_b = buildATensor(ctx, ins[1]);\n\n  auto ans_iou = buildATensor(ctx, outs[0]);\n\n  iou3d_boxes_overlap_bev_forward(boxes_a, boxes_b, ans_iou);\n}\n\nvoid iou3d_nms3d_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                      const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  float nms_overlap_thresh;\n  SSAttrs(attr).get<float>(\"nms_overlap_thresh\", nms_overlap_thresh).done();\n\n  auto boxes = buildATensor(ctx, ins[0]);\n\n  auto keep = buildATensor(ctx, outs[0]);\n  auto keep_num = buildATensor(ctx, outs[1]);\n\n  iou3d_nms3d_forward(boxes, keep, keep_num, nms_overlap_thresh);\n}\n\nvoid iou3d_nms3d_normal_forward_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  float nms_overlap_thresh;\n  SSAttrs(attr).get<float>(\"nms_overlap_thresh\", nms_overlap_thresh).done();\n\n  auto boxes = buildATensor(ctx, ins[0]);\n\n  auto keep = buildATensor(ctx, outs[0]);\n  auto keep_num = buildATensor(ctx, outs[1]);\n\n  iou3d_nms3d_normal_forward(boxes, keep, keep_num, nms_overlap_thresh);\n}\n\nPARROTS_EXTENSION_REGISTER(iou3d_boxes_overlap_bev_forward)\n    .input(2)\n    .output(1)\n    .apply(iou3d_boxes_overlap_bev_forward_cuda_parrots)\n    
.done();\n\nPARROTS_EXTENSION_REGISTER(iou3d_nms3d_forward)\n    .attr(\"nms_overlap_thresh\")\n    .input(1)\n    .output(2)\n    .apply(iou3d_nms3d_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(iou3d_nms3d_normal_forward)\n    .attr(\"nms_overlap_thresh\")\n    .input(1)\n    .output(2)\n    .apply(iou3d_nms3d_normal_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/iou3d_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef IOU_3D_PYTORCH_H\n#define IOU_3D_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,\n                                     Tensor ans_overlap);\n\nvoid iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                         float nms_overlap_thresh);\n\nvoid iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                                float nms_overlap_thresh);\n\n#endif  // IOU_3D_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/knn.cpp",
    "content": "// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2) {\n  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,\n                       dist2);\n}\n\nvoid knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,\n                 Tensor dist2_tensor, int b, int n, int m, int nsample) {\n  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,\n                   dist2_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/knn_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"knn_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid knn_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                              const OperatorBase::in_list_t& ins,\n                              OperatorBase::out_list_t& outs) {\n  int b, n, m, nsample;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"n\", n)\n      .get<int>(\"m\", m)\n      .get<int>(\"nsample\", nsample)\n      .done();\n\n  auto xyz_tensor = buildATensor(ctx, ins[0]);\n  auto new_xyz_tensor = buildATensor(ctx, ins[1]);\n\n  auto idx_tensor = buildATensor(ctx, outs[0]);\n  auto dist2_tensor = buildATensor(ctx, outs[1]);\n\n  knn_forward(xyz_tensor, new_xyz_tensor, idx_tensor, dist2_tensor, b, n, m,\n              nsample);\n}\n\nPARROTS_EXTENSION_REGISTER(knn_forward)\n    .attr(\"b\")\n    .attr(\"n\")\n    .attr(\"m\")\n    .attr(\"nsample\")\n    .input(2)\n    .output(2)\n    .apply(knn_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/knn_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef KNN_PYTORCH_H\n#define KNN_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,\n                 Tensor dist2_tensor, int b, int n, int m, int nsample);\n#endif  // KNN_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/masked_conv2d.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w) {\n  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,\n                       col, kernel_h, kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels) {\n  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,\n                       im, height, width, channels);\n}\n\nvoid masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,\n                           const Tensor mask_w_idx, Tensor col,\n                           const int kernel_h, const int kernel_w,\n                           const int pad_h, const int pad_w) {\n  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,\n                             kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,\n                           const Tensor mask_w_idx, Tensor im, int height,\n                           int width, int channels) {\n  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,\n                             channels);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/masked_conv2d_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"masked_conv2d_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid masked_im2col_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  int kernel_h, kernel_w, pad_h, pad_w;\n  SSAttrs(attr)\n      .get<int>(\"kernel_h\", kernel_h)\n      .get<int>(\"kernel_w\", kernel_w)\n      .get<int>(\"pad_h\", pad_h)\n      .get<int>(\"pad_w\", pad_w)\n      .done();\n\n  const auto& im = buildATensor(ctx, ins[0]);\n  const auto& mask_h_idx = buildATensor(ctx, ins[1]);\n  const auto& mask_w_idx = buildATensor(ctx, ins[2]);\n\n  auto col = buildATensor(ctx, outs[0]);\n  masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,\n                             kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)\n  int height, width, channels;\n  SSAttrs(attr)\n      .get<int>(\"height\", height)\n      .get<int>(\"width\", width)\n      .get<int>(\"channels\", channels)\n      .done();\n\n  const auto& col = buildATensor(ctx, ins[0]);\n  const auto& mask_h_idx = buildATensor(ctx, ins[1]);\n  const auto& mask_w_idx = buildATensor(ctx, ins[2]);\n\n  auto im = buildATensor(ctx, outs[0]);\n  masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,\n                     
        channels);\n}\n\nPARROTS_EXTENSION_REGISTER(masked_im2col_forward)\n    .attr(\"kernel_h\")\n    .attr(\"kernel_w\")\n    .attr(\"pad_h\")\n    .attr(\"pad_w\")\n    .input(3)\n    .output(1)\n    .apply(masked_im2col_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(masked_col2im_forward)\n    .attr(\"height\")\n    .attr(\"width\")\n    .attr(\"channels\")\n    .input(3)\n    .output(1)\n    .apply(masked_col2im_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/masked_conv2d_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MASKED_CONV2D_PYTORCH_H\n#define MASKED_CONV2D_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w);\n\nvoid masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels);\n#endif  // MASKED_CONV2D_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/min_area_polygons.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {\n  DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);\n}\n\nvoid min_area_polygons(const Tensor pointsets, Tensor polygons) {\n  min_area_polygons_impl(pointsets, polygons);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/min_area_polygons_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"min_area_polygons_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid min_area_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                    const OperatorBase::in_list_t& ins,\n                                    OperatorBase::out_list_t& outs) {\n  auto pointsets = buildATensor(ctx, ins[0]);\n\n  auto polygons = buildATensor(ctx, outs[0]);\n  min_area_polygons(pointsets, polygons);\n}\n\nPARROTS_EXTENSION_REGISTER(min_area_polygons)\n    .input(1)\n    .output(1)\n    .apply(min_area_polygons_cuda_parrots)\n    .done();\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/min_area_polygons_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MIN_AREA_POLYGONS_PYTORCH_H\n#define MIN_AREA_POLYGONS_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid min_area_polygons(const Tensor pointsets, Tensor polygons);\n\n#endif  // MIN_AREA_POLYGONS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/modulated_deform_conv.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid modulated_deformable_im2col_impl(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col) {\n  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,\n                       data_mask, batch_size, channels, height_im, width_im,\n                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,\n                       stride_h, stride_w, dilation_h, dilation_w,\n                       deformable_group, data_col);\n}\n\nvoid modulated_deformable_col2im_impl(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im) {\n  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,\n                       data_mask, batch_size, channels, height_im, width_im,\n                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,\n                       stride_h, stride_w, dilation_h, dilation_w,\n                       deformable_group, grad_im);\n}\n\nvoid modulated_deformable_col2im_coord_impl(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const 
int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask) {\n  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,\n                       data_im, data_offset, data_mask, batch_size, channels,\n                       height_im, width_im, height_col, width_col, kernel_h,\n                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,\n                       dilation_w, deformable_group, grad_offset, grad_mask);\n}\n\nvoid modulated_deform_conv_forward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,\n    const int stride_h, const int stride_w, const int pad_h, const int pad_w,\n    const int dilation_h, const int dilation_w, const int group,\n    const int deformable_group, const bool with_bias) {\n  at::DeviceGuard guard(input.device());\n\n  const int batch = input.size(0);\n  const int channels = input.size(1);\n  const int height = input.size(2);\n  const int width = input.size(3);\n\n  const int channels_out = weight.size(0);\n  const int channels_kernel = weight.size(1);\n  const int kernel_h_ = weight.size(2);\n  const int kernel_w_ = weight.size(3);\n\n  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)\n    AT_ERROR(\"Input shape and kernel shape won't match: (%d x %d vs %d x %d).\",\n             kernel_h_, kernel_w, kernel_h_, kernel_w_);\n  if (channels != channels_kernel * group)\n    AT_ERROR(\"Input shape and kernel channels won't match: (%d vs %d).\",\n             channels, channels_kernel * group);\n\n  const int height_out =\n      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;\n  const int 
width_out =\n      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;\n\n  if (ones.ndimension() != 2 ||\n      ones.size(0) * ones.size(1) < height_out * width_out) {\n    // Resize plane and fill with ones...\n    ones = at::ones({height_out, width_out}, input.options());\n  }\n\n  // resize output\n  output = output.view({batch, channels_out, height_out, width_out}).zero_();\n  // resize temporary columns\n  columns =\n      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},\n                input.options());\n\n  output = output.view({output.size(0), group, output.size(1) / group,\n                        output.size(2), output.size(3)});\n\n  for (int b = 0; b < batch; b++) {\n    modulated_deformable_im2col_impl(\n        input[b], offset[b], mask[b], 1, channels, height, width, height_out,\n        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n        dilation_h, dilation_w, deformable_group, columns);\n\n    // divide into group\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n\n    for (int g = 0; g < group; g++) {\n      output[b][g] = output[b][g]\n                         .flatten(1)\n                         .addmm_(weight[g].flatten(1), columns[g])\n                         .view_as(output[b][g]);\n    }\n\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                          weight.size(3), weight.size(4)});\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n  }\n\n  output = output.view({output.size(0), output.size(1) * output.size(2),\n                        output.size(3), output.size(4)});\n\n  if (with_bias) {\n    output += bias.view({1, bias.size(0), 1, 1});\n  }\n}\n\nvoid modulated_deform_conv_backward(\n    Tensor input, Tensor weight, Tensor 
bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,\n    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,\n    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,\n    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,\n    const bool with_bias) {\n  at::DeviceGuard guard(input.device());\n\n  const int batch = input.size(0);\n  const int channels = input.size(1);\n  const int height = input.size(2);\n  const int width = input.size(3);\n\n  const int channels_kernel = weight.size(1);\n  const int kernel_h_ = weight.size(2);\n  const int kernel_w_ = weight.size(3);\n  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)\n    AT_ERROR(\"Input shape and kernel shape won't match: (%d x %d vs %d x %d).\",\n             kernel_h_, kernel_w, kernel_h_, kernel_w_);\n  if (channels != channels_kernel * group)\n    AT_ERROR(\"Input shape and kernel channels won't match: (%d vs %d).\",\n             channels, channels_kernel * group);\n\n  const int height_out =\n      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;\n  const int width_out =\n      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;\n\n  if (ones.ndimension() != 2 ||\n      ones.size(0) * ones.size(1) < height_out * width_out) {\n    // Resize plane and fill with ones...\n    ones = at::ones({height_out, width_out}, input.options());\n  }\n\n  grad_input = grad_input.view({batch, channels, height, width});\n  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},\n                      input.options());\n\n  grad_output =\n      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,\n                        grad_output.size(2), grad_output.size(3)});\n\n  for (int b = 0; b < batch; b++) {\n    // divide int group\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    weight = 
weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n\n    for (int g = 0; g < group; g++) {\n      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),\n                        grad_output[b][g].flatten(1), 0.0f, 1.0f);\n    }\n\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                          weight.size(3), weight.size(4)});\n\n    // gradient w.r.t. input coordinate data\n    modulated_deformable_col2im_coord_impl(\n        columns, input[b], offset[b], mask[b], 1, channels, height, width,\n        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,\n        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],\n        grad_mask[b]);\n    // gradient w.r.t. input data\n    modulated_deformable_col2im_impl(\n        columns, offset[b], mask[b], 1, channels, height, width, height_out,\n        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n        dilation_h, dilation_w, deformable_group, grad_input[b]);\n\n    // gradient w.r.t. 
weight, dWeight should accumulate across the batch and\n    // group\n    modulated_deformable_im2col_impl(\n        input[b], offset[b], mask[b], 1, channels, height, width, height_out,\n        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n        dilation_h, dilation_w, deformable_group, columns);\n\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,\n                                    grad_weight.size(1), grad_weight.size(2),\n                                    grad_weight.size(3)});\n    if (with_bias)\n      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});\n\n    for (int g = 0; g < group; g++) {\n      grad_weight[g] =\n          grad_weight[g]\n              .flatten(1)\n              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))\n              .view_as(grad_weight[g]);\n      if (with_bias) {\n        grad_bias[g] =\n            grad_bias[g]\n                .view({-1, 1})\n                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))\n                .view(-1);\n      }\n    }\n\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),\n                                    grad_weight.size(2), grad_weight.size(3),\n                                    grad_weight.size(4)});\n    if (with_bias)\n      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});\n  }\n  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),\n                                  grad_output.size(2), grad_output.size(3),\n                                  grad_output.size(4)});\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/modulated_deform_conv_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"modulated_deform_conv_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid modulated_deform_conv_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,\n      dilation_w, group, deformable_group, with_bias;\n  SSAttrs(attr)\n      .get<int>(\"kernel_h\", kernel_h)\n      .get<int>(\"kernel_w\", kernel_w)\n      .get<int>(\"stride_h\", stride_h)\n      .get<int>(\"stride_w\", stride_w)\n      .get<int>(\"pad_h\", pad_h)\n      .get<int>(\"pad_w\", pad_w)\n      .get<int>(\"dilation_h\", dilation_h)\n      .get<int>(\"dilation_w\", dilation_w)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"with_bias\", with_bias)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& bias = buildATensor(ctx, ins[2]);\n  const auto& ones = buildATensor(ctx, ins[3]);\n  const auto& offset = buildATensor(ctx, ins[4]);\n  const auto& mask = buildATensor(ctx, ins[5]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  auto columns = buildATensor(ctx, outs[1]);\n\n  modulated_deform_conv_forward(input, weight, bias, ones, offset, mask, output,\n                                columns, kernel_h, kernel_w, stride_h, stride_w,\n                                pad_h, pad_w, dilation_h, dilation_w, group,\n                                deformable_group, with_bias);\n}\n\nvoid modulated_deform_conv_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int kernel_h, kernel_w, stride_h, stride_w, 
pad_h, pad_w, dilation_h,\n      dilation_w, group, deformable_group, with_bias;\n  SSAttrs(attr)\n      .get<int>(\"kernel_h\", kernel_h)\n      .get<int>(\"kernel_w\", kernel_w)\n      .get<int>(\"stride_h\", stride_h)\n      .get<int>(\"stride_w\", stride_w)\n      .get<int>(\"pad_h\", pad_h)\n      .get<int>(\"pad_w\", pad_w)\n      .get<int>(\"dilation_h\", dilation_h)\n      .get<int>(\"dilation_w\", dilation_w)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"with_bias\", with_bias)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& bias = buildATensor(ctx, ins[2]);\n  const auto& ones = buildATensor(ctx, ins[3]);\n  const auto& offset = buildATensor(ctx, ins[4]);\n  const auto& mask = buildATensor(ctx, ins[5]);\n\n  auto columns = buildATensor(ctx, outs[0]);\n  auto grad_input = buildATensor(ctx, outs[1]);\n  auto grad_weight = buildATensor(ctx, outs[2]);\n  auto grad_bias = buildATensor(ctx, outs[3]);\n  auto grad_offset = buildATensor(ctx, outs[4]);\n  auto grad_mask = buildATensor(ctx, outs[5]);\n  auto grad_output = buildATensor(ctx, outs[6]);\n  modulated_deform_conv_backward(\n      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,\n      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,\n      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n}\n#endif\n\nvoid modulated_deform_conv_forward_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,\n      dilation_w, group, deformable_group, with_bias;\n  SSAttrs(attr)\n      .get<int>(\"kernel_h\", kernel_h)\n      .get<int>(\"kernel_w\", kernel_w)\n      .get<int>(\"stride_h\", stride_h)\n      .get<int>(\"stride_w\", 
stride_w)\n      .get<int>(\"pad_h\", pad_h)\n      .get<int>(\"pad_w\", pad_w)\n      .get<int>(\"dilation_h\", dilation_h)\n      .get<int>(\"dilation_w\", dilation_w)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"with_bias\", with_bias)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& bias = buildATensor(ctx, ins[2]);\n  const auto& ones = buildATensor(ctx, ins[3]);\n  const auto& offset = buildATensor(ctx, ins[4]);\n  const auto& mask = buildATensor(ctx, ins[5]);\n\n  auto output = buildATensor(ctx, outs[0]);\n  auto columns = buildATensor(ctx, outs[1]);\n\n  modulated_deform_conv_forward(input, weight, bias, ones, offset, mask, output,\n                                columns, kernel_h, kernel_w, stride_h, stride_w,\n                                pad_h, pad_w, dilation_h, dilation_w, group,\n                                deformable_group, with_bias);\n}\n\nvoid modulated_deform_conv_backward_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,\n      dilation_w, group, deformable_group, with_bias;\n  SSAttrs(attr)\n      .get<int>(\"kernel_h\", kernel_h)\n      .get<int>(\"kernel_w\", kernel_w)\n      .get<int>(\"stride_h\", stride_h)\n      .get<int>(\"stride_w\", stride_w)\n      .get<int>(\"pad_h\", pad_h)\n      .get<int>(\"pad_w\", pad_w)\n      .get<int>(\"dilation_h\", dilation_h)\n      .get<int>(\"dilation_w\", dilation_w)\n      .get<int>(\"group\", group)\n      .get<int>(\"deformable_group\", deformable_group)\n      .get<int>(\"with_bias\", with_bias)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& bias = buildATensor(ctx, ins[2]);\n  const auto& ones = 
buildATensor(ctx, ins[3]);\n  const auto& offset = buildATensor(ctx, ins[4]);\n  const auto& mask = buildATensor(ctx, ins[5]);\n\n  auto columns = buildATensor(ctx, outs[0]);\n  auto grad_input = buildATensor(ctx, outs[1]);\n  auto grad_weight = buildATensor(ctx, outs[2]);\n  auto grad_bias = buildATensor(ctx, outs[3]);\n  auto grad_offset = buildATensor(ctx, outs[4]);\n  auto grad_mask = buildATensor(ctx, outs[5]);\n  auto grad_output = buildATensor(ctx, outs[6]);\n  modulated_deform_conv_backward(\n      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,\n      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,\n      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n}\nPARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)\n    .attr(\"kernel_h\")\n    .attr(\"kernel_w\")\n    .attr(\"stride_h\")\n    .attr(\"stride_w\")\n    .attr(\"pad_h\")\n    .attr(\"pad_w\")\n    .attr(\"dilation_h\")\n    .attr(\"dilation_w\")\n    .attr(\"group\")\n    .attr(\"deformable_group\")\n    .attr(\"with_bias\")\n    .input(6)\n    .output(2)\n    .apply(modulated_deform_conv_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(modulated_deform_conv_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)\n    .attr(\"kernel_h\")\n    .attr(\"kernel_w\")\n    .attr(\"stride_h\")\n    .attr(\"stride_w\")\n    .attr(\"pad_h\")\n    .attr(\"pad_w\")\n    .attr(\"dilation_h\")\n    .attr(\"dilation_w\")\n    .attr(\"group\")\n    .attr(\"deformable_group\")\n    .attr(\"with_bias\")\n    .input(6)\n    .output(7)\n    .apply(modulated_deform_conv_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(modulated_deform_conv_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/modulated_deform_conv_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef MODULATED_DEFORM_CONV_PYTORCH_H\n#define MODULATED_DEFORM_CONV_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid modulated_deform_conv_forward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,\n    const int stride_h, const int stride_w, const int pad_h, const int pad_w,\n    const int dilation_h, const int dilation_w, const int group,\n    const int deformable_group, const bool with_bias);\n\nvoid modulated_deform_conv_backward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,\n    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,\n    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,\n    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,\n    const bool with_bias);\n#endif  // MODULATED_DEFORM_CONV_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/ms_deform_attn.cpp",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from\n*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nTensor ms_deform_attn_impl_forward(const Tensor &value,\n                                   const Tensor &spatial_shapes,\n                                   const Tensor &level_start_index,\n                                   const Tensor &sampling_loc,\n                                   const Tensor &attn_weight,\n                                   const int im2col_step) {\n  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,\n                              spatial_shapes, level_start_index, sampling_loc,\n                              attn_weight, im2col_step);\n}\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,\n    const int im2col_step) {\n  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,\n                       level_start_index, sampling_loc, attn_weight,\n                       grad_output, grad_value, grad_sampling_loc,\n                       grad_attn_weight, im2col_step);\n}\n\nTensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,\n                              const Tensor &level_start_index,\n    
                          const Tensor &sampling_loc,\n                              const Tensor &attn_weight,\n                              const int im2col_step) {\n  at::DeviceGuard guard(value.device());\n  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,\n                                     sampling_loc, attn_weight, im2col_step);\n}\n\nvoid ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,\n                             const Tensor &level_start_index,\n                             const Tensor &sampling_loc,\n                             const Tensor &attn_weight,\n                             const Tensor &grad_output, Tensor &grad_value,\n                             Tensor &grad_sampling_loc,\n                             Tensor &grad_attn_weight, const int im2col_step) {\n  at::DeviceGuard guard(value.device());\n  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,\n                               sampling_loc, attn_weight, grad_output,\n                               grad_value, grad_sampling_loc, grad_attn_weight,\n                               im2col_step);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/ms_deform_attn_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <torch/extension.h>\n\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\nusing namespace at;\nusing namespace parrots;\n\nTensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,\n                              const Tensor &level_start_index,\n                              const Tensor &sampling_loc,\n                              const Tensor &attn_weight, const int im2col_step);\n\nvoid ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,\n                             const Tensor &level_start_index,\n                             const Tensor &sampling_loc,\n                             const Tensor &attn_weight,\n                             const Tensor &grad_output, Tensor &grad_value,\n                             Tensor &grad_sampling_loc,\n                             Tensor &grad_attn_weight, const int im2col_step);\n\nvoid ms_deform_attn_forward_parrots(CudaContext &ctx, const SSElement &attr,\n                                    const OperatorBase::in_list_t &ins,\n                                    OperatorBase::out_list_t &outs) {\n  int im2col_step;\n  SSAttrs(attr).get<int>(\"im2col_step\", im2col_step).done();\n  const auto &value = buildATensor(ctx, ins[0]);\n  const auto &spatial_shapes = buildATensor(ctx, ins[1]);\n  const auto &level_start_index = buildATensor(ctx, ins[2]);\n  const auto &sampling_loc = buildATensor(ctx, ins[3]);\n  const auto &attn_weight = buildATensor(ctx, ins[4]);\n  auto out = ms_deform_attn_forward(value, spatial_shapes, level_start_index,\n                                    sampling_loc, attn_weight, im2col_step);\n  updateDArray(ctx, out, outs[0]);\n}\n\nvoid ms_deform_attn_backward_parrots(CudaContext &ctx, const SSElement &attr,\n                                     const OperatorBase::in_list_t &ins,\n                                   
  OperatorBase::out_list_t &outs) {\n  int im2col_step;\n  SSAttrs(attr).get<int>(\"im2col_step\", im2col_step).done();\n  const auto &value = buildATensor(ctx, ins[0]);\n  const auto &spatial_shapes = buildATensor(ctx, ins[1]);\n  const auto &level_start_index = buildATensor(ctx, ins[2]);\n  const auto &sampling_loc = buildATensor(ctx, ins[3]);\n  const auto &attn_weight = buildATensor(ctx, ins[4]);\n  const auto &grad_output = buildATensor(ctx, ins[5]);\n  auto grad_value = buildATensor(ctx, outs[0]);\n  auto grad_sampling_loc = buildATensor(ctx, outs[1]);\n  auto grad_attn_weight = buildATensor(ctx, outs[2]);\n  ms_deform_attn_backward(value, spatial_shapes, level_start_index,\n                          sampling_loc, attn_weight, grad_output, grad_value,\n                          grad_sampling_loc, grad_attn_weight, im2col_step);\n}\n\nPARROTS_EXTENSION_REGISTER(ms_deform_attn_forward)\n    .attr(\"im2col_step\")\n    .input(5)\n    .output(1)\n    .apply(ms_deform_attn_forward_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(ms_deform_attn_backward)\n    .attr(\"im2col_step\")\n    .input(6)\n    .output(3)\n    .apply(ms_deform_attn_backward_parrots)\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/nms.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nTensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);\n}\n\nTensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,\n                    float iou_threshold, float sigma, float min_score,\n                    int method, int offset) {\n  return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,\n                              sigma, min_score, method, offset);\n}\n\nstd::vector<std::vector<int> > nms_match_impl(Tensor dets,\n                                              float iou_threshold) {\n  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);\n}\n\nTensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return nms_impl(boxes, scores, iou_threshold, offset);\n}\n\nTensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,\n               float sigma, float min_score, int method, int offset) {\n  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,\n                      method, offset);\n}\n\nstd::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {\n  return nms_match_impl(dets, iou_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/nms_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"nms_pytorch.h\"\n\nusing namespace parrots;\n\n// Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);\ntemplate <typename T>\nvoid nms_parrots(T& ctx, const SSElement& attr,\n                 const OperatorBase::in_list_t& ins,\n                 OperatorBase::out_list_t& outs) {\n  float iou_threshold;\n  int offset;\n  SSAttrs(attr)\n      .get(\"iou_threshold\", iou_threshold)\n      .get(\"offset\", offset)\n      .done();\n  at::Tensor boxes, scores;\n  boxes = buildATensor(ctx, ins[0]);\n  scores = buildATensor(ctx, ins[1]);\n  auto out = nms(boxes, scores, iou_threshold, offset);\n  updateDArray(ctx, out, outs[0]);\n}\n\n/*Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,\n *                float sigma, float min_score, int method, int offset);*/\ntemplate <typename T>\nvoid softnms_parrots(T& ctx, const SSElement& attr,\n                     const OperatorBase::in_list_t& ins,\n                     OperatorBase::out_list_t& outs) {\n  float iou_threshold, sigma, min_score;\n  int method, offset;\n  SSAttrs(attr)\n      .get(\"iou_threshold\", iou_threshold)\n      .get(\"sigma\", sigma)\n      .get(\"min_score\", min_score)\n      .get(\"method\", method)\n      .get(\"offset\", offset)\n      .done();\n  at::Tensor boxes, scores, dets;\n  boxes = buildATensor(ctx, ins[0]);\n  scores = buildATensor(ctx, ins[1]);\n  dets = buildATensor(ctx, ins[2]);\n  auto out = softnms(boxes, scores, dets, iou_threshold, sigma, min_score,\n                     method, offset);\n  updateDArray(ctx, out, outs[0]);\n}\n\n// std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold);\ntemplate <typename T>\nvoid nms_match_parrots(T& ctx, const SSElement& attr,\n                       const OperatorBase::in_list_t& ins,\n     
                  OperatorBase::out_list_t& outs) {\n  float iou_threshold;\n  SSAttrs(attr).get(\"iou_threshold\", iou_threshold).done();\n  at::Tensor dets;\n  dets = buildATensor(ctx, ins[0]);\n  auto out = nms_match(dets, iou_threshold);\n  int n = out.size(), m = 0;\n  for (int i = 0; i < n; ++i)\n    if (m < out[i].size()) m = out[i].size();\n  auto options = torch::TensorOptions().dtype(at::kInt);\n  auto tensor = torch::zeros({n, m}, options);\n  for (int i = 0; i < n; i++)\n    tensor.slice(0, i, i + 1) =\n        torch::from_blob(out[i].data(), {out[i].size()}, options);\n  updateDArray(ctx, tensor, outs[0]);\n}\n\n/*Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,\n *                    const Tensor dets_sorted, const float iou_threshold,\n *                                       const int multi_label);*/\ntemplate <typename T>\nvoid nms_rotated_parrots(T& ctx, const SSElement& attr,\n                         const OperatorBase::in_list_t& ins,\n                         OperatorBase::out_list_t& outs) {\n  float iou_threshold;\n  int multi_label;\n  SSAttrs(attr)\n      .get(\"iou_threshold\", iou_threshold)\n      .get(\"multi_label\", multi_label)\n      .done();\n  at::Tensor dets, scores, order, dets_sorted;\n  dets = buildATensor(ctx, ins[0]);\n  scores = buildATensor(ctx, ins[1]);\n  order = buildATensor(ctx, ins[2]);\n  dets_sorted = buildATensor(ctx, ins[3]);\n  auto out =\n      nms_rotated(dets, scores, order, dets_sorted, iou_threshold, multi_label);\n  updateDArray(ctx, out, outs[0]);\n}\n\nPARROTS_EXTENSION_REGISTER(nms)\n    .attr(\"iou_threshold\")\n    .attr(\"offset\")\n    .input(2)\n    .output(1)\n    .apply(nms_parrots<HostContext>)\n#ifdef MMCV_WITH_CUDA\n    .apply(nms_parrots<CudaContext>)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(softnms)\n    .attr(\"iou_threshold\")\n    .attr(\"sigma\")\n    .attr(\"min_score\")\n    .attr(\"method\")\n    .attr(\"offset\")\n    .input(3)\n    
.output(1)\n    .apply(softnms_parrots<HostContext>)\n#ifdef MMCV_WITH_CUDA\n    .apply(softnms_parrots<CudaContext>)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(nms_match)\n    .attr(\"iou_threshold\")\n    .input(1)\n    .output(1)\n    .apply(nms_match_parrots<HostContext>)\n#ifdef MMCV_WITH_CUDA\n    .apply(nms_match_parrots<CudaContext>)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(nms_rotated)\n    .attr(\"multi_label\")\n    .attr(\"iou_threshold\")\n    .input(4)\n    .output(1)\n    .apply(nms_rotated_parrots<HostContext>)\n#ifdef MMCV_WITH_CUDA\n    .apply(nms_rotated_parrots<CudaContext>)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/nms_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef NMS_PYTORCH_H\n#define NMS_PYTORCH_H\n#include <torch/extension.h>\n\nat::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,\n               int offset);\n\nat::Tensor softnms(at::Tensor boxes, at::Tensor scores, at::Tensor dets,\n                   float iou_threshold, float sigma, float min_score,\n                   int method, int offset);\n\nstd::vector<std::vector<int> > nms_match(at::Tensor dets, float iou_threshold);\n\nat::Tensor nms_rotated(const at::Tensor dets, const at::Tensor scores,\n                       const at::Tensor order, const at::Tensor dets_sorted,\n                       const float iou_threshold, const int multi_label);\n#endif  // NMS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/nms_rotated.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h\n#include \"pytorch_cpp_helper.hpp\"\n\nTensor nms_rotated_cpu(const Tensor dets, const Tensor scores,\n                       const float iou_threshold);\n\n#ifdef MMCV_WITH_CUDA\nTensor nms_rotated_cuda(const Tensor dets, const Tensor scores,\n                        const Tensor order, const Tensor dets_sorted,\n                        const float iou_threshold, const int multi_label);\n#endif\n\n// Interface for Python\n// inline is needed to prevent multiple function definitions when this header is\n// included by different cpps\nTensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,\n                   const Tensor dets_sorted, const float iou_threshold,\n                   const int multi_label) {\n  assert(dets.device().is_cuda() == scores.device().is_cuda());\n  if (dets.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,\n                            multi_label);\n#else\n    AT_ERROR(\"Not compiled with GPU support\");\n#endif\n  }\n\n  return nms_rotated_cpu(dets, scores, iou_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/pixel_group.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// It is modified from https://github.com/WenmuZhou/PAN.pytorch\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nstd::vector<std::vector<float>> pixel_group_impl(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {\n  return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,\n                              kernel_label, kernel_contour, kernel_region_num,\n                              dis_threshold);\n}\n\nstd::vector<std::vector<float>> pixel_group(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float distance_threshold) {\n  score = score.contiguous();\n  mask = mask.contiguous();\n  embedding = embedding.contiguous();\n  kernel_label = kernel_label.contiguous();\n  kernel_contour = kernel_contour.contiguous();\n\n  return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,\n                          kernel_region_num, distance_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/pixel_group_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"pixel_group_pytorch.h\"\n\nusing namespace parrots;\nusing namespace std;\n\ntemplate <typename T>\nvoid pixel_group_parrots(T& ctx, const SSElement& attr,\n                         const OperatorBase::in_list_t& ins,\n                         OperatorBase::out_list_t& outs) {\n  int kernel_region_num;\n  float distance_threshold;\n  SSAttrs(attr)\n      .get<int>(\"kernel_region_num\", kernel_region_num)\n      .get<float>(\"distance_threshold\", distance_threshold)\n      .done();\n  at::Tensor score;\n  at::Tensor mask;\n  at::Tensor embedding;\n  at::Tensor kernel_label;\n  at::Tensor kernel_contour;\n  score = buildATensor(ctx, ins[0]);\n  mask = buildATensor(ctx, ins[1]);\n  embedding = buildATensor(ctx, ins[2]);\n  kernel_label = buildATensor(ctx, ins[3]);\n  kernel_contour = buildATensor(ctx, ins[4]);\n  auto out = pixel_group(score, mask, embedding, kernel_label, kernel_contour,\n                         kernel_region_num, distance_threshold);\n  int n = out.size();\n  std::vector<float> out_tensor;\n  for (int i = 0; i < n; ++i) out_tensor.push_back(float(out[i].size()));\n  for (int i = 0; i < n; ++i)\n    out_tensor.insert(out_tensor.end(), out[i].begin(), out[i].end());\n  auto options = torch::TensorOptions().dtype(at::kFloat);\n  auto tensor = torch::zeros({1, out_tensor.size()}, options);\n  tensor.slice(0, 0, 1) =\n      torch::from_blob(out_tensor.data(), {out_tensor.size()}, options);\n  updateDArray(ctx, tensor, outs[0]);\n}\n\nPARROTS_EXTENSION_REGISTER(pixel_group)\n    .attr(\"kernel_region_num\")\n    .attr(\"distance_threshold\")\n    .input(5)\n    .output(1)\n    .apply(pixel_group_parrots<HostContext>)\n#ifdef MMCV_WITH_CUDA\n    .apply(pixel_group_parrots<CudaContext>)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/pixel_group_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef PIXEL_GROUP_PYTORCH_H\n#define PIXEL_GROUP_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nstd::vector<std::vector<float>> pixel_group(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float distance_threshold);\n\n#endif  // PIXEL_GROUP_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/points_in_boxes.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid points_in_boxes_part_forward_impl(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points) {\n  DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,\n                       pts_num, boxes, pts, box_idx_of_points);\n}\n\nvoid points_in_boxes_all_forward_impl(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points) {\n  DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,\n                       pts_num, boxes, pts, box_idx_of_points);\n}\n\nvoid points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                  Tensor box_idx_of_points_tensor) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)\n  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),\n  // default -1\n  int batch_size = boxes_tensor.size(0);\n  int boxes_num = boxes_tensor.size(1);\n  int pts_num = pts_tensor.size(1);\n  points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,\n                                    boxes_tensor, pts_tensor,\n                                    box_idx_of_points_tensor);\n}\n\nvoid points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor box_idx_of_points_tensor) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center. 
params pts: (B, npoints, 3) [x, y, z]\n  // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1\n  int batch_size = boxes_tensor.size(0);\n  int boxes_num = boxes_tensor.size(1);\n  int pts_num = pts_tensor.size(1);\n  points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,\n                                   pts_tensor, box_idx_of_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/points_in_boxes_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"points_in_boxes_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid points_in_boxes_part_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto boxes_tensor = buildATensor(ctx, ins[0]);\n  auto pts_tensor = buildATensor(ctx, ins[1]);\n\n  auto box_idx_of_points_tensor = buildATensor(ctx, outs[0]);\n\n  points_in_boxes_part_forward(boxes_tensor, pts_tensor,\n                               box_idx_of_points_tensor);\n}\n\nvoid points_in_boxes_all_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  auto boxes_tensor = buildATensor(ctx, ins[0]);\n  auto pts_tensor = buildATensor(ctx, ins[1]);\n\n  auto box_idx_of_points_tensor = buildATensor(ctx, outs[0]);\n\n  points_in_boxes_all_forward(boxes_tensor, pts_tensor,\n                              box_idx_of_points_tensor);\n}\n\nPARROTS_EXTENSION_REGISTER(points_in_boxes_part_forward)\n    .input(2)\n    .output(1)\n    .apply(points_in_boxes_part_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(points_in_boxes_all_forward)\n    .input(2)\n    .output(1)\n    .apply(points_in_boxes_all_forward_cuda_parrots)\n    .done();\n#endif\n\nvoid points_in_boxes_forward_cpu_parrots(HostContext& ctx,\n                                         const SSElement& attr,\n                                         const OperatorBase::in_list_t& ins,\n                                         OperatorBase::out_list_t& outs) {\n  auto boxes_tensor = buildATensor(ctx, ins[0]);\n  auto pts_tensor = buildATensor(ctx, ins[1]);\n\n  auto pts_indices_tensor = buildATensor(ctx, outs[0]);\n\n  points_in_boxes_cpu_forward(boxes_tensor, 
pts_tensor, pts_indices_tensor);\n}\n\nPARROTS_EXTENSION_REGISTER(points_in_boxes_cpu_forward)\n    .input(2)\n    .output(1)\n    .apply(points_in_boxes_forward_cpu_parrots)\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/points_in_boxes_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef POINTS_IN_BOXES_PYTORCH_H\n#define POINTS_IN_BOXES_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                  Tensor box_idx_of_points_tensor);\n\nvoid points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor box_idx_of_points_tensor);\n\nvoid points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor pts_indices_tensor);\n\n#endif  // POINTS_IN_BOXES_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/points_in_polygons.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols) {\n  DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,\n                       output, rows, cols);\n}\n\nvoid points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {\n  int rows = points.size(0);\n  int cols = polygons.size(0);\n  points_in_polygons_forward_impl(points, polygons, output, rows, cols);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/points_in_polygons_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"points_in_polygons_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid points_in_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                     const OperatorBase::in_list_t& ins,\n                                     OperatorBase::out_list_t& outs) {\n  auto points = buildATensor(ctx, ins[0]);\n  auto polygons = buildATensor(ctx, ins[1]);\n\n  auto output = buildATensor(ctx, outs[0]);\n\n  points_in_polygons_forward(points, polygons, output);\n}\n\nPARROTS_EXTENSION_REGISTER(points_in_polygons_forward)\n    .input(2)\n    .output(1)\n    .apply(points_in_polygons_cuda_parrots)\n    .done();\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/points_in_polygons_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef POINTS_IN_POLYGONS_PYTORCH_H\n#define POINTS_IN_POLYGONS_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output);\n\n#endif  // POINTS_IN_POLYGONS_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/prroi_pool.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,\n                       pooled_height, pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_backward_impl(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,\n                       pooled_height, pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,\n                       input, rois, grad_rois, pooled_height, pooled_width,\n                       spatial_scale);\n}\n\nvoid prroi_pool_forward(Tensor input, Tensor rois, Tensor output,\n                        int pooled_height, int pooled_width,\n                        float spatial_scale) {\n  prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,\n                          spatial_scale);\n}\n\nvoid prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,\n                         int pooled_height, int pooled_width,\n                         float spatial_scale) {\n  prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,\n                           pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_coor_backward(Tensor output, 
Tensor grad_output, Tensor input,\n                              Tensor rois, Tensor grad_rois, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,\n                                pooled_height, pooled_width, spatial_scale);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/prroi_pool_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"prroi_pool_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid prroi_pool_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                     const OperatorBase::in_list_t& ins,\n                                     OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  prroi_pool_forward(input, rois, output, pooled_height, pooled_width,\n                     spatial_scale);\n}\n\nvoid prroi_pool_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                      const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  prroi_pool_backward(grad_output, rois, grad_input, pooled_height,\n                      pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_coor_backward_cuda_parrots(CudaContext& ctx,\n                                           const SSElement& attr,\n                                           const OperatorBase::in_list_t& ins,\n                   
                        OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .done();\n\n  const auto& output = buildATensor(ctx, ins[0]);\n  const auto& grad_output = buildATensor(ctx, ins[1]);\n  const auto& input = buildATensor(ctx, ins[2]);\n  const auto& rois = buildATensor(ctx, ins[3]);\n  auto grad_rois = buildATensor(ctx, outs[0]);\n  prroi_pool_coor_backward(output, grad_output, input, rois, grad_rois,\n                           pooled_height, pooled_width, spatial_scale);\n}\n\nPARROTS_EXTENSION_REGISTER(prroi_pool_forward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .input(2)\n    .output(1)\n    .apply(prroi_pool_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(prroi_pool_backward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .input(2)\n    .output(1)\n    .apply(prroi_pool_backward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(prroi_pool_coor_backward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .input(4)\n    .output(1)\n    .apply(prroi_pool_coor_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/prroi_pool_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef PRROI_POOL_PYTORCH_H\n#define PRROI_POOL_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid prroi_pool_forward(Tensor input, Tensor rois, Tensor output,\n                        int pooled_height, int pooled_width,\n                        float spatial_scale);\n\nvoid prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,\n                         int pooled_height, int pooled_width,\n                         float spatial_scale);\n\nvoid prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,\n                              Tensor rois, Tensor grad_rois, int pooled_height,\n                              int pooled_width, float spatial_scale);\n\n#endif  // PRROI_POOL_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/psamask.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/hszhao/semseg/blob/master/lib/psa/src\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask) {\n  DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,\n                       h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                       half_w_mask);\n}\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask) {\n  DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,\n                       num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                       half_w_mask);\n}\n\nvoid psamask_forward(const Tensor input, Tensor output, const int psa_type,\n                     const int num_, const int h_feature, const int w_feature,\n                     const int h_mask, const int w_mask, const int half_h_mask,\n                     const int half_w_mask) {\n  psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,\n                       h_mask, w_mask, half_h_mask, half_w_mask);\n}\n\nvoid psamask_backward(Tensor grad_output, const Tensor grad_input,\n                      const int psa_type, const int num_, const int h_feature,\n                      const int w_feature, const int h_mask, const int w_mask,\n                      const int 
half_h_mask, const int half_w_mask) {\n  psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,\n                        w_feature, h_mask, w_mask, half_h_mask, half_w_mask);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/psamask_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"psamask_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid psamask_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,\n                                  const OperatorBase::in_list_t &ins,\n                                  OperatorBase::out_list_t &outs) {\n  int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n      half_w_mask;\n  SSAttrs(attr)\n      .get<int>(\"psa_type\", psa_type)\n      .get<int>(\"num_\", num_)\n      .get<int>(\"h_feature\", h_feature)\n      .get<int>(\"w_feature\", w_feature)\n      .get<int>(\"h_mask\", h_mask)\n      .get<int>(\"w_mask\", w_mask)\n      .get<int>(\"half_h_mask\", half_h_mask)\n      .get<int>(\"half_w_mask\", half_w_mask)\n      .done();\n  const auto &input = buildATensor(ctx, ins[0]);\n  auto output = buildATensor(ctx, outs[0]);\n  psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,\n                       h_mask, w_mask, half_h_mask, half_w_mask);\n}\n\nvoid psamask_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,\n                                   const OperatorBase::in_list_t &ins,\n                                   OperatorBase::out_list_t &outs) {\n  int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n      half_w_mask;\n  SSAttrs(attr)\n      .get<int>(\"psa_type\", psa_type)\n      .get<int>(\"num_\", num_)\n      .get<int>(\"h_feature\", h_feature)\n      .get<int>(\"w_feature\", w_feature)\n      .get<int>(\"h_mask\", h_mask)\n      .get<int>(\"w_mask\", w_mask)\n      .get<int>(\"half_h_mask\", half_h_mask)\n      .get<int>(\"half_w_mask\", half_w_mask)\n      .done();\n\n  const auto &grad_output = buildATensor(ctx, ins[0]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  psamask_backward_cuda(psa_type, 
grad_output, grad_input, num_, h_feature,\n                        w_feature, h_mask, w_mask, half_h_mask, half_w_mask);\n}\n#endif\n\nvoid psamask_forward_cpu_parrots(HostContext &ctx, const SSElement &attr,\n                                 const OperatorBase::in_list_t &ins,\n                                 OperatorBase::out_list_t &outs) {\n  int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n      half_w_mask;\n  SSAttrs(attr)\n      .get<int>(\"psa_type\", psa_type)\n      .get<int>(\"num_\", num_)\n      .get<int>(\"h_feature\", h_feature)\n      .get<int>(\"w_feature\", w_feature)\n      .get<int>(\"h_mask\", h_mask)\n      .get<int>(\"w_mask\", w_mask)\n      .get<int>(\"half_h_mask\", half_h_mask)\n      .get<int>(\"half_w_mask\", half_w_mask)\n      .done();\n  const auto &input = buildATensor(ctx, ins[0]);\n  auto output = buildATensor(ctx, outs[0]);\n  psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,\n                      h_mask, w_mask, half_h_mask, half_w_mask);\n}\n\nvoid psamask_backward_cpu_parrots(HostContext &ctx, const SSElement &attr,\n                                  const OperatorBase::in_list_t &ins,\n                                  OperatorBase::out_list_t &outs) {\n  int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n      half_w_mask;\n  SSAttrs(attr)\n      .get<int>(\"psa_type\", psa_type)\n      .get<int>(\"num_\", num_)\n      .get<int>(\"h_feature\", h_feature)\n      .get<int>(\"w_feature\", w_feature)\n      .get<int>(\"h_mask\", h_mask)\n      .get<int>(\"w_mask\", w_mask)\n      .get<int>(\"half_h_mask\", half_h_mask)\n      .get<int>(\"half_w_mask\", half_w_mask)\n      .done();\n\n  const auto &grad_output = buildATensor(ctx, ins[0]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,\n                       w_feature, h_mask, w_mask, half_h_mask, 
half_w_mask);\n}\n\nPARROTS_EXTENSION_REGISTER(psamask_forward)\n    .attr(\"psa_type\")\n    .attr(\"num_\")\n    .attr(\"h_feature\")\n    .attr(\"w_feature\")\n    .attr(\"h_mask\")\n    .attr(\"w_mask\")\n    .attr(\"half_h_mask\")\n    .attr(\"half_w_mask\")\n    .input(1)\n    .output(1)\n    .apply(psamask_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(psamask_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(psamask_backward)\n    .attr(\"psa_type\")\n    .attr(\"num_\")\n    .attr(\"h_feature\")\n    .attr(\"w_feature\")\n    .attr(\"h_mask\")\n    .attr(\"w_mask\")\n    .attr(\"half_h_mask\")\n    .attr(\"half_w_mask\")\n    .input(1)\n    .output(1)\n    .apply(psamask_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(psamask_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/psamask_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef PSAMASK_PYTORCH_H\n#define PSAMASK_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\n#ifdef MMCV_WITH_CUDA\nvoid psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_cuda(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask);\n#endif\nvoid psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,\n                         const int num_, const int h_feature,\n                         const int w_feature, const int h_mask,\n                         const int w_mask, const int half_h_mask,\n                         const int half_w_mask);\n\nvoid psamask_backward_cpu(const int psa_type, const Tensor grad_output,\n                          Tensor grad_input, const int num_,\n                          const int h_feature, const int w_feature,\n                          const int h_mask, const int w_mask,\n                          const int half_h_mask, const int half_w_mask);\n#endif  // PSAMASK_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/riroi_align_rotated.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid riroi_align_rotated_forward_impl(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int num_orientations,\n                                      bool clockwise) {\n  DISPATCH_DEVICE_IMPL(riroi_align_rotated_forward_impl, features, rois, output,\n                       pooled_height, pooled_width, spatial_scale, num_samples,\n                       num_orientations, clockwise);\n}\n\nvoid riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise) {\n  DISPATCH_DEVICE_IMPL(riroi_align_rotated_backward_impl, top_grad, rois,\n                       bottom_grad, pooled_height, pooled_width, spatial_scale,\n                       num_samples, num_orientations, clockwise);\n}\n\nvoid riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,\n                                 int pooled_height, int pooled_width,\n                                 float spatial_scale, int num_samples,\n                                 int num_orientations, bool clockwise) {\n  riroi_align_rotated_forward_impl(features, rois, output, pooled_height,\n                                   pooled_width, spatial_scale, num_samples,\n                                   num_orientations, clockwise);\n}\n\nvoid riroi_align_rotated_backward(Tensor top_grad, Tensor rois,\n                                  Tensor bottom_grad, int pooled_height,\n                         
         int pooled_width, float spatial_scale,\n                                  int num_samples, int num_orientations,\n                                  bool clockwise) {\n  riroi_align_rotated_backward_impl(top_grad, rois, bottom_grad, pooled_height,\n                                    pooled_width, spatial_scale, num_samples,\n                                    num_orientations, clockwise);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/riroi_align_rotated_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"riroi_align_rotated_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid riroi_align_rotated_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sample_num;\n  int num_orientations;\n  bool clockwise;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"num_samples\", sample_num)\n      .get<int>(\"num_orientations\", num_orientations)\n      .get<bool>(\"clockwise\", clockwise)\n      .done();\n\n  auto input = buildATensor(ctx, ins[0]);\n  auto rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  riroi_align_rotated_forward(input, rois, output, pooled_height, pooled_width,\n                              spatial_scale, sample_num, num_orientations,\n                              clockwise);\n}\n\nvoid riroi_align_rotated_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sample_num;\n  int num_orientations;\n  bool clockwise;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"num_samples\", sample_num)\n      .get<int>(\"num_orientations\", num_orientations)\n      .get<bool>(\"clockwise\", clockwise)\n      .done();\n\n  auto grad_output = buildATensor(ctx, ins[0]);\n  auto rois = buildATensor(ctx, ins[1]);\n  auto grad_input = 
buildATensor(ctx, outs[0]);\n  riroi_align_rotated_backward(grad_output, rois, grad_input, pooled_height,\n                               pooled_width, spatial_scale, sample_num,\n                               num_orientations, clockwise);\n}\n\nPARROTS_EXTENSION_REGISTER(riroi_align_rotated_forward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"num_samples\")\n    .attr(\"num_orientations\")\n    .attr(\"clockwise\")\n    .input(2)\n    .output(1)\n    .apply(riroi_align_rotated_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(riroi_align_rotated_backward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"num_samples\")\n    .attr(\"num_orientations\")\n    .attr(\"clockwise\")\n    .input(2)\n    .output(1)\n    .apply(riroi_align_rotated_backward_cuda_parrots)\n    .done();\n\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/riroi_align_rotated_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef RIROI_ALIGN_ROTATED_PYTORCH_H\n#define RIROI_ALIGN_ROTATED_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,\n                                 int pooled_height, int pooled_width,\n                                 float spatial_scale, int num_samples,\n                                 int num_orientations, bool clockwise);\n\nvoid riroi_align_rotated_backward(Tensor top_grad, Tensor rois,\n                                  Tensor bottom_grad, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int num_samples, int num_orientations,\n                                  bool clockwise);\n\n#endif  // RIROI_ALIGN_ROTATED_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,\n                       argmax_x, aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned) {\n  DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,\n                       argmax_x, grad_input, aligned_height, aligned_width,\n                       spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_forward(Tensor input, Tensor rois, Tensor output,\n                       Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                       int aligned_width, float spatial_scale,\n                       int sampling_ratio, int pool_mode, bool aligned) {\n  roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,\n                         aligned_height, aligned_width, spatial_scale,\n                         sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                        Tensor argmax_x, Tensor grad_input, int aligned_height,\n                        int aligned_width, float spatial_scale,\n   
                     int sampling_ratio, int pool_mode, bool aligned) {\n  roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,\n                          aligned_height, aligned_width, spatial_scale,\n                          sampling_ratio, pool_mode, aligned);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_align_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"roi_align_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid roi_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                    const OperatorBase::in_list_t& ins,\n                                    OperatorBase::out_list_t& outs) {\n  int aligned_height;\n  int aligned_width;\n  float spatial_scale;\n  int sampling_ratio;\n  int pool_mode;\n  bool aligned;\n  SSAttrs(attr)\n      .get<int>(\"aligned_height\", aligned_height)\n      .get<int>(\"aligned_width\", aligned_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<int>(\"pool_mode\", pool_mode)\n      .get<bool>(\"aligned\", aligned)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  auto argmax_y = buildATensor(ctx, outs[1]);\n  auto argmax_x = buildATensor(ctx, outs[2]);\n  roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,\n                         aligned_height, aligned_width, spatial_scale,\n                         sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                     const OperatorBase::in_list_t& ins,\n                                     OperatorBase::out_list_t& outs) {\n  int aligned_height;\n  int aligned_width;\n  float spatial_scale;\n  int sampling_ratio;\n  int pool_mode;\n  bool aligned;\n  SSAttrs(attr)\n      .get<int>(\"aligned_height\", aligned_height)\n      .get<int>(\"aligned_width\", aligned_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<int>(\"pool_mode\", 
pool_mode)\n      .get<bool>(\"aligned\", aligned)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  const auto& argmax_y = buildATensor(ctx, ins[2]);\n  const auto& argmax_x = buildATensor(ctx, ins[3]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,\n                          aligned_height, aligned_width, spatial_scale,\n                          sampling_ratio, pool_mode, aligned);\n}\n#endif\n\nvoid roi_align_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,\n                                   const OperatorBase::in_list_t& ins,\n                                   OperatorBase::out_list_t& outs) {\n  int aligned_height;\n  int aligned_width;\n  float spatial_scale;\n  int sampling_ratio;\n  int pool_mode;\n  bool aligned;\n  SSAttrs(attr)\n      .get<int>(\"aligned_height\", aligned_height)\n      .get<int>(\"aligned_width\", aligned_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<int>(\"pool_mode\", pool_mode)\n      .get<bool>(\"aligned\", aligned)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  auto argmax_y = buildATensor(ctx, outs[1]);\n  auto argmax_x = buildATensor(ctx, outs[2]);\n  roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x, aligned_height,\n                        aligned_width, spatial_scale, sampling_ratio, pool_mode,\n                        aligned);\n}\n\nvoid roi_align_backward_cpu_parrots(HostContext& ctx, const SSElement& attr,\n                                    const OperatorBase::in_list_t& ins,\n                                    OperatorBase::out_list_t& outs) {\n  int aligned_height;\n  int aligned_width;\n  float spatial_scale;\n  int sampling_ratio;\n  int 
pool_mode;\n  bool aligned;\n  SSAttrs(attr)\n      .get<int>(\"aligned_height\", aligned_height)\n      .get<int>(\"aligned_width\", aligned_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<int>(\"pool_mode\", pool_mode)\n      .get<bool>(\"aligned\", aligned)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  const auto& argmax_y = buildATensor(ctx, ins[2]);\n  const auto& argmax_x = buildATensor(ctx, ins[3]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,\n                         aligned_height, aligned_width, spatial_scale,\n                         sampling_ratio, pool_mode, aligned);\n}\n\nPARROTS_EXTENSION_REGISTER(roi_align_forward)\n    .attr(\"aligned_height\")\n    .attr(\"aligned_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"sampling_ratio\")\n    .attr(\"pool_mode\")\n    .attr(\"aligned\")\n    .input(2)\n    .output(3)\n    .apply(roi_align_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(roi_align_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(roi_align_backward)\n    .attr(\"aligned_height\")\n    .attr(\"aligned_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"sampling_ratio\")\n    .attr(\"pool_mode\")\n    .attr(\"aligned\")\n    .input(4)\n    .output(1)\n    .apply(roi_align_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(roi_align_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_align_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_ALIGN_PYTORCH_H\n#define ROI_ALIGN_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\n#ifdef MMCV_WITH_CUDA\nvoid roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n#endif\n\nvoid roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                           int aligned_width, float spatial_scale,\n                           int sampling_ratio, int pool_mode, bool aligned);\n\nvoid roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                            Tensor argmax_x, Tensor grad_input,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\n#endif  // ROI_ALIGN_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_align_rotated.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sample_ratio,\n                                    bool aligned, bool clockwise) {\n  DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,\n                       aligned_height, aligned_width, spatial_scale,\n                       sample_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sample_ratio, bool aligned,\n                                     bool clockwise) {\n  DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,\n                       bottom_grad, aligned_height, aligned_width,\n                       spatial_scale, sample_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,\n                               int aligned_height, int aligned_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned, bool clockwise) {\n  roi_align_rotated_forward_impl(input, rois, output, aligned_height,\n                                 aligned_width, spatial_scale, sampling_ratio,\n                                 aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward(Tensor top_grad, Tensor rois,\n                                Tensor bottom_grad, int aligned_height,\n                                int aligned_width, float spatial_scale,\n                                int sampling_ratio, bool 
aligned,\n                                bool clockwise) {\n  roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,\n                                  aligned_width, spatial_scale, sampling_ratio,\n                                  aligned, clockwise);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_align_rotated_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"roi_align_rotated_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid roi_align_rotated_forward_cuda_parrots(CudaContext& ctx,\n                                            const SSElement& attr,\n                                            const OperatorBase::in_list_t& ins,\n                                            OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sampling_ratio;\n  bool aligned;\n  bool clockwise;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<bool>(\"aligned\", aligned)\n      .get<bool>(\"clockwise\", clockwise)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  roi_align_rotated_forward_cuda(input, rois, output, pooled_height,\n                                 pooled_width, spatial_scale, sampling_ratio,\n                                 aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sampling_ratio;\n  bool aligned;\n  bool clockwise;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", 
sampling_ratio)\n      .get<bool>(\"aligned\", aligned)\n      .get<bool>(\"clockwise\", clockwise)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  roi_align_rotated_backward_cuda(grad_output, rois, grad_input, pooled_height,\n                                  pooled_width, spatial_scale, sampling_ratio,\n                                  aligned, clockwise);\n}\n#endif\n\nvoid roi_align_rotated_forward_cpu_parrots(HostContext& ctx,\n                                           const SSElement& attr,\n                                           const OperatorBase::in_list_t& ins,\n                                           OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sampling_ratio;\n  bool aligned;\n  bool clockwise;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<bool>(\"aligned\", aligned)\n      .get<bool>(\"clockwise\", clockwise)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  roi_align_rotated_forward_cpu(input, rois, output, pooled_height,\n                                pooled_width, spatial_scale, sampling_ratio,\n                                aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward_cpu_parrots(HostContext& ctx,\n                                            const SSElement& attr,\n                                            const OperatorBase::in_list_t& ins,\n                                            OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  int sampling_ratio;\n  bool aligned;\n  bool 
clockwise;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"sampling_ratio\", sampling_ratio)\n      .get<bool>(\"aligned\", aligned)\n      .get<bool>(\"clockwise\", clockwise)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  roi_align_rotated_backward_cpu(grad_output, rois, grad_input, pooled_height,\n                                 pooled_width, spatial_scale, sampling_ratio,\n                                 aligned, clockwise);\n}\n\nPARROTS_EXTENSION_REGISTER(roi_align_rotated_forward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"sampling_ratio\")\n    .attr(\"aligned\")\n    .attr(\"clockwise\")\n    .input(2)\n    .output(1)\n    .apply(roi_align_rotated_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(roi_align_rotated_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(roi_align_rotated_backward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .attr(\"sampling_ratio\")\n    .attr(\"aligned\")\n    .attr(\"clockwise\")\n    .input(2)\n    .output(1)\n    .apply(roi_align_rotated_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(roi_align_rotated_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_align_rotated_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_ALIGN_ROTATED_PYTORCH_H\n#define ROI_ALIGN_ROTATED_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\n#ifdef MMCV_WITH_CUDA\nvoid roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                                    int pooled_height, int pooled_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_cuda(Tensor grad_output, Tensor rois,\n                                     Tensor bottom_grad, int pooled_height,\n                                     int pooled_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise);\n#endif\n\nvoid roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_cpu(Tensor grad_output, Tensor rois,\n                                    Tensor bottom_grad, int pooled_height,\n                                    int pooled_width, float spatial_scale,\n                                    int sampling_ratio, bool aligned,\n                                    bool clockwise);\n\n#endif  // ROI_ALIGN_ROTATED_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_pool.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,\n                       pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,\n                       grad_input, pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,\n                      int pooled_height, int pooled_width,\n                      float spatial_scale) {\n  roi_pool_forward_impl(input, rois, output, argmax, pooled_height,\n                        pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,\n                       Tensor grad_input, int pooled_height, int pooled_width,\n                       float spatial_scale) {\n  roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,\n                         pooled_width, spatial_scale);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_pool_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"roi_pool_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid roi_pool_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                   const OperatorBase::in_list_t& ins,\n                                   OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  auto argmax = buildATensor(ctx, outs[1]);\n  roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,\n                        pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                    const OperatorBase::in_list_t& ins,\n                                    OperatorBase::out_list_t& outs) {\n  int pooled_height;\n  int pooled_width;\n  float spatial_scale;\n  SSAttrs(attr)\n      .get<int>(\"pooled_height\", pooled_height)\n      .get<int>(\"pooled_width\", pooled_width)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .done();\n\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& rois = buildATensor(ctx, ins[1]);\n  const auto& argmax = buildATensor(ctx, ins[2]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,\n                         pooled_width, spatial_scale);\n}\n\nPARROTS_EXTENSION_REGISTER(roi_pool_forward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    
.attr(\"spatial_scale\")\n    .input(2)\n    .output(2)\n    .apply(roi_pool_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(roi_pool_backward)\n    .attr(\"pooled_height\")\n    .attr(\"pooled_width\")\n    .attr(\"spatial_scale\")\n    .input(3)\n    .output(1)\n    .apply(roi_pool_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roi_pool_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROI_POOL_PYTORCH_H\n#define ROI_POOL_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\n#ifdef MMCV_WITH_CUDA\nvoid roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale);\n\nvoid roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale);\n#endif\n#endif  // ROI_POOL_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roiaware_pool3d.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method) {\n  DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,\n                       channels, max_pts_each_voxel, out_x, out_y, out_z, rois,\n                       pts, pts_feature, argmax, pts_idx_of_voxels,\n                       pooled_features, pool_method);\n}\n\nvoid roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method) {\n  DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,\n                       out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,\n                       argmax, grad_out, grad_in, pool_method);\n}\n\nvoid roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,\n                             Tensor argmax, Tensor pts_idx_of_voxels,\n                             Tensor pooled_features, int pool_method) {\n  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR\n  // coordinate\n  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate\n  // params pts_feature: (npoints, C)\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, 
max_pts_each_voxel)\n  // params pooled_features: (N, out_x, out_y, out_z, C)\n  // params pool_method: 0: max_pool 1: avg_pool\n  int boxes_num = rois.size(0);\n  int pts_num = pts.size(0);\n  int channels = pts_feature.size(1);\n  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter\n  int out_x = pts_idx_of_voxels.size(1);\n  int out_y = pts_idx_of_voxels.size(2);\n  int out_z = pts_idx_of_voxels.size(3);\n  assert((out_x < 256) && (out_y < 256) &&\n         (out_z < 256));  // we encode index with 8bit\n\n  roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,\n                               out_x, out_y, out_z, rois, pts, pts_feature,\n                               argmax, pts_idx_of_voxels, pooled_features,\n                               pool_method);\n}\n\nvoid roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,\n                              Tensor grad_out, Tensor grad_in,\n                              int pool_method) {\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n  // params pool_method: 0: max_pool 1: avg_pool\n  int boxes_num = pts_idx_of_voxels.size(0);\n  int out_x = pts_idx_of_voxels.size(1);\n  int out_y = pts_idx_of_voxels.size(2);\n  int out_z = pts_idx_of_voxels.size(3);\n  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter\n  int channels = grad_out.size(4);\n\n  roiaware_pool3d_backward_impl(boxes_num, out_x, out_y, out_z, channels,\n                                max_pts_each_voxel, pts_idx_of_voxels, argmax,\n                                grad_out, grad_in, pool_method);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roiaware_pool3d_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"roiaware_pool3d_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid roiaware_pool3d_forward_cuda_parrots(CudaContext& ctx,\n                                          const SSElement& attr,\n                                          const OperatorBase::in_list_t& ins,\n                                          OperatorBase::out_list_t& outs) {\n  int pool_method;\n  SSAttrs(attr).get<int>(\"pool_method\", pool_method).done();\n  auto rois = buildATensor(ctx, ins[0]);\n  auto pts = buildATensor(ctx, ins[1]);\n  auto pts_feature = buildATensor(ctx, ins[2]);\n\n  auto argmax = buildATensor(ctx, outs[0]);\n  auto pts_idx_of_voxels = buildATensor(ctx, outs[1]);\n  auto pooled_features = buildATensor(ctx, outs[2]);\n\n  roiaware_pool3d_forward(rois, pts, pts_feature, argmax, pts_idx_of_voxels,\n                          pooled_features, pool_method);\n}\n\nvoid roiaware_pool3d_backward_cuda_parrots(CudaContext& ctx,\n                                           const SSElement& attr,\n                                           const OperatorBase::in_list_t& ins,\n                                           OperatorBase::out_list_t& outs) {\n  int pool_method;\n  SSAttrs(attr).get<int>(\"pool_method\", pool_method).done();\n  auto pts_idx_of_voxels = buildATensor(ctx, ins[0]);\n  auto argmax = buildATensor(ctx, ins[1]);\n  auto grad_out = buildATensor(ctx, ins[2]);\n\n  auto grad_in = buildATensor(ctx, outs[0]);\n\n  roiaware_pool3d_backward(pts_idx_of_voxels, argmax, grad_out, grad_in,\n                           pool_method);\n}\n\nPARROTS_EXTENSION_REGISTER(roiaware_pool3d_forward)\n    .attr(\"pool_method\")\n    .input(3)\n    .output(3)\n    .apply(roiaware_pool3d_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(roiaware_pool3d_backward)\n    
.attr(\"pool_method\")\n    .input(3)\n    .output(1)\n    .apply(roiaware_pool3d_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roiaware_pool3d_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROIAWARE_POOL3D_PYTORCH_H\n#define ROIAWARE_POOL3D_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,\n                             Tensor argmax, Tensor pts_idx_of_voxels,\n                             Tensor pooled_features, int pool_method);\n\nvoid roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,\n                              Tensor grad_out, Tensor grad_in, int pool_method);\n\n#endif  // ROIAWARE_POOL3D_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roipoint_pool3d.cpp",
    "content": "/*\nModified from\nhttps://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp\nPoint cloud feature pooling\nWritten by Shaoshuai Shi\nAll Rights Reserved 2018.\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag) {\n  DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,\n                       boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,\n                       pts_feature, pooled_features, pooled_empty_flag);\n}\n\nvoid roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,\n                             Tensor pooled_features, Tensor pooled_empty_flag) {\n  // params xyz: (B, N, 3)\n  // params boxes3d: (B, M, 7)\n  // params pts_feature: (B, N, C)\n  // params pooled_features: (B, M, 512, 3+C)\n  // params pooled_empty_flag: (B, M)\n  int batch_size = xyz.size(0);\n  int pts_num = xyz.size(1);\n  int boxes_num = boxes3d.size(1);\n  int feature_in_len = pts_feature.size(2);\n  int sampled_pts_num = pooled_features.size(2);\n\n  roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len,\n                               sampled_pts_num, xyz, boxes3d, pts_feature,\n                               pooled_features, pooled_empty_flag);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roipoint_pool3d_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"roipoint_pool3d_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid roipoint_pool3d_forward_cuda_parrots(CudaContext& ctx,\n                                          const SSElement& attr,\n                                          const OperatorBase::in_list_t& ins,\n                                          OperatorBase::out_list_t& outs) {\n  auto xyz = buildATensor(ctx, ins[0]);\n  auto boxes3d = buildATensor(ctx, ins[1]);\n  auto pts_feature = buildATensor(ctx, ins[2]);\n\n  auto pooled_features = buildATensor(ctx, outs[0]);\n  auto pooled_empty_flag = buildATensor(ctx, outs[1]);\n\n  roipoint_pool3d_forward(xyz, boxes3d, pts_feature, pooled_features,\n                          pooled_empty_flag);\n}\n\nPARROTS_EXTENSION_REGISTER(roipoint_pool3d_forward)\n    .input(3)\n    .output(2)\n    .apply(roipoint_pool3d_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/roipoint_pool3d_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROIPOINT_POOL3D_PYTORCH_H\n#define ROIPOINT_POOL3D_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,\n                             Tensor pooled_features, Tensor pooled_empty_flag);\n\n#endif  // ROIPOINT_POOL3D_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/rotated_feature_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_cuda.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output) {\n  DISPATCH_DEVICE_IMPL(rotated_feature_align_forward_impl, features,\n                       best_bboxes, spatial_scale, points, output);\n}\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad) {\n  DISPATCH_DEVICE_IMPL(rotated_feature_align_backward_impl, top_grad,\n                       best_bboxes, spatial_scale, points, bottom_grad);\n}\n\nvoid rotated_feature_align_forward(const Tensor features,\n                                   const Tensor best_bboxes, Tensor output,\n                                   const float spatial_scale,\n                                   const int points) {\n  rotated_feature_align_forward_impl(features, best_bboxes, spatial_scale,\n                                     points, output);\n}\n\nvoid rotated_feature_align_backward(const Tensor top_grad,\n                                    const Tensor best_bboxes,\n                                    Tensor bottom_grad,\n                                    const float spatial_scale,\n                                    const int points) {\n  rotated_feature_align_backward_impl(top_grad, best_bboxes, spatial_scale,\n                                      points, bottom_grad);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/rotated_feature_align_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"rotated_feature_align_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid rotated_feature_align_forward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  float spatial_scale;\n  int points;\n  SSAttrs(attr)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"points\", points)\n      .done();\n\n  auto features = buildATensor(ctx, ins[0]);\n  auto best_bboxes = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  rotated_feature_align_forward(features, best_bboxes, output, spatial_scale,\n                                points);\n}\n\nvoid rotated_feature_align_backward_cuda_parrots(\n    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  float spatial_scale;\n  int points;\n  SSAttrs(attr)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"points\", points)\n      .done();\n\n  auto grad_output = buildATensor(ctx, ins[0]);\n  auto best_bboxes = buildATensor(ctx, ins[1]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  rotated_feature_align_backward(grad_output, best_bboxes, grad_input,\n                                 spatial_scale, points);\n}\n#endif\n\nvoid rotated_feature_align_forward_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  float spatial_scale;\n  int points;\n  SSAttrs(attr)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"points\", points)\n      .done();\n\n  auto features = buildATensor(ctx, ins[0]);\n  auto best_bboxes = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  
rotated_feature_align_forward(features, best_bboxes, output, spatial_scale,\n                                points);\n}\n\nvoid rotated_feature_align_backward_cpu_parrots(\n    HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,\n    OperatorBase::out_list_t& outs) {\n  float spatial_scale;\n  int points;\n  SSAttrs(attr)\n      .get<float>(\"spatial_scale\", spatial_scale)\n      .get<int>(\"points\", points)\n      .done();\n\n  auto grad_output = buildATensor(ctx, ins[0]);\n  auto best_bboxes = buildATensor(ctx, ins[1]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  rotated_feature_align_backward(grad_output, best_bboxes, grad_input,\n                                 spatial_scale, points);\n}\n\nPARROTS_EXTENSION_REGISTER(rotated_feature_align_forward)\n    .attr(\"spatial_scale\")\n    .attr(\"points\")\n    .input(2)\n    .output(1)\n    .apply(rotated_feature_align_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(rotated_feature_align_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(rotated_feature_align_backward)\n    .attr(\"spatial_scale\")\n    .attr(\"points\")\n    .input(2)\n    .output(1)\n    .apply(rotated_feature_align_backward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(rotated_feature_align_backward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/rotated_feature_align_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef ROTATED_FEATURE_ALIGN_PYTORCH_H\n#define ROTATED_FEATURE_ALIGN_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid rotated_feature_align_forward(const Tensor features,\n                                   const Tensor best_bboxes, Tensor output,\n                                   const float spatial_scale, const int points);\n\nvoid rotated_feature_align_backward(const Tensor top_grad,\n                                    const Tensor best_bboxes,\n                                    Tensor bottom_grad,\n                                    const float spatial_scale,\n                                    const int points);\n\n#endif  // ROTATED_FEATURE_ALIGN_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/sync_bn.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {\n  DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);\n}\n\nvoid sync_bn_forward_var_impl(const Tensor input, const Tensor mean,\n                              Tensor var) {\n  DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);\n}\n\nvoid sync_bn_forward_output_impl(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size) {\n  DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,\n                       running_mean, running_var, weight, bias, norm, std,\n                       output, eps, momentum, group_size);\n}\n\nvoid sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias) {\n  DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,\n                       grad_weight, grad_bias);\n}\n\nvoid sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input) {\n  DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,\n                       grad_weight, grad_bias, norm, std, grad_input);\n}\n\nvoid sync_bn_forward_mean(const Tensor input, Tensor mean) {\n  sync_bn_forward_mean_impl(input, mean);\n}\n\nvoid sync_bn_forward_var(const Tensor input, const Tensor 
mean, Tensor var) {\n  sync_bn_forward_var_impl(input, mean, var);\n}\n\nvoid sync_bn_forward_output(const Tensor input, const Tensor mean,\n                            const Tensor var, const Tensor weight,\n                            const Tensor bias, Tensor running_mean,\n                            Tensor running_var, Tensor norm, Tensor std,\n                            Tensor output, float eps, float momentum,\n                            int group_size) {\n  sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,\n                              weight, bias, norm, std, output, eps, momentum,\n                              group_size);\n}\n\nvoid sync_bn_backward_param(const Tensor grad_output, const Tensor norm,\n                            Tensor grad_weight, Tensor grad_bias) {\n  sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);\n}\n\nvoid sync_bn_backward_data(const Tensor grad_output, const Tensor weight,\n                           const Tensor grad_weight, const Tensor grad_bias,\n                           const Tensor norm, const Tensor std,\n                           Tensor grad_input) {\n  sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,\n                             std, grad_input);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/sync_bn_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"sync_bn_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid sync_bn_forward_mean_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                       const OperatorBase::in_list_t& ins,\n                                       OperatorBase::out_list_t& outs) {\n  const auto& input = buildATensor(ctx, ins[0]);\n  auto mean = buildATensor(ctx, outs[0]);\n  sync_bn_forward_mean_cuda(input, mean);\n}\n\nvoid sync_bn_forward_var_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                      const OperatorBase::in_list_t& ins,\n                                      OperatorBase::out_list_t& outs) {\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& mean = buildATensor(ctx, ins[1]);\n  auto var = buildATensor(ctx, outs[0]);\n  sync_bn_forward_var_cuda(input, mean, var);\n}\n\nvoid sync_bn_forward_output_cuda_parrots(CudaContext& ctx,\n                                         const SSElement& attr,\n                                         const OperatorBase::in_list_t& ins,\n                                         OperatorBase::out_list_t& outs) {\n  size_t group_size;\n  float eps, momentum;\n  SSAttrs(attr)\n      .get<float>(\"eps\", eps)\n      .get<float>(\"momentum\", momentum)\n      .get<size_t>(\"group_size\", group_size)\n      .done();\n\n  const auto& input = buildATensor(ctx, ins[0]);\n  const auto& mean = buildATensor(ctx, ins[1]);\n  const auto& var = buildATensor(ctx, ins[2]);\n  const auto& weight = buildATensor(ctx, ins[3]);\n  const auto& bias = buildATensor(ctx, ins[4]);\n  auto running_mean = buildATensor(ctx, outs[0]);\n  auto running_var = buildATensor(ctx, outs[1]);\n  auto norm = buildATensor(ctx, outs[2]);\n  auto std = buildATensor(ctx, outs[3]);\n  auto output = 
buildATensor(ctx, outs[4]);\n  sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,\n                              weight, bias, norm, std, output, eps, momentum,\n                              group_size);\n}\n\nvoid sync_bn_backward_param_cuda_parrots(CudaContext& ctx,\n                                         const SSElement& attr,\n                                         const OperatorBase::in_list_t& ins,\n                                         OperatorBase::out_list_t& outs) {\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& norm = buildATensor(ctx, ins[1]);\n  auto grad_weight = buildATensor(ctx, outs[0]);\n  auto grad_bias = buildATensor(ctx, outs[1]);\n  sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);\n}\n\nvoid sync_bn_backward_data_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  const auto& grad_output = buildATensor(ctx, ins[0]);\n  const auto& weight = buildATensor(ctx, ins[1]);\n  const auto& grad_weight = buildATensor(ctx, ins[2]);\n  const auto& grad_bias = buildATensor(ctx, ins[3]);\n  const auto& norm = buildATensor(ctx, ins[4]);\n  const auto& std = buildATensor(ctx, ins[5]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias, norm,\n                             std, grad_input);\n}\n\nPARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)\n    .input(1)\n    .output(1)\n    .apply(sync_bn_forward_mean_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(sync_bn_forward_var)\n    .input(2)\n    .output(1)\n    .apply(sync_bn_forward_var_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(sync_bn_forward_output)\n    .attr(\"eps\")\n    .attr(\"momentum\")\n    .attr(\"group_size\")\n    .input(5)\n    .output(5)\n    
.apply(sync_bn_forward_output_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(sync_bn_backward_param)\n    .input(2)\n    .output(2)\n    .apply(sync_bn_backward_param_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(sync_bn_backward_data)\n    .input(6)\n    .output(1)\n    .apply(sync_bn_backward_data_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/sync_bn_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef SYNC_BN_PYTORCH_H\n#define SYNC_BN_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid sync_bn_forward_mean_cuda(const Tensor input, Tensor mean);\n\nvoid sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,\n                              Tensor var);\n\nvoid sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size);\n\nvoid sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias);\n\nvoid sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input);\n#endif  // SYNC_BN_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/three_interpolate.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid three_interpolate_forward_impl(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out) {\n  DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,\n                       weight, out);\n}\n\nvoid three_interpolate_backward_impl(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points) {\n  DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,\n                       idx, weight, grad_points);\n}\n\nvoid three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,\n                               Tensor weight_tensor, Tensor out_tensor, int b,\n                               int c, int m, int n) {\n  three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,\n                                 weight_tensor, out_tensor);\n}\n\nvoid three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                                Tensor weight_tensor, Tensor grad_points_tensor,\n                                int b, int c, int n, int m) {\n  three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,\n                                  weight_tensor, grad_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/three_interpolate_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"three_interpolate_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid three_interpolate_forward_cuda_parrots(CudaContext& ctx,\n                                            const SSElement& attr,\n                                            const OperatorBase::in_list_t& ins,\n                                            OperatorBase::out_list_t& outs) {\n  int b, c, m, n;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"c\", c)\n      .get<int>(\"m\", m)\n      .get<int>(\"n\", n)\n      .done();\n\n  auto points_tensor = buildATensor(ctx, ins[0]);\n  auto idx_tensor = buildATensor(ctx, ins[1]);\n  auto weight_tensor = buildATensor(ctx, ins[2]);\n\n  auto out_tensor = buildATensor(ctx, outs[0]);\n\n  three_interpolate_forward(points_tensor, idx_tensor, weight_tensor,\n                            out_tensor, b, c, m, n);\n}\n\nvoid three_interpolate_backward_cuda_parrots(CudaContext& ctx,\n                                             const SSElement& attr,\n                                             const OperatorBase::in_list_t& ins,\n                                             OperatorBase::out_list_t& outs) {\n  int b, c, n, m;\n  SSAttrs(attr)\n      .get<int>(\"b\", b)\n      .get<int>(\"c\", c)\n      .get<int>(\"n\", n)\n      .get<int>(\"m\", m)\n      .done();\n\n  auto grad_out_tensor = buildATensor(ctx, ins[0]);\n  auto idx_tensor = buildATensor(ctx, ins[1]);\n  auto weight_tensor = buildATensor(ctx, ins[2]);\n\n  auto grad_points_tensor = buildATensor(ctx, outs[0]);\n\n  three_interpolate_backward(grad_out_tensor, idx_tensor, weight_tensor,\n                             grad_points_tensor, b, c, n, m);\n}\n\nPARROTS_EXTENSION_REGISTER(three_interpolate_forward)\n    .attr(\"b\")\n    .attr(\"c\")\n    .attr(\"m\")\n    
.attr(\"n\")\n    .input(3)\n    .output(1)\n    .apply(three_interpolate_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(three_interpolate_backward)\n    .attr(\"b\")\n    .attr(\"c\")\n    .attr(\"n\")\n    .attr(\"m\")\n    .input(3)\n    .output(1)\n    .apply(three_interpolate_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/three_interpolate_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef THREE_INTERPOLATE_PYTORCH_H\n#define THREE_INTERPOLATE_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,\n                               Tensor weight_tensor, Tensor out_tensor, int b,\n                               int c, int m, int n);\n\nvoid three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                                Tensor weight_tensor, Tensor grad_points_tensor,\n                                int b, int c, int n, int m);\n#endif  // THREE_INTERPOLATE_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/three_nn.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx) {\n  DISPATCH_DEVICE_IMPL(three_nn_forward_impl, b, n, m, unknown, known, dist2,\n                       idx);\n}\n\nvoid three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,\n                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,\n                      int m) {\n  three_nn_forward_impl(b, n, m, unknown_tensor, known_tensor, dist2_tensor,\n                        idx_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/three_nn_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"three_nn_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid three_nn_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                   const OperatorBase::in_list_t& ins,\n                                   OperatorBase::out_list_t& outs) {\n  int b, n, m;\n  SSAttrs(attr).get<int>(\"b\", b).get<int>(\"n\", n).get<int>(\"m\", m).done();\n\n  auto unknown_tensor = buildATensor(ctx, ins[0]);\n  auto known_tensor = buildATensor(ctx, ins[1]);\n\n  auto dist2_tensor = buildATensor(ctx, outs[0]);\n  auto idx_tensor = buildATensor(ctx, outs[1]);\n\n  three_nn_forward(unknown_tensor, known_tensor, dist2_tensor, idx_tensor, b, n,\n                   m);\n}\n\nPARROTS_EXTENSION_REGISTER(three_nn_forward)\n    .attr(\"b\")\n    .attr(\"n\")\n    .attr(\"m\")\n    .input(2)\n    .output(2)\n    .apply(three_nn_forward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/three_nn_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef THREE_NN_PYTORCH_H\n#define THREE_NN_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,\n                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,\n                      int m);\n#endif  // THREE_NN_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/tin_shift.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output) {\n  DISPATCH_DEVICE_IMPL(tin_shift_forward_impl, input, shift, output);\n}\n\nvoid tin_shift_backward_impl(Tensor grad_output, Tensor shift,\n                             Tensor grad_input) {\n  DISPATCH_DEVICE_IMPL(tin_shift_backward_impl, grad_output, shift, grad_input);\n}\n\nvoid tin_shift_forward(Tensor input, Tensor shift, Tensor output) {\n  tin_shift_forward_impl(input, shift, output);\n}\n\nvoid tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {\n  tin_shift_backward_impl(grad_output, shift, grad_input);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/tin_shift_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"tin_shift_pytorch.h\"\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid tin_shift_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,\n                                    const OperatorBase::in_list_t &ins,\n                                    OperatorBase::out_list_t &outs) {\n  const auto &input = buildATensor(ctx, ins[0]);\n  const auto &shift = buildATensor(ctx, ins[1]);\n  auto output = buildATensor(ctx, outs[0]);\n  tin_shift_forward_cuda(input, shift, output);\n}\n\nvoid tin_shift_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,\n                                     const OperatorBase::in_list_t &ins,\n                                     OperatorBase::out_list_t &outs) {\n  const auto &grad_output = buildATensor(ctx, ins[0]);\n  const auto &shift = buildATensor(ctx, ins[1]);\n  auto grad_input = buildATensor(ctx, outs[0]);\n  tin_shift_backward_cuda(grad_output, shift, grad_input);\n}\n\nPARROTS_EXTENSION_REGISTER(tin_shift_forward)\n    .input(2)\n    .output(1)\n    .apply(tin_shift_forward_cuda_parrots)\n    .done();\n\nPARROTS_EXTENSION_REGISTER(tin_shift_backward)\n    .input(2)\n    .output(1)\n    .apply(tin_shift_backward_cuda_parrots)\n    .done();\n#endif\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/tin_shift_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef TIN_SHIFT_PYTORCH_H\n#define TIN_SHIFT_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output);\n\nvoid tin_shift_backward_cuda(Tensor grad_output, Tensor shift,\n                             Tensor grad_input);\n#endif  // TIN_SHIFT_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/upfirdn2d.cpp",
    "content": "// Modified from\n// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.cpp\n\n/*\nCopyright (c) 2021, NVIDIA Corporation. All rights reserved.\n\nNVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator\nAugmentation (ADA)\n=======================================================================\n\n1. Definitions\n\n\"Licensor\" means any person or entity that distributes its Work.\n\n\"Software\" means the original work of authorship made available under\nthis License.\n\n\"Work\" means the Software and any additions to or derivative works of\nthe Software that are made available under this License.\n\nThe terms \"reproduce,\" \"reproduction,\" \"derivative works,\" and\n\"distribution\" have the meaning as provided under U.S. copyright law;\nprovided, however, that for the purposes of this License, derivative\nworks shall not include works that remain separable from, or merely\nlink (or bind by name) to the interfaces of, the Work.\n\nWorks, including the Software, are \"made available\" under this License\nby including in or with the Work either (a) a copyright notice\nreferencing the applicability of this License to the Work, or (b) a\ncopy of this License.\n\n2. License Grants\n\n    2.1 Copyright Grant. Subject to the terms and conditions of this\n    License, each Licensor grants to you a perpetual, worldwide,\n    non-exclusive, royalty-free, copyright license to reproduce,\n    prepare derivative works of, publicly display, publicly perform,\n    sublicense and distribute its Work and any resulting derivative\n    works in any form.\n\n3. Limitations\n\n    3.1 Redistribution. You may reproduce or distribute the Work only\n    if (a) you do so under this License, (b) you include a complete\n    copy of this License with your distribution, and (c) you retain\n    without modification any copyright, patent, trademark, or\n    attribution notices that are present in the Work.\n\n    3.2 Derivative Works. 
You may specify that additional or different\n    terms apply to the use, reproduction, and distribution of your\n    derivative works of the Work (\"Your Terms\") only if (a) Your Terms\n    provide that the use limitation in Section 3.3 applies to your\n    derivative works, and (b) you identify the specific derivative\n    works that are subject to Your Terms. Notwithstanding Your Terms,\n    this License (including the redistribution requirements in Section\n    3.1) will continue to apply to the Work itself.\n\n    3.3 Use Limitation. The Work and any derivative works thereof only\n    may be used or intended for use non-commercially. Notwithstanding\n    the foregoing, NVIDIA and its affiliates may use the Work and any\n    derivative works commercially. As used herein, \"non-commercially\"\n    means for research or evaluation purposes only.\n\n    3.4 Patent Claims. If you bring or threaten to bring a patent claim\n    against any Licensor (including any claim, cross-claim or\n    counterclaim in a lawsuit) to enforce any patents that you allege\n    are infringed by any Work, then your rights under this License from\n    such Licensor (including the grant in Section 2.1) will terminate\n    immediately.\n\n    3.5 Trademarks. This License does not grant any rights to use any\n    Licensor’s or its affiliates’ names, logos, or trademarks, except\n    as necessary to reproduce the notices described in this License.\n\n    3.6 Termination. If you violate any term of this License, then your\n    rights under this License (including the grant in Section 2.1) will\n    terminate immediately.\n\n4. Disclaimer of Warranty.\n\nTHE WORK IS PROVIDED \"AS IS\" WITHOUT WARRANTIES OR CONDITIONS OF ANY\nKIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR\nNON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER\nTHIS LICENSE.\n\n5. 
Limitation of Liability.\n\nEXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL\nTHEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE\nSHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,\nINDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF\nOR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK\n(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,\nLOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER\nCOMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF\nTHE POSSIBILITY OF SUCH DAMAGES.\n\n=======================================================================\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor upfirdn2d_op_impl(const torch::Tensor& input,\n                                const torch::Tensor& kernel, int up_x, int up_y,\n                                int down_x, int down_y, int pad_x0, int pad_x1,\n                                int pad_y0, int pad_y1) {\n  return DISPATCH_DEVICE_IMPL(upfirdn2d_op_impl, input, kernel, up_x, up_y,\n                              down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1);\n}\n\ntorch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel,\n                        int up_x, int up_y, int down_x, int down_y, int pad_x0,\n                        int pad_x1, int pad_y0, int pad_y1) {\n  return upfirdn2d_op_impl(input, kernel, up_x, up_y, down_x, down_y, pad_x0,\n                           pad_x1, pad_y0, pad_y1);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/upfirdn2d_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <torch/extension.h>\n\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\nusing namespace at;\nusing namespace parrots;\n\ntorch::Tensor upfirdn2d(const Tensor &input, const Tensor &kernel, int up_x,\n                        int up_y, int down_x, int down_y, int pad_x0,\n                        int pad_x1, int pad_y0, int pad_y1);\n\nvoid upfirdn2d_parrots(CudaContext &ctx, const SSElement &attr,\n                       const OperatorBase::in_list_t &ins,\n                       OperatorBase::out_list_t &outs) {\n  int up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1;\n  const auto &input = buildATensor(ctx, ins[0]);\n  const auto &kernel = buildATensor(ctx, ins[1]);\n  SSAttrs(attr)\n      .get(\"up_x\", up_x)\n      .get(\"up_y\", up_y)\n      .get(\"down_x\", down_x)\n      .get(\"down_y\", down_y)\n      .get(\"pad_x0\", pad_x0)\n      .get(\"pad_x1\", pad_x1)\n      .get(\"pad_y0\", pad_y0)\n      .get(\"pad_y1\", pad_y1)\n      .done();\n  auto out = upfirdn2d(input, kernel, up_x, up_y, down_x, down_y, pad_x0,\n                       pad_x1, pad_y0, pad_y1);\n  updateDArray(ctx, out, outs[0]);\n}\n\nPARROTS_EXTENSION_REGISTER(upfirdn2d)\n    .attr(\"up_x\")\n    .attr(\"up_y\")\n    .attr(\"down_x\")\n    .attr(\"down_y\")\n    .attr(\"pad_x0\")\n    .attr(\"pad_x1\")\n    .attr(\"pad_y0\")\n    .attr(\"pad_y1\")\n    .input(2)\n    .output(1)\n    .apply(upfirdn2d_parrots)\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/voxelization.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nint hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim = 3) {\n  return DISPATCH_DEVICE_IMPL(hard_voxelize_forward_impl, points, voxels, coors,\n                              num_points_per_voxel, voxel_size, coors_range,\n                              max_points, max_voxels, NDim);\n}\n\nint nondeterministic_hard_voxelize_forward_impl(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  return DISPATCH_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl,\n                              points, voxels, coors, num_points_per_voxel,\n                              voxel_size, coors_range, max_points, max_voxels,\n                              NDim);\n}\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim = 3) {\n  DISPATCH_DEVICE_IMPL(dynamic_voxelize_forward_impl, points, coors, voxel_size,\n                       coors_range, NDim);\n}\n\nvoid hard_voxelize_forward(const at::Tensor &points,\n                           const at::Tensor &voxel_size,\n                           const at::Tensor 
&coors_range, at::Tensor &voxels,\n                           at::Tensor &coors, at::Tensor &num_points_per_voxel,\n                           at::Tensor &voxel_num, const int max_points,\n                           const int max_voxels, const int NDim = 3,\n                           const bool deterministic = true) {\n  int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();\n  std::vector<float> voxel_size_v(\n      voxel_size.data_ptr<float>(),\n      voxel_size.data_ptr<float>() + voxel_size.numel());\n  std::vector<float> coors_range_v(\n      coors_range.data_ptr<float>(),\n      coors_range.data_ptr<float>() + coors_range.numel());\n\n  if (deterministic) {\n    *voxel_num_data = hard_voxelize_forward_impl(\n        points, voxels, coors, num_points_per_voxel, voxel_size_v,\n        coors_range_v, max_points, max_voxels, NDim);\n  } else {\n    *voxel_num_data = nondeterministic_hard_voxelize_forward_impl(\n        points, voxels, coors, num_points_per_voxel, voxel_size_v,\n        coors_range_v, max_points, max_voxels, NDim);\n  }\n}\n\nvoid dynamic_voxelize_forward(const at::Tensor &points,\n                              const at::Tensor &voxel_size,\n                              const at::Tensor &coors_range, at::Tensor &coors,\n                              const int NDim = 3) {\n  std::vector<float> voxel_size_v(\n      voxel_size.data_ptr<float>(),\n      voxel_size.data_ptr<float>() + voxel_size.numel());\n  std::vector<float> coors_range_v(\n      coors_range.data_ptr<float>(),\n      coors_range.data_ptr<float>() + coors_range.numel());\n  dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,\n                                NDim);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/voxelization_parrots.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <parrots/compute/aten.hpp>\n#include <parrots/extension.hpp>\n#include <parrots/foundation/ssattrs.hpp>\n\n#include \"voxelization_pytorch.h\"\n\nusing namespace parrots;\n\n#ifdef MMCV_WITH_CUDA\nvoid hard_voxelize_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,\n                                        const OperatorBase::in_list_t& ins,\n                                        OperatorBase::out_list_t& outs) {\n  int max_points, max_voxels, NDim;\n  bool deterministic;\n  SSAttrs(attr)\n      .get<int>(\"max_points\", max_points)\n      .get<int>(\"max_voxels\", max_voxels)\n      .get<int>(\"NDim\", NDim)\n      .get<bool>(\"deterministic\", deterministic)\n      .done();\n  const auto& points = buildATensor(ctx, ins[0]);\n  const auto& voxel_size = buildATensor(ctx, ins[1]);\n  const auto& coors_range = buildATensor(ctx, ins[2]);\n\n  auto voxels = buildATensor(ctx, outs[0]);\n  auto coors = buildATensor(ctx, outs[1]);\n  auto num_points_per_voxel = buildATensor(ctx, outs[2]);\n  auto voxel_num = buildATensor(ctx, outs[3]);\n\n  hard_voxelize_forward(points, voxel_size, coors_range, voxels, coors,\n                        num_points_per_voxel, voxel_num, max_points, max_voxels,\n                        NDim, deterministic);\n}\n\nvoid dynamic_voxelize_forward_cuda_parrots(CudaContext& ctx,\n                                           const SSElement& attr,\n                                           const OperatorBase::in_list_t& ins,\n                                           OperatorBase::out_list_t& outs) {\n  int NDim;\n  SSAttrs(attr).get<int>(\"NDim\", NDim).done();\n  const auto& points = buildATensor(ctx, ins[0]);\n  const auto& voxel_size = buildATensor(ctx, ins[1]);\n  const auto& coors_range = buildATensor(ctx, ins[2]);\n\n  auto coors = buildATensor(ctx, outs[0]);\n\n  dynamic_voxelize_forward(points, voxel_size, coors_range, coors, NDim);\n}\n#endif\n\nvoid 
hard_voxelize_forward_cpu_parrots(HostContext& ctx, const SSElement& attr,\n                                       const OperatorBase::in_list_t& ins,\n                                       OperatorBase::out_list_t& outs) {\n  int max_points, max_voxels, NDim;\n  bool deterministic;\n  SSAttrs(attr)\n      .get<int>(\"max_points\", max_points)\n      .get<int>(\"max_voxels\", max_voxels)\n      .get<int>(\"NDim\", NDim)\n      .get<bool>(\"deterministic\", deterministic)\n      .done();\n  const auto& points = buildATensor(ctx, ins[0]);\n  const auto& voxel_size = buildATensor(ctx, ins[1]);\n  const auto& coors_range = buildATensor(ctx, ins[2]);\n\n  auto voxels = buildATensor(ctx, outs[0]);\n  auto coors = buildATensor(ctx, outs[1]);\n  auto num_points_per_voxel = buildATensor(ctx, outs[2]);\n  auto voxel_num = buildATensor(ctx, outs[3]);\n\n  hard_voxelize_forward(points, voxel_size, coors_range, voxels, coors,\n                        num_points_per_voxel, voxel_num, max_points, max_voxels,\n                        NDim, deterministic);\n}\n\nvoid dynamic_voxelize_forward_cpu_parrots(HostContext& ctx,\n                                          const SSElement& attr,\n                                          const OperatorBase::in_list_t& ins,\n                                          OperatorBase::out_list_t& outs) {\n  int NDim;\n  SSAttrs(attr).get<int>(\"NDim\", NDim).done();\n  const auto& points = buildATensor(ctx, ins[0]);\n  const auto& voxel_size = buildATensor(ctx, ins[1]);\n  const auto& coors_range = buildATensor(ctx, ins[2]);\n\n  auto coors = buildATensor(ctx, outs[0]);\n\n  dynamic_voxelize_forward(points, voxel_size, coors_range, coors, NDim);\n}\n\nPARROTS_EXTENSION_REGISTER(hard_voxelize_forward)\n    .attr(\"max_points\")\n    .attr(\"max_voxels\")\n    .attr(\"NDim\")\n    .attr(\"deterministic\")\n    .input(3)\n    .output(4)\n    .apply(hard_voxelize_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    
.apply(hard_voxelize_forward_cuda_parrots)\n#endif\n    .done();\n\nPARROTS_EXTENSION_REGISTER(dynamic_voxelize_forward)\n    .attr(\"NDim\")\n    .input(3)\n    .output(1)\n    .apply(dynamic_voxelize_forward_cpu_parrots)\n#ifdef MMCV_WITH_CUDA\n    .apply(dynamic_voxelize_forward_cuda_parrots)\n#endif\n    .done();\n"
  },
  {
    "path": "mmcv/ops/csrc/parrots/voxelization_pytorch.h",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#ifndef VOXELIZATION_PYTORCH_H\n#define VOXELIZATION_PYTORCH_H\n#include <torch/extension.h>\nusing namespace at;\n\nvoid hard_voxelize_forward(const at::Tensor &points,\n                           const at::Tensor &voxel_size,\n                           const at::Tensor &coors_range, at::Tensor &voxels,\n                           at::Tensor &coors, at::Tensor &num_points_per_voxel,\n                           at::Tensor &voxel_num, const int max_points,\n                           const int max_voxels, const int NDim = 3,\n                           const bool deterministic = true);\n\nvoid dynamic_voxelize_forward(const at::Tensor &points,\n                              const at::Tensor &voxel_size,\n                              const at::Tensor &coors_range, at::Tensor &coors,\n                              const int NDim = 3);\n\n#endif  // VOXELIZATION_PYTORCH_H\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/active_rotated_filter.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/ActiveRotatingFilter.h\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output) {\n  DISPATCH_DEVICE_IMPL(active_rotated_filter_forward_impl, input, indices,\n                       output);\n}\n\nvoid active_rotated_filter_backward_impl(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in) {\n  DISPATCH_DEVICE_IMPL(active_rotated_filter_backward_impl, grad_out, indices,\n                       grad_in);\n}\n\nvoid active_rotated_filter_forward(const Tensor input, const Tensor indices,\n                                   Tensor output) {\n  active_rotated_filter_forward_impl(input, indices, output);\n}\n\nvoid active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,\n                                    Tensor grad_in) {\n  active_rotated_filter_backward_impl(grad_out, indices, grad_in);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/assign_score_withk.cpp",
    "content": "// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor& points,\n                                     const Tensor& centers,\n                                     const Tensor& scores,\n                                     const Tensor& knn_idx, Tensor& output) {\n  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,\n                       aggregate, points, centers, scores, knn_idx, output);\n}\n\nvoid assign_score_withk_backward_impl(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores) {\n  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,\n                       aggregate, grad_out, points, centers, scores, knn_idx,\n                       grad_points, grad_centers, grad_scores);\n}\n\nvoid assign_score_withk_forward(const Tensor& points, const Tensor& centers,\n                                const Tensor& scores, const Tensor& knn_idx,\n                                Tensor& output, int B, int N0, int N1, int M,\n                                int K, int O, int aggregate) {\n  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,\n                                  centers, scores, knn_idx, output);\n}\n\nvoid assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,\n                                 const Tensor& centers, const Tensor& scores,\n                                 const Tensor& knn_idx, Tensor& grad_points,\n                                 Tensor& grad_centers, Tensor& 
grad_scores,\n                                 int B, int N0, int N1, int M, int K, int O,\n                                 int aggregate) {\n  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,\n                                   points, centers, scores, knn_idx,\n                                   grad_points, grad_centers, grad_scores);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/ball_query.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx) {\n  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,\n                       nsample, new_xyz, xyz, idx);\n}\n\nvoid ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,\n                        Tensor idx_tensor, int b, int n, int m,\n                        float min_radius, float max_radius, int nsample) {\n  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,\n                          new_xyz_tensor, xyz_tensor, idx_tensor);\n}\n\nvoid stack_ball_query_forward_impl(float max_radius, int nsample,\n                                   const Tensor new_xyz,\n                                   const Tensor new_xyz_batch_cnt,\n                                   const Tensor xyz, const Tensor xyz_batch_cnt,\n                                   Tensor idx) {\n  DISPATCH_DEVICE_IMPL(stack_ball_query_forward_impl, max_radius, nsample,\n                       new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);\n}\n\nvoid stack_ball_query_forward(Tensor new_xyz_tensor, Tensor new_xyz_batch_cnt,\n                              Tensor xyz_tensor, Tensor xyz_batch_cnt,\n                              Tensor idx_tensor, float max_radius,\n                              int nsample) {\n  stack_ball_query_forward_impl(max_radius, nsample, new_xyz_tensor,\n                                new_xyz_batch_cnt, xyz_tensor, xyz_batch_cnt,\n                                idx_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/bbox_overlaps.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#ifdef MMCV_WITH_DIOPI\n#include <diopi/diopirt.h>\n#include <diopi/functions.h>\n#include <diopi/functions_mmcv.h>\n\n#include \"csrc_dipu/base/basedef.h\"\n#include \"csrc_dipu/diopirt/diopirt_impl.h\"\n\nusing dipu::diopi_helper::toDiopiScalar;\nusing dipu::diopi_helper::toDiopiTensorHandle;\n#endif\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset) {\n  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,\n                       aligned, offset);\n}\n\n#ifdef MMCV_WITH_DIOPI\nvoid bbox_overlaps_diopi(const Tensor bboxes1, const Tensor bboxes2,\n                         Tensor ious, const int mode, const bool aligned,\n                         const int offset) {\n  auto bboxes1_p = toDiopiTensorHandle(bboxes1);\n  diopiDevice_t device;\n  diopiGetTensorDevice(bboxes1_p, &device);\n  if (device == diopi_host) {\n    bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto bboxes2_p = toDiopiTensorHandle(bboxes2);\n  auto ious_p = toDiopiTensorHandle(ious);\n  bool is_mock_cuda = bboxes1.device().type() == dipu::DIPU_DEVICE_TYPE;\n  if (is_mock_cuda &&\n      reinterpret_cast<void *>(diopiBboxOverlapsMmcv) != nullptr) {\n    auto ret = diopiBboxOverlapsMmcv(ch, ious_p, bboxes1_p, bboxes2_p, mode,\n                                     offset, aligned);\n    if (ret == diopiSuccess) return;\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op bbox_overlaps\";\n  auto bboxes1_cpu = bboxes1.cpu();\n  auto bboxes2_cpu = bboxes2.cpu();\n  auto ious_cpu = ious.cpu();\n  bbox_overlaps_impl(bboxes1_cpu, bboxes2_cpu, ious_cpu, mode, aligned, offset);\n  
ious.copy_(ious_cpu);\n}\n#endif\n\nvoid bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                   const int mode, const bool aligned, const int offset) {\n#ifdef MMCV_WITH_DIOPI\n  bbox_overlaps_diopi(bboxes1, bboxes2, ious, mode, aligned, offset);\n#else\n  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);\n#endif\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/bezier_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                               int aligned_height, int aligned_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned) {\n  DISPATCH_DEVICE_IMPL(bezier_align_forward_impl, input, rois, output,\n                       aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, aligned);\n}\n\nvoid bezier_align_backward_impl(Tensor grad_output, Tensor rois,\n                                Tensor grad_input, int aligned_height,\n                                int aligned_width, float spatial_scale,\n                                int sampling_ratio, bool aligned) {\n  DISPATCH_DEVICE_IMPL(bezier_align_backward_impl, grad_output, rois,\n                       grad_input, aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, aligned);\n}\n\nvoid bezier_align_forward(Tensor input, Tensor rois, Tensor output,\n                          int aligned_height, int aligned_width,\n                          float spatial_scale, int sampling_ratio,\n                          bool aligned) {\n  bezier_align_forward_impl(input, rois, output, aligned_height, aligned_width,\n                            spatial_scale, sampling_ratio, aligned);\n}\n\nvoid bezier_align_backward(Tensor grad_output, Tensor rois, Tensor grad_input,\n                           int aligned_height, int aligned_width,\n                           float spatial_scale, int sampling_ratio,\n                           bool aligned) {\n  bezier_align_backward_impl(grad_output, rois, grad_input, aligned_height,\n                             aligned_width, spatial_scale, sampling_ratio,\n                             aligned);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/bias_act.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor bias_act_op_impl(const torch::Tensor &input,\n                               const torch::Tensor &bias,\n                               const torch::Tensor &xref,\n                               const torch::Tensor &yref,\n                               const torch::Tensor &dy, int grad, int dim,\n                               int act, float alpha, float gain, float clamp) {\n  return DISPATCH_DEVICE_IMPL(bias_act_op_impl, input, bias, xref, yref, dy,\n                              grad, dim, act, alpha, gain, clamp);\n}\n\ntorch::Tensor bias_act(const torch::Tensor &input, const torch::Tensor &bias,\n                       const torch::Tensor &xref, const torch::Tensor &yref,\n                       const torch::Tensor &dy, int grad, int dim, int act,\n                       float alpha, float gain, float clamp) {\n  return bias_act_op_impl(input, bias, xref, yref, dy, grad, dim, act, alpha,\n                          gain, clamp);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/border_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid border_align_forward_impl(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size) {\n  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,\n                       argmax_idx, pool_size);\n}\n\nvoid border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size) {\n  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,\n                       argmax_idx, grad_input, pool_size);\n}\n\nvoid border_align_forward(const Tensor &input, const Tensor &boxes,\n                          Tensor output, Tensor argmax_idx,\n                          const int pool_size) {\n  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);\n}\n\nvoid border_align_backward(const Tensor &grad_output, const Tensor &boxes,\n                           const Tensor &argmax_idx, Tensor grad_input,\n                           const int pool_size) {\n  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,\n                             pool_size);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/box_iou_quadri.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned) {\n  DISPATCH_DEVICE_IMPL(box_iou_quadri_impl, boxes1, boxes2, ious, mode_flag,\n                       aligned);\n}\n\n// Interface for Python\n// inline is needed to prevent multiple function definitions when this header is\n// included by different cpps\nvoid box_iou_quadri(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                    const int mode_flag, const bool aligned) {\n  box_iou_quadri_impl(boxes1, boxes2, ious, mode_flag, aligned);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/box_iou_rotated.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned) {\n  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,\n                       aligned);\n}\n\n// Interface for Python\n// inline is needed to prevent multiple function definitions when this header is\n// included by different cpps\nvoid box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                     const int mode_flag, const bool aligned) {\n  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/carafe.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,\n                       rmasks, output, kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,\n                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,\n                       bottom_grad, mask_grad, kernel_size, group_size,\n                       scale_factor);\n}\n\nvoid carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,\n                    Tensor routput, Tensor rmasks, Tensor output,\n                    int kernel_size, int group_size, int scale_factor) {\n  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,\n                      kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                     Tensor rtop_grad, Tensor rbottom_grad_hs,\n                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,\n                     Tensor mask_grad, int kernel_size, int group_size,\n                     int scale_factor) {\n  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,\n                       rbottom_grad, rmask_grad, bottom_grad, 
mask_grad,\n                       kernel_size, group_size, scale_factor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/carafe_naive.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,\n                       kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor) {\n  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,\n                       bottom_grad, mask_grad, kernel_size, group_size,\n                       scale_factor);\n}\n\nvoid carafe_naive_forward(Tensor features, Tensor masks, Tensor output,\n                          int kernel_size, int group_size, int scale_factor) {\n  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,\n                            scale_factor);\n}\n\nvoid carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,\n                           Tensor bottom_grad, Tensor mask_grad,\n                           int kernel_size, int group_size, int scale_factor) {\n  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,\n                             kernel_size, group_size, scale_factor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/chamfer_distance.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2) {\n  DISPATCH_DEVICE_IMPL(chamfer_distance_forward_impl, xyz1, xyz2, dist1, dist2,\n                       idx1, idx2);\n}\n\nvoid chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2) {\n  DISPATCH_DEVICE_IMPL(chamfer_distance_backward_impl, xyz1, xyz2, idx1, idx2,\n                       graddist1, graddist2, gradxyz1, gradxyz2);\n}\n\nvoid chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,\n                              const Tensor dist1, const Tensor dist2,\n                              const Tensor idx1, const Tensor idx2) {\n  chamfer_distance_forward_impl(xyz1, xyz2, dist1, dist2, idx1, idx2);\n}\n\nvoid chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,\n                               Tensor idx1, Tensor idx2, Tensor graddist1,\n                               Tensor graddist2, Tensor gradxyz1,\n                               Tensor gradxyz2) {\n  chamfer_distance_backward_impl(xyz1, xyz2, idx1, idx2, graddist1, graddist2,\n                                 gradxyz1, gradxyz2);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/contour_expand.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// It is modified from https://github.com/whai362/PSENet\n#include <iostream>\n#include <queue>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nusing namespace std;\n\nclass Point2d {\n public:\n  int x;\n  int y;\n\n  Point2d() : x(0), y(0) {}\n  Point2d(int _x, int _y) : x(_x), y(_y) {}\n};\n\nvoid kernel_dilate(const uint8_t *data, IntArrayRef data_shape,\n                   const int *label_map, int &label_num, int &min_area,\n                   vector<vector<int>> &text_line) {\n  std::vector<int> area(label_num + 1);\n  int kernel_num = data_shape[0];\n  int height = data_shape[1];\n  int width = data_shape[2];\n\n  for (int x = 0; x < height; ++x) {\n    for (int y = 0; y < width; ++y) {\n      int label = label_map[x * width + y];\n      if (label == 0) continue;\n      area[label] += 1;\n    }\n  }\n\n  queue<Point2d> queue, next_queue;\n  for (int x = 0; x < height; ++x) {\n    vector<int> row(width);\n    for (int y = 0; y < width; ++y) {\n      int label = label_map[x * width + y];\n      if (label == 0) continue;\n      if (area[label] < min_area) continue;\n\n      Point2d point(x, y);\n      queue.push(point);\n      row[y] = label;\n    }\n    text_line.emplace_back(row);\n  }\n\n  int dx[] = {-1, 1, 0, 0};\n  int dy[] = {0, 0, -1, 1};\n  vector<int> kernel_step(kernel_num);\n  std::for_each(kernel_step.begin(), kernel_step.end(),\n                [=](int &k) { return k * height * width; });\n\n  for (int kernel_id = kernel_num - 2; kernel_id >= 0; --kernel_id) {\n    while (!queue.empty()) {\n      Point2d point = queue.front();\n      queue.pop();\n      int x = point.x;\n      int y = point.y;\n      int label = text_line[x][y];\n\n      bool is_edge = true;\n      for (int d = 0; d < 4; ++d) {\n        int tmp_x = x + dx[d];\n        int tmp_y = y + dy[d];\n\n        if (tmp_x < 0 || tmp_x >= height) continue;\n        if (tmp_y < 0 || tmp_y >= width) continue;\n        int kernel_value 
= data[kernel_step[kernel_id] + tmp_x * width + tmp_y];\n        if (kernel_value == 0) continue;\n        if (text_line[tmp_x][tmp_y] > 0) continue;\n\n        Point2d point(tmp_x, tmp_y);\n        queue.push(point);\n        text_line[tmp_x][tmp_y] = label;\n        is_edge = false;\n      }\n\n      if (is_edge) {\n        next_queue.push(point);\n      }\n    }\n    swap(queue, next_queue);\n  }\n}\n\nstd::vector<std::vector<int>> contour_expand(Tensor kernel_mask,\n                                             Tensor internal_kernel_label,\n                                             int min_kernel_area,\n                                             int kernel_num) {\n  kernel_mask = kernel_mask.contiguous();\n  internal_kernel_label = internal_kernel_label.contiguous();\n  assert(kernel_mask.dim() == 3);\n  assert(internal_kernel_label.dim() == 2);\n  assert(kernel_mask.size(1) == internal_kernel_label.size(0));\n  assert(kernel_mask.size(2) == internal_kernel_label.size(1));\n  CHECK_CPU_INPUT(kernel_mask);\n  CHECK_CPU_INPUT(internal_kernel_label);\n  auto ptr_data = kernel_mask.data_ptr<uint8_t>();\n  IntArrayRef data_shape = kernel_mask.sizes();\n\n  auto data_label_map = internal_kernel_label.data_ptr<int32_t>();\n  vector<vector<int>> text_line;\n\n  kernel_dilate(ptr_data, data_shape, data_label_map, kernel_num,\n                min_kernel_area, text_line);\n\n  return text_line;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/convex_iou.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/tree/main/mmdet/ops/iou/src\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid convex_iou_impl(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious) {\n  DISPATCH_DEVICE_IMPL(convex_iou_impl, pointsets, polygons, ious);\n}\n\nvoid convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious) {\n  convex_iou_impl(pointsets, polygons, ious);\n}\n\nvoid convex_giou_impl(const Tensor pointsets, const Tensor polygons,\n                      Tensor output) {\n  DISPATCH_DEVICE_IMPL(convex_giou_impl, pointsets, polygons, output);\n}\n\nvoid convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output) {\n  convex_giou_impl(pointsets, polygons, output);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/correlation.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include <iostream>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW) {\n  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,\n                       patchH, patchW, padH, padW, dilationH, dilationW,\n                       dilation_patchH, dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW) {\n  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,\n                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,\n                       padW, dilationH, dilationW, dilation_patchH,\n                       dilation_patchW, dH, dW);\n}\n\nvoid correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,\n                         int kW, int patchH, int patchW, int padH, int padW,\n                         int dilationH, int dilationW, int dilation_patchH,\n                         int dilation_patchW, int dH, int dW) {\n  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,\n                           padW, dilationH, dilationW, dilation_patchH,\n                           dilation_patchW, dH, dW);\n}\n\nvoid 
correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,\n                          Tensor grad_input1, Tensor grad_input2, int kH,\n                          int kW, int patchH, int patchW, int padH, int padW,\n                          int dilationH, int dilationW, int dilation_patchH,\n                          int dilation_patchW, int dH, int dW) {\n  correlation_backward_impl(grad_output, input1, input2, grad_input1,\n                            grad_input2, kH, kW, patchH, patchW, padH, padW,\n                            dilationH, dilationW, dilation_patchH,\n                            dilation_patchW, dH, dW);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cpu/ActiveRotatingFilter_cpu.cpp\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T>\nvoid active_rotated_filter_forward_cpu_kernel(\n    const T* weightData, const int* indicesData, const int num_output_planes,\n    const int num_input_planes, const int num_orientations, const int kH,\n    const int kW, const int num_rotations, T* outputData) {\n  const int nEntry = num_orientations * kH * kW;\n  int i, j, l;\n  int k;\n\n#pragma omp parallel for private(i, j, l, k)\n  for (i = 0; i < num_output_planes; i++) {\n    for (j = 0; j < num_input_planes; j++) {\n      for (l = 0; l < nEntry; l++) {\n        int weightIndex = i * num_input_planes * nEntry + j * nEntry + l;\n        T val = *(weightData + weightIndex);\n        for (k = 0; k < num_rotations; k++) {\n          int index = (int)(*(indicesData + l * num_rotations + k)) - 1;\n          T* target = outputData +\n                      i * (num_rotations * num_input_planes * nEntry) +\n                      k * (num_input_planes * nEntry) + j * (nEntry) + index;\n          *target = val;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid active_rotated_filter_backward_cpu_kernel(\n    const T* gradOutputData, const int* indicesData,\n    const int num_output_planes, const int num_input_planes,\n    const int num_orientations, const int kH, const int kW,\n    const int num_rotations, T* gradInputData) {\n  const int nEntry = num_orientations * kH * kW;\n  int i, j, l;\n  int k;\n\n#pragma omp parallel for private(i, j, l, k)\n  for (i = 0; i < num_output_planes; i++) {\n    for (j = 0; j < num_input_planes; j++) {\n      for (l = 0; l < nEntry; l++) {\n        int gradInputIndex = i * num_input_planes * nEntry + j * nEntry + l;\n        T* val = gradInputData + 
gradInputIndex;\n        *val = 0;\n        for (k = 0; k < num_rotations; k++) {\n          int index = (int)(*(indicesData + l * num_rotations + k)) - 1;\n          const T* target =\n              gradOutputData + i * (num_rotations * num_input_planes * nEntry) +\n              k * (num_input_planes * nEntry) + j * (nEntry) + index;\n          *val = *val + *target;\n        }\n      }\n    }\n  }\n}\n\nvoid ActiveRotatedFilterForwardCPULauncher(const Tensor input,\n                                           const Tensor indices,\n                                           Tensor output) {\n  const int num_output_planes = input.size(0);\n  const int num_input_planes = input.size(1);\n  const int num_orientations = input.size(2);\n  const int kH = input.size(3);\n  const int kW = input.size(4);\n  const int num_rotations = indices.size(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"active_rotated_filter_forward_cpu_kernel\", [&] {\n        active_rotated_filter_forward_cpu_kernel<scalar_t>(\n            input.data_ptr<scalar_t>(), indices.data_ptr<int>(),\n            num_output_planes, num_input_planes, num_orientations, kH, kW,\n            num_rotations, output.data_ptr<scalar_t>());\n      });\n}\n\nvoid ActiveRotatedFilterBackwardCPULauncher(const Tensor grad_out,\n                                            const Tensor indices,\n                                            Tensor grad_in) {\n  const int num_orientations = indices.size(0);\n  const int kH = indices.size(1);\n  const int kW = indices.size(2);\n  const int num_rotations = indices.size(3);\n  const int num_output_planes = grad_out.size(0) / num_rotations;\n  const int num_input_planes = grad_out.size(1) / num_orientations;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"active_rotated_filter_backward_cpu_kernel\", [&] {\n        active_rotated_filter_backward_cpu_kernel<scalar_t>(\n            grad_out.data_ptr<scalar_t>(), 
indices.data_ptr<int>(),\n            num_output_planes, num_input_planes, num_orientations, kH, kW,\n            num_rotations, grad_in.data_ptr<scalar_t>());\n      });\n}\n\nvoid active_rotated_filter_forward_cpu(const Tensor input, const Tensor indices,\n                                       Tensor output) {\n  ActiveRotatedFilterForwardCPULauncher(input, indices, output);\n}\n\nvoid active_rotated_filter_backward_cpu(const Tensor grad_out,\n                                        const Tensor indices, Tensor grad_in) {\n  ActiveRotatedFilterBackwardCPULauncher(grad_out, indices, grad_in);\n}\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output);\n\nvoid active_rotated_filter_backward_impl(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in);\n\nREGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CPU,\n                     active_rotated_filter_forward_cpu);\nREGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CPU,\n                     active_rotated_filter_backward_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/bbox_overlaps_cpu.cpp",
    "content": "// Copyright(c) OpenMMLab.All rights reserved.\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nusing torch::indexing::None;\nusing torch::indexing::Slice;\n\nvoid bbox_overlaps_cpu_kernel(const Tensor boxes1, const Tensor boxes2,\n                              Tensor ious, const int mode_flag,\n                              const bool aligned, const int offset) {\n  Tensor temp_ious;\n  if (aligned) {\n    Tensor lt = torch::max(boxes1.index({Slice(None), Slice({None, 2})}),\n                           boxes2.index({Slice(None), Slice({None, 2})}));\n    Tensor rb = torch::min(boxes1.index({Slice(None), Slice(2)}),\n                           boxes2.index({Slice(None), Slice(2)}));\n    Tensor wh = (rb - lt + offset).clamp(0.f, INT_MAX * 1.f);\n    Tensor overlap = wh.index({Slice(None), 0}) * wh.index({Slice(None), 1});\n    Tensor area1 = (boxes1.index({Slice(None), 2}) -\n                    boxes1.index({Slice(None), 0}) + offset) *\n                   (boxes1.index({Slice(None), 3}) -\n                    boxes1.index({Slice(None), 1}) + offset);\n    if (mode_flag == 0) {\n      Tensor area2 = (boxes2.index({Slice(None), 2}) -\n                      boxes2.index({Slice(None), 0}) + offset) *\n                     (boxes2.index({Slice(None), 3}) -\n                      boxes2.index({Slice(None), 1}) + offset);\n      temp_ious = overlap / (area1 + area2 - overlap);\n    } else {\n      temp_ious = overlap / area1;\n    }\n  } else {\n    Tensor lt = torch::max(boxes1.index({Slice(None), None, Slice({None, 2})}),\n                           boxes2.index({Slice(None), Slice({None, 2})}));\n    Tensor rb = torch::min(boxes1.index({Slice(None), None, Slice(2)}),\n                           boxes2.index({Slice(None), Slice(2)}));\n    Tensor wh = (rb - lt + offset).clamp(0.f, INT_MAX * 1.f);\n    Tensor overlap = wh.index({\"...\", 0}) * wh.index({\"...\", 1});\n    Tensor area1 = (boxes1.index({Slice(None), 2}) 
-\n                    boxes1.index({Slice(None), 0}) + offset) *\n                   (boxes1.index({Slice(None), 3}) -\n                    boxes1.index({Slice(None), 1}) + offset);\n    if (mode_flag == 0) {\n      Tensor area2 = (boxes2.index({Slice(None), 2}) -\n                      boxes2.index({Slice(None), 0}) + offset) *\n                     (boxes2.index({Slice(None), 3}) -\n                      boxes2.index({Slice(None), 1}) + offset);\n      temp_ious =\n          overlap / (area1.index({Slice(None), None}) + area2 - overlap);\n    } else {\n      temp_ious = overlap / area1.index({Slice(None), None});\n    }\n  }\n  ious.copy_(temp_ious);\n}\n\nvoid bbox_overlaps_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                       const int mode, const bool aligned, const int offset) {\n  bbox_overlaps_cpu_kernel(boxes1, boxes2, ious, mode, aligned, offset);\n}\n\nvoid bbox_overlaps_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\n\nREGISTER_DEVICE_IMPL(bbox_overlaps_impl, CPU, bbox_overlaps_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/bezier_align.cpp",
    "content": "// Modified from\n// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/BezierAlign\n// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include <ATen/ATen.h>\n#include <ATen/TensorUtils.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\n// implementation taken from Caffe2\ntemplate <typename T>\nstruct PreCalc {\n  int pos1;\n  int pos2;\n  int pos3;\n  int pos4;\n  T w1;\n  T w2;\n  T w3;\n  T w4;\n};\n\ntemplate <typename T>\nT bezier_curve(const T p0, const T p1, const T p2, const T p3, const T u) {\n  return ((1. - u) * (1. - u) * (1. - u) * p0 +\n          3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 +\n          u * u * u * p3);\n}\n\ntemplate <typename T>\nvoid pre_calc_for_bilinear_interpolate(\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int iy_upper, const int ix_upper, T p0_x,\n    T p0_y, T p1_x, T p1_y, T p2_x, T p2_y, T p3_x, T p3_y, T p4_x, T p4_y,\n    T p5_x, T p5_y, T p6_x, T p6_y, T p7_x, T p7_y, T bin_size_h, T bin_size_w,\n    int roi_bin_grid_h, int roi_bin_grid_w, T offset,\n    std::vector<PreCalc<T>> &pre_calc) {\n  int pre_calc_index = 0;\n  for (int ph = 0; ph < pooled_height; ph++) {\n    for (int pw = 0; pw < pooled_width; pw++) {\n      // compute the coords\n      const T u = pw / static_cast<T>(pooled_width);\n      const T v = ph / static_cast<T>(pooled_height);\n      const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);\n      const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);\n      const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);\n      const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);\n      const T x_center = x1 * v + x0 * (1. - v) - offset;\n      const T y_center = y1 * v + y0 * (1. 
- v) - offset;\n      for (int iy = 0; iy < iy_upper; iy++) {\n        const T yy = y_center - (T)0.5 * bin_size_h +\n                     static_cast<T>(iy + .5f) * bin_size_h /\n                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n        for (int ix = 0; ix < ix_upper; ix++) {\n          const T xx = x_center - (T)0.5 * bin_size_w +\n                       static_cast<T>(ix + .5f) * bin_size_w /\n                           static_cast<T>(roi_bin_grid_w);\n\n          T x = xx;\n          T y = yy;\n          // deal with: inverse elements are out of feature map boundary\n          if (y < -1.0 || y > height || x < -1.0 || x > width) {\n            // empty\n            PreCalc<T> pc;\n            pc.pos1 = 0;\n            pc.pos2 = 0;\n            pc.pos3 = 0;\n            pc.pos4 = 0;\n            pc.w1 = 0;\n            pc.w2 = 0;\n            pc.w3 = 0;\n            pc.w4 = 0;\n            pre_calc[pre_calc_index] = pc;\n            pre_calc_index += 1;\n            continue;\n          }\n\n          if (y <= 0) {\n            y = 0;\n          }\n          if (x <= 0) {\n            x = 0;\n          }\n\n          int y_low = (int)y;\n          int x_low = (int)x;\n          int y_high;\n          int x_high;\n\n          if (y_low >= height - 1) {\n            y_high = y_low = height - 1;\n            y = (T)y_low;\n          } else {\n            y_high = y_low + 1;\n          }\n\n          if (x_low >= width - 1) {\n            x_high = x_low = width - 1;\n            x = (T)x_low;\n          } else {\n            x_high = x_low + 1;\n          }\n\n          T ly = y - y_low;\n          T lx = x - x_low;\n          T hy = 1. - ly, hx = 1. 
- lx;\n          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n          // save weights and indices\n          PreCalc<T> pc;\n          pc.pos1 = y_low * width + x_low;\n          pc.pos2 = y_low * width + x_high;\n          pc.pos3 = y_high * width + x_low;\n          pc.pos4 = y_high * width + x_high;\n          pc.w1 = w1;\n          pc.w2 = w2;\n          pc.w3 = w3;\n          pc.w4 = w4;\n          pre_calc[pre_calc_index] = pc;\n\n          pre_calc_index += 1;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid BezierAlignForward(const int nthreads, const T *input, const T *rois,\n                        T *output, const int pooled_height,\n                        const int pooled_width, const T &spatial_scale,\n                        const int sampling_ratio, bool aligned,\n                        const int channels, const int height, const int width) {\n  int n_rois = nthreads / channels / pooled_width / pooled_height;\n  // (n, c, ph, pw) is an element in the pooled output\n  // can be parallelized using omp\n  // #pragma omp parallel for num_threads(32)\n  for (int n = 0; n < n_rois; n++) {\n    int index_n = n * channels * pooled_width * pooled_height;\n\n    // beziers have size Nx(1+8*2) = Nx17\n    const T *offset_rois = rois + n * 17;\n    int roi_batch_ind = offset_rois[0];\n\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    // Do not use rounding; this implementation detail is critical\n    T p0_x = offset_rois[1] * spatial_scale;\n    T p0_y = offset_rois[2] * spatial_scale;\n    T p1_x = offset_rois[3] * spatial_scale;\n    T p1_y = offset_rois[4] * spatial_scale;\n    T p2_x = offset_rois[5] * spatial_scale;\n    T p2_y = offset_rois[6] * spatial_scale;\n    T p3_x = offset_rois[7] * spatial_scale;\n    T p3_y = offset_rois[8] * spatial_scale;\n    T p4_x = offset_rois[15] * spatial_scale;\n    T p4_y = offset_rois[16] * spatial_scale;\n    T p5_x = offset_rois[13] * spatial_scale;\n    T p5_y = offset_rois[14] * spatial_scale;\n    T p6_x = offset_rois[11] * spatial_scale;\n    T p6_y = offset_rois[12] * spatial_scale;\n    T p7_x = offset_rois[9] * spatial_scale;\n    T p7_y = offset_rois[10] * spatial_scale;\n\n    T roi_width = std::max(std::abs(p0_x - p3_x), std::abs(p4_x - p7_x));\n    T roi_height = std::max(std::abs(p0_y - p3_y), std::abs(p4_y - p7_y));\n    if (aligned) {\n      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,\n                 \"Beziers in BezierAlign cannot have non-negative size!\");\n    } else {  // for backward-compatibility only\n      roi_width = std::max(roi_width, (T)1.);\n      roi_height = std::max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceil(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    // When the grid is empty, output zeros == 0/1, instead of NaN.\n    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    // we want to precalculate indices and weights shared by all channels,\n    // this is the key point of optimization\n    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *\n                                     pooled_width * pooled_height);\n    pre_calc_for_bilinear_interpolate(\n        height, width, pooled_height, pooled_width, roi_bin_grid_h,\n        roi_bin_grid_w, p0_x, p0_y, p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x,\n        p4_y, p5_x, p5_y, p6_x, p6_y, p7_x, p7_y, bin_size_h, bin_size_w,\n        roi_bin_grid_h, roi_bin_grid_w, offset, pre_calc);\n\n    for (int c = 0; c < channels; c++) {\n      int index_n_c = index_n + c * pooled_width * pooled_height;\n      const T *offset_input =\n          input + (roi_batch_ind * channels + c) * height * width;\n      int pre_calc_index = 0;\n\n      for (int ph = 0; ph < pooled_height; ph++) {\n        for (int pw = 0; pw < pooled_width; pw++) {\n          int index = index_n_c + ph * pooled_width + pw;\n\n          T output_val = 0.;\n          for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n            for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n              PreCalc<T> pc = pre_calc[pre_calc_index];\n              output_val += pc.w1 * offset_input[pc.pos1] +\n                            pc.w2 * offset_input[pc.pos2] +\n                            pc.w3 * offset_input[pc.pos3] +\n                            pc.w4 * offset_input[pc.pos4];\n\n              pre_calc_index += 1;\n            }\n          }\n          output_val /= count;\n\n          output[index] = output_val;\n        }  // for pw\n      }  // for ph\n    }  // for c\n  }  // for n\n}\n\ntemplate <typename T>\nvoid bilinear_interpolate_gradient(const int height, const int width, T y, T x,\n                                   T &w1, T &w2, T &w3, T &w4, int &x_low,\n                                   int &x_high, int &y_low, int &y_high,\n                                   const int index /* index for debug only*/) 
{\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) {\n    // empty\n    w1 = w2 = w3 = w4 = 0.;\n    x_low = x_high = y_low = y_high = -1;\n    return;\n  }\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  y_low = (int)y;\n  x_low = (int)x;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. - lx;\n\n  // reference in forward\n  // T v1 = input[y_low * width + x_low];\n  // T v2 = input[y_low * width + x_high];\n  // T v3 = input[y_high * width + x_low];\n  // T v4 = input[y_high * width + x_high];\n  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n}\n\ntemplate <class T>\ninline void add(T *address, const T &val) {\n  *address += val;\n}\n\ntemplate <typename T>\nvoid BezierAlignBackward(const int nthreads, const T *grad_output,\n                         const T *rois, T *grad_input, const int pooled_height,\n                         const int pooled_width, const T &spatial_scale,\n                         const int sampling_ratio, bool aligned,\n                         const int channels, const int height, const int width,\n                         const int n_stride, const int c_stride,\n                         const int h_stride, const int w_stride) {\n  for (int index = 0; index < nthreads; index++) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T *offset_rois = rois + n * 17;\n    int roi_batch_ind = 
offset_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? (T)0.5 : (T)0.0;\n    T p0_x = offset_rois[1] * spatial_scale;\n    T p0_y = offset_rois[2] * spatial_scale;\n    T p1_x = offset_rois[3] * spatial_scale;\n    T p1_y = offset_rois[4] * spatial_scale;\n    T p2_x = offset_rois[5] * spatial_scale;\n    T p2_y = offset_rois[6] * spatial_scale;\n    T p3_x = offset_rois[7] * spatial_scale;\n    T p3_y = offset_rois[8] * spatial_scale;\n    T p4_x = offset_rois[15] * spatial_scale;\n    T p4_y = offset_rois[16] * spatial_scale;\n    T p5_x = offset_rois[13] * spatial_scale;\n    T p5_y = offset_rois[14] * spatial_scale;\n    T p6_x = offset_rois[11] * spatial_scale;\n    T p6_y = offset_rois[12] * spatial_scale;\n    T p7_x = offset_rois[9] * spatial_scale;\n    T p7_y = offset_rois[10] * spatial_scale;\n\n    // compute the coords\n    const T u = pw / static_cast<T>(pooled_width);\n    const T v = ph / static_cast<T>(pooled_height);\n    const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u);\n    const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u);\n    const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u);\n    const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u);\n    const T x_center = x1 * v + x0 * (1. - v) - offset;\n    const T y_center = y1 * v + y0 * (1. 
- v) - offset;\n\n    T roi_width = std::max(std::abs(p0_x - p3_x), std::abs(p4_x - p7_x));\n    T roi_height = std::max(std::abs(p0_y - p3_y), std::abs(p4_y - p7_y));\n    if (aligned) {\n      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,\n                 \"Beziers in BezierAlign do not have non-negative size!\");\n    } else {  // for backward-compatibility only\n      roi_width = std::max(roi_width, (T)1.);\n      roi_height = std::max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    T *offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    int output_offset = n * n_stride + c * c_stride;\n    const T *offset_grad_output = grad_output + output_offset;\n    const T grad_output_this_bin =\n        offset_grad_output[ph * h_stride + pw * w_stride];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceil(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. 
= 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n      const T y = y_center - (T)0.5 * bin_size_h +\n                  static_cast<T>(iy + .5f) * bin_size_h /\n                      static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T x = x_center - (T)0.5 * bin_size_w +\n                    static_cast<T>(ix + .5f) * bin_size_w /\n                        static_cast<T>(roi_bin_grid_w);\n\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        T g1 = grad_output_this_bin * w1 / count;\n        T g2 = grad_output_this_bin * w2 / count;\n        T g3 = grad_output_this_bin * w3 / count;\n        T g4 = grad_output_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          // atomic add is not needed for now since it is single threaded\n          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));\n          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));\n          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));\n          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));\n        }  // if\n      }  // ix\n    }  // iy\n  }  // for\n}  // BezierAlignBackward\n\nvoid BezierAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,\n                                   int aligned_height, int aligned_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   bool aligned) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"BezierAlign_forward\", [&] {\n        
BezierAlignForward<scalar_t>(\n            output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),\n            output.data_ptr<scalar_t>(), aligned_height, aligned_width,\n            static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,\n            channels, height, width);\n      });\n}\n\nvoid BezierAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,\n                                    Tensor grad_input, int aligned_height,\n                                    int aligned_width, float spatial_scale,\n                                    int sampling_ratio, bool aligned) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  // get stride values to ensure indexing into gradients is correct.\n  int n_stride = grad_output.stride(0);\n  int c_stride = grad_output.stride(1);\n  int h_stride = grad_output.stride(2);\n  int w_stride = grad_output.stride(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"BezierAlign_backward\", [&] {\n        BezierAlignBackward<scalar_t>(\n            output_size, grad_output.data_ptr<scalar_t>(),\n            rois.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n            aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),\n            sampling_ratio, aligned, channels, height, width, n_stride,\n            c_stride, h_stride, w_stride);\n      });\n}\n\nvoid bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                               int aligned_height, int aligned_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned);\n\nvoid bezier_align_backward_impl(Tensor grad_output, Tensor rois,\n                                Tensor grad_input, int aligned_height,\n                                int aligned_width, float spatial_scale,\n                           
     int sampling_ratio, bool aligned);\n\nREGISTER_DEVICE_IMPL(bezier_align_forward_impl, CPU,\n                     BezierAlignForwardCPULauncher);\nREGISTER_DEVICE_IMPL(bezier_align_backward_impl, CPU,\n                     BezierAlignBackwardCPULauncher);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/box_iou_quadri.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"box_iou_rotated_utils.hpp\"\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T>\nvoid box_iou_quadri_cpu_kernel(const Tensor boxes1, const Tensor boxes2,\n                               Tensor ious, const int mode_flag,\n                               const bool aligned) {\n  int output_size = ious.numel();\n  auto num_boxes1 = boxes1.size(0);\n  auto num_boxes2 = boxes2.size(0);\n\n  if (aligned) {\n    for (int i = 0; i < output_size; i++) {\n      ious[i] = single_box_iou_quadri<T>(boxes1[i].data_ptr<T>(),\n                                         boxes2[i].data_ptr<T>(), mode_flag);\n    }\n  } else {\n    for (int i = 0; i < num_boxes1; i++) {\n      for (int j = 0; j < num_boxes2; j++) {\n        ious[i * num_boxes2 + j] = single_box_iou_quadri<T>(\n            boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);\n      }\n    }\n  }\n}\n\nvoid box_iou_quadri_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                        const int mode_flag, const bool aligned) {\n  box_iou_quadri_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);\n}\n\nvoid box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_quadri_impl, CPU, box_iou_quadri_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/box_iou_rotated.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp\n#include \"box_iou_rotated_utils.hpp\"\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T>\nvoid box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,\n                                Tensor ious, const int mode_flag,\n                                const bool aligned) {\n  int output_size = ious.numel();\n  auto num_boxes1 = boxes1.size(0);\n  auto num_boxes2 = boxes2.size(0);\n\n  if (aligned) {\n    for (int i = 0; i < output_size; i++) {\n      ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),\n                                          boxes2[i].data_ptr<T>(), mode_flag);\n    }\n  } else {\n    for (int i = 0; i < num_boxes1; i++) {\n      for (int j = 0; j < num_boxes2; j++) {\n        ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(\n            boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);\n      }\n    }\n  }\n}\n\nvoid box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned) {\n  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);\n}\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/deform_conv.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T>\nT deformable_im2col_bilinear_cpu(const T *input, const int data_width,\n                                 const int height, const int width, T h, T w) {\n  if (h <= -1 || height <= h || w <= -1 || width <= w) {\n    return 0;\n  }\n\n  int h_low = floor(h);\n  int w_low = floor(w);\n  int h_high = h_low + 1;\n  int w_high = w_low + 1;\n\n  T lh = h - h_low;\n  T lw = w - w_low;\n  T hh = 1 - lh, hw = 1 - lw;\n\n  T v1 = 0;\n  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];\n  T v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n    v2 = input[h_low * data_width + w_high];\n  T v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n    v3 = input[h_high * data_width + w_low];\n  T v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n    v4 = input[h_high * data_width + w_high];\n\n  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename T>\nT get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,\n                          const int height, const int width) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floor(argmax_h);\n  int argmax_w_low = floor(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n  if (h == argmax_h_low && w == argmax_w_low)\n    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);\n  if (h == argmax_h_low && w == argmax_w_high)\n    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);\n  if (h == argmax_h_high && w == argmax_w_low)\n    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);\n  if (h == argmax_h_high && w == argmax_w_high)\n    weight = (argmax_h + 1 - h) * (argmax_w + 1 - 
w);\n  return weight;\n}\n\ntemplate <typename T>\nT get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,\n                            const int width, const T *im_data,\n                            const int data_width, const int bp_dir) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floor(argmax_h);\n  int argmax_w_low = floor(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n\n  if (bp_dir == 0) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += -1 * (argmax_w - argmax_w_low) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_w - argmax_w_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  } else if (bp_dir == 1) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_h - argmax_h_low) *\n                
im_data[argmax_h_high * data_width + argmax_w_high];\n  }\n\n  return weight;\n}\n\ntemplate <typename T>\nvoid deformable_im2col_cpu_kernel(\n    const int n, const T *data_im, const T *data_offset, const int height,\n    const int width, const int kernel_h, const int kernel_w, const int pad_h,\n    const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int num_channels, const int deformable_group, const int height_col,\n    const int width_col, T *data_col) {\n  for (int index = 0; index < n; index++) {\n    // index index of output matrix\n    const int w_col = index % width_col;\n    const int h_col = (index / width_col) % height_col;\n    const int b_col = (index / width_col / height_col) % batch_size;\n    const int c_im = (index / width_col / height_col) / batch_size;\n    const int c_col = c_im * kernel_h * kernel_w;\n\n    // compute deformable group index\n    const int deformable_group_index = c_im / channel_per_deformable_group;\n\n    const int h_in = h_col * stride_h - pad_h;\n    const int w_in = w_col * stride_w - pad_w;\n    T *data_col_ptr =\n        data_col +\n        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;\n    const T *data_im_ptr =\n        data_im + (b_col * num_channels + c_im) * height * width;\n    const T *data_offset_ptr =\n        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        const int data_offset_h_ptr =\n            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;\n        const int data_offset_w_ptr =\n            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +\n            w_col;\n        const T offset_h = 
data_offset_ptr[data_offset_h_ptr];\n        const T offset_w = data_offset_ptr[data_offset_w_ptr];\n        T val = static_cast<T>(0);\n        const T h_im = h_in + i * dilation_h + offset_h;\n        const T w_im = w_in + j * dilation_w + offset_w;\n        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)\n          val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height,\n                                               width, h_im, w_im);\n        *data_col_ptr = val;\n        data_col_ptr += batch_size * height_col * width_col;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid deformable_col2im_cpu_kernel(\n    const int n, const T *data_col, const T *data_offset, const int channels,\n    const int height, const int width, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int deformable_group, const int height_col, const int width_col,\n    T *grad_im) {\n  for (int index = 0; index < n; index++) {\n    const int j = (index / width_col / height_col / batch_size) % kernel_w;\n    const int i =\n        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;\n    const int c =\n        index / width_col / height_col / batch_size / kernel_w / kernel_h;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / channel_per_deformable_group;\n\n    int w_out = index % width_col;\n    int h_out = (index / width_col) % height_col;\n    int b = (index / width_col / height_col) % batch_size;\n    int w_in = w_out * stride_w - pad_w;\n    int h_in = h_out * stride_h - pad_h;\n\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const int data_offset_h_ptr =\n        ((2 * (i 
* kernel_w + j)) * height_col + h_out) * width_col + w_out;\n    const int data_offset_w_ptr =\n        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;\n    const T offset_h = data_offset_ptr[data_offset_h_ptr];\n    const T offset_w = data_offset_ptr[data_offset_w_ptr];\n    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;\n    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;\n\n    const T cur_top_grad = data_col[index];\n    const int cur_h = (int)cur_inv_h_data;\n    const int cur_w = (int)cur_inv_w_data;\n    for (int dy = -2; dy <= 2; dy++) {\n      for (int dx = -2; dx <= 2; dx++) {\n        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&\n            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&\n            abs(cur_inv_w_data - (cur_w + dx)) < 1) {\n          int cur_bottom_grad_pos =\n              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;\n          T weight =\n              get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,\n                                      cur_h + dy, cur_w + dx, height, width);\n          *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid deformable_col2im_coord_cpu_kernel(\n    const int n, const T *data_col, const T *data_im, const T *data_offset,\n    const int channels, const int height, const int width, const int kernel_h,\n    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,\n    const int stride_w, const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int offset_channels, const int deformable_group, const int height_col,\n    const int width_col, T *grad_offset) {\n  for (int index = 0; index < n; index++) {\n    T val = 0;\n    int w = index % width_col;\n    int h = (index / width_col) % height_col;\n    int c = (index / width_col / height_col) % 
offset_channels;\n    int b = (index / width_col / height_col) / offset_channels;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / (2 * kernel_h * kernel_w);\n    const int col_step = kernel_h * kernel_w;\n    int cnt = 0;\n    const T *data_col_ptr = data_col + deformable_group_index *\n                                           channel_per_deformable_group *\n                                           batch_size * width_col * height_col;\n    const T *data_im_ptr =\n        data_im + (b * deformable_group + deformable_group_index) *\n                      channel_per_deformable_group / kernel_h / kernel_w *\n                      height * width;\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;\n\n    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;\n         col_c += col_step) {\n      const int col_pos =\n          (((col_c * batch_size + b) * height_col) + h) * width_col + w;\n      const int bp_dir = offset_c % 2;\n\n      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;\n      int i =\n          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;\n      int w_out = col_pos % width_col;\n      int h_out = (col_pos / width_col) % height_col;\n      int w_in = w_out * stride_w - pad_w;\n      int h_in = h_out * stride_h - pad_h;\n      const int data_offset_h_ptr =\n          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);\n      const int data_offset_w_ptr =\n          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +\n           w_out);\n      const T offset_h = data_offset_ptr[data_offset_h_ptr];\n      const T offset_w = data_offset_ptr[data_offset_w_ptr];\n      T inv_h = h_in + i * dilation_h + 
offset_h;\n      T inv_w = w_in + j * dilation_w + offset_w;\n      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)\n        inv_h = inv_w = -2;\n      const T weight = get_coordinate_weight_cpu(\n          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,\n          width, bp_dir);\n      val += weight * data_col_ptr[col_pos];\n      cnt += 1;\n    }\n\n    grad_offset[index] = val;\n  }\n}\n\nvoid deformable_im2col_cpu(Tensor data_im, Tensor data_offset,\n                           const int channels, const int height,\n                           const int width, const int ksize_h,\n                           const int ksize_w, const int pad_h, const int pad_w,\n                           const int stride_h, const int stride_w,\n                           const int dilation_h, const int dilation_w,\n                           const int parallel_imgs, const int deformable_group,\n                           Tensor data_col) {\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels = channels * height_col * width_col * parallel_imgs;\n  int channel_per_deformable_group = channels / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_im.scalar_type(), \"deformable_im2col_cpu\", [&] {\n        deformable_im2col_cpu_kernel<scalar_t>(\n            num_kernels, data_im.data_ptr<scalar_t>(),\n            data_offset.data_ptr<scalar_t>(), height, width, ksize_h, ksize_w,\n            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,\n            channel_per_deformable_group, parallel_imgs, channels,\n            deformable_group, height_col, width_col,\n            data_col.data_ptr<scalar_t>());\n      });\n}\n\nvoid deformable_col2im_cpu(Tensor data_col, Tensor data_offset,\n                           const int channels, const int height,\n             
              const int width, const int ksize_h,\n                           const int ksize_w, const int pad_h, const int pad_w,\n                           const int stride_h, const int stride_w,\n                           const int dilation_h, const int dilation_w,\n                           const int parallel_imgs, const int deformable_group,\n                           Tensor grad_im) {\n  // todo: make sure parallel_imgs is passed in correctly\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels =\n      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;\n  int channel_per_deformable_group = channels / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"deformable_col2im_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();\n\n        deformable_col2im_cpu_kernel<scalar_t>(\n            num_kernels, data_col_, data_offset_, channels, height, width,\n            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,\n            dilation_w, channel_per_deformable_group, parallel_imgs,\n            deformable_group, height_col, width_col, grad_im_);\n      }));\n}\n\nvoid deformable_col2im_coord_cpu(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset) {\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col 
=\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *\n                    deformable_group * parallel_imgs;\n  int channel_per_deformable_group =\n      channels * ksize_h * ksize_w / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"deformable_col2im_coord_cpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();\n\n        deformable_col2im_coord_cpu_kernel<scalar_t>(\n            num_kernels, data_col_, data_im_, data_offset_, channels, height,\n            width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,\n            dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,\n            2 * ksize_h * ksize_w * deformable_group, deformable_group,\n            height_col, width_col, grad_offset_);\n      }));\n}\n\nvoid deformable_im2col_impl(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_impl(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, 
const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_impl(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nREGISTER_DEVICE_IMPL(deformable_im2col_impl, CPU, deformable_im2col_cpu);\nREGISTER_DEVICE_IMPL(deformable_col2im_impl, CPU, deformable_col2im_cpu);\nREGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CPU,\n                     deformable_col2im_coord_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/modulated_deform_conv.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T>\nT dmcn_im2col_bilinear_cpu(const T *input, const int data_width,\n                           const int height, const int width, T h, T w) {\n  int h_low = floorf(h);\n  int w_low = floorf(w);\n  int h_high = h_low + 1;\n  int w_high = w_low + 1;\n\n  T lh = h - h_low;\n  T lw = w - w_low;\n  T hh = 1 - lh, hw = 1 - lw;\n\n  T v1 = 0;\n  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];\n  T v2 = 0;\n  if (h_low >= 0 && w_high <= width - 1)\n    v2 = input[h_low * data_width + w_high];\n  T v3 = 0;\n  if (h_high <= height - 1 && w_low >= 0)\n    v3 = input[h_high * data_width + w_low];\n  T v4 = 0;\n  if (h_high <= height - 1 && w_high <= width - 1)\n    v4 = input[h_high * data_width + w_high];\n\n  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;\n\n  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n  return val;\n}\n\ntemplate <typename T>\nT dmcn_get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,\n                               const int height, const int width) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n  if (h == argmax_h_low && w == argmax_w_low)\n    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);\n  if (h == argmax_h_low && w == argmax_w_high)\n    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);\n  if (h == argmax_h_high && w == argmax_w_low)\n    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);\n  if (h == argmax_h_high && w == argmax_w_high)\n    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);\n  return weight;\n}\n\ntemplate <typename T>\nT 
dmcn_get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,\n                                 const int width, const T *im_data,\n                                 const int data_width, const int bp_dir) {\n  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||\n      argmax_w >= width) {\n    // empty\n    return 0;\n  }\n\n  int argmax_h_low = floorf(argmax_h);\n  int argmax_w_low = floorf(argmax_w);\n  int argmax_h_high = argmax_h_low + 1;\n  int argmax_w_high = argmax_w_low + 1;\n\n  T weight = 0;\n\n  if (bp_dir == 0) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += -1 * (argmax_w - argmax_w_low) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += (argmax_w_low + 1 - argmax_w) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_w - argmax_w_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  } else if (bp_dir == 1) {\n    if (argmax_h_low >= 0 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_low];\n    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)\n      weight += (argmax_h_low + 1 - argmax_h) *\n                im_data[argmax_h_low * data_width + argmax_w_high];\n    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)\n      weight += -1 * (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_low];\n    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)\n      weight += (argmax_h - argmax_h_low) *\n                im_data[argmax_h_high * data_width + argmax_w_high];\n  
}\n\n  return weight;\n}\n\ntemplate <typename T>\nvoid modulated_deformable_im2col_cpu_kernel(\n    const int n, const T *data_im, const T *data_offset, const T *data_mask,\n    const int height, const int width, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int num_channels, const int deformable_group, const int height_col,\n    const int width_col, T *data_col) {\n  for (int index = 0; index < n; index++) {\n    // index index of output matrix\n    const int w_col = index % width_col;\n    const int h_col = (index / width_col) % height_col;\n    const int b_col = (index / width_col / height_col) % batch_size;\n    const int c_im = (index / width_col / height_col) / batch_size;\n    const int c_col = c_im * kernel_h * kernel_w;\n\n    // compute deformable group index\n    const int deformable_group_index = c_im / channel_per_deformable_group;\n\n    const int h_in = h_col * stride_h - pad_h;\n    const int w_in = w_col * stride_w - pad_w;\n\n    T *data_col_ptr =\n        data_col +\n        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;\n    const T *data_im_ptr =\n        data_im + (b_col * num_channels + c_im) * height * width;\n    const T *data_offset_ptr =\n        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n\n    const T *data_mask_ptr =\n        data_mask + (b_col * deformable_group + deformable_group_index) *\n                        kernel_h * kernel_w * height_col * width_col;\n\n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 0; j < kernel_w; ++j) {\n        const int data_offset_h_ptr =\n            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;\n        const int data_offset_w_ptr =\n         
   ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +\n            w_col;\n        const int data_mask_hw_ptr =\n            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;\n        const T offset_h = data_offset_ptr[data_offset_h_ptr];\n        const T offset_w = data_offset_ptr[data_offset_w_ptr];\n        const T mask = data_mask_ptr[data_mask_hw_ptr];\n        T val = static_cast<T>(0);\n        const T h_im = h_in + i * dilation_h + offset_h;\n        const T w_im = w_in + j * dilation_w + offset_w;\n        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)\n          val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width,\n                                         h_im, w_im);\n        *data_col_ptr = val * mask;\n        data_col_ptr += batch_size * height_col * width_col;\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid modulated_deformable_col2im_cpu_kernel(\n    const int n, const T *data_col, const T *data_offset, const T *data_mask,\n    const int channels, const int height, const int width, const int kernel_h,\n    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,\n    const int stride_w, const int dilation_h, const int dilation_w,\n    const int channel_per_deformable_group, const int batch_size,\n    const int deformable_group, const int height_col, const int width_col,\n    T *grad_im) {\n  for (int index = 0; index < n; index++) {\n    const int j = (index / width_col / height_col / batch_size) % kernel_w;\n    const int i =\n        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;\n    const int c =\n        index / width_col / height_col / batch_size / kernel_w / kernel_h;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / channel_per_deformable_group;\n\n    int w_out = index % width_col;\n    int h_out = (index / width_col) % height_col;\n    int b = (index / width_col / height_col) % batch_size;\n    
int w_in = w_out * stride_w - pad_w;\n    int h_in = h_out * stride_h - pad_h;\n\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const T *data_mask_ptr =\n        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *\n                        kernel_w * height_col * width_col;\n    const int data_offset_h_ptr =\n        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;\n    const int data_offset_w_ptr =\n        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;\n    const int data_mask_hw_ptr =\n        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;\n    const T offset_h = data_offset_ptr[data_offset_h_ptr];\n    const T offset_w = data_offset_ptr[data_offset_w_ptr];\n    const T mask = data_mask_ptr[data_mask_hw_ptr];\n    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;\n    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;\n\n    const T cur_top_grad = data_col[index] * mask;\n    const int cur_h = (int)cur_inv_h_data;\n    const int cur_w = (int)cur_inv_w_data;\n    for (int dy = -2; dy <= 2; dy++) {\n      for (int dx = -2; dx <= 2; dx++) {\n        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&\n            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&\n            abs(cur_inv_w_data - (cur_w + dx)) < 1) {\n          int cur_bottom_grad_pos =\n              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;\n          T weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data,\n                                                  cur_inv_w_data, cur_h + dy,\n                                                  cur_w + dx, height, width);\n          *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid 
modulated_deformable_col2im_coord_cpu_kernel(\n    const int n, const T *data_col, const T *data_im, const T *data_offset,\n    const T *data_mask, const int channels, const int height, const int width,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int channel_per_deformable_group,\n    const int batch_size, const int offset_channels, const int deformable_group,\n    const int height_col, const int width_col, T *grad_offset, T *grad_mask) {\n  for (int index = 0; index < n; index++) {\n    T val = 0, mval = 0;\n    int w = index % width_col;\n    int h = (index / width_col) % height_col;\n    int c = (index / width_col / height_col) % offset_channels;\n    int b = (index / width_col / height_col) / offset_channels;\n    // compute the start and end of the output\n\n    const int deformable_group_index = c / (2 * kernel_h * kernel_w);\n    const int col_step = kernel_h * kernel_w;\n    int cnt = 0;\n    const T *data_col_ptr = data_col + deformable_group_index *\n                                           channel_per_deformable_group *\n                                           batch_size * width_col * height_col;\n    const T *data_im_ptr =\n        data_im + (b * deformable_group + deformable_group_index) *\n                      channel_per_deformable_group / kernel_h / kernel_w *\n                      height * width;\n    const T *data_offset_ptr =\n        data_offset + (b * deformable_group + deformable_group_index) * 2 *\n                          kernel_h * kernel_w * height_col * width_col;\n    const T *data_mask_ptr =\n        data_mask + (b * deformable_group + deformable_group_index) * kernel_h *\n                        kernel_w * height_col * width_col;\n\n    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;\n\n    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;\n        
 col_c += col_step) {\n      const int col_pos =\n          (((col_c * batch_size + b) * height_col) + h) * width_col + w;\n      const int bp_dir = offset_c % 2;\n\n      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;\n      int i =\n          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;\n      int w_out = col_pos % width_col;\n      int h_out = (col_pos / width_col) % height_col;\n      int w_in = w_out * stride_w - pad_w;\n      int h_in = h_out * stride_h - pad_h;\n      const int data_offset_h_ptr =\n          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);\n      const int data_offset_w_ptr =\n          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +\n           w_out);\n      const int data_mask_hw_ptr =\n          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);\n      const T offset_h = data_offset_ptr[data_offset_h_ptr];\n      const T offset_w = data_offset_ptr[data_offset_w_ptr];\n      const T mask = data_mask_ptr[data_mask_hw_ptr];\n      T inv_h = h_in + i * dilation_h + offset_h;\n      T inv_w = w_in + j * dilation_w + offset_w;\n      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)\n        inv_h = inv_w = -2;\n      else\n        mval += data_col_ptr[col_pos] *\n                dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width,\n                                         width, height, width, inv_h, inv_w);\n      const T weight = dmcn_get_coordinate_weight_cpu(\n          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,\n          width, bp_dir);\n      val += weight * data_col_ptr[col_pos] * mask;\n      cnt += 1;\n    }\n    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);\n    grad_offset[index] = val;\n    if (offset_c % 2 == 0)\n      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +\n      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *\n      // height_col + h) * 
width_col + w], mask_req, mval);\n      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *\n                      kernel_w +\n                  offset_c / 2) *\n                     height_col +\n                 h) *\n                    width_col +\n                w] = mval;\n  }\n}\n\nvoid modulated_deformable_im2col_cpu(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col) {\n  // num_axes should be smaller than block size\n  const int channel_per_deformable_group = channels / deformable_group;\n  const int num_kernels = channels * batch_size * height_col * width_col;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_im.scalar_type(), \"modulated_deformable_im2col_cpu\", ([&] {\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n\n        modulated_deformable_im2col_cpu_kernel(\n            num_kernels, data_im_, data_offset_, data_mask_, height_im,\n            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n            dilation_h, dilation_w, channel_per_deformable_group, batch_size,\n            channels, deformable_group, height_col, width_col, data_col_);\n      }));\n}\n\nvoid modulated_deformable_col2im_cpu(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im) {\n  const int channel_per_deformable_group = channels / deformable_group;\n  const int num_kernels =\n      channels * kernel_h * kernel_w * batch_size * height_col * width_col;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"modulated_deformable_col2im_cpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();\n\n        modulated_deformable_col2im_cpu_kernel(\n            num_kernels, data_col_, data_offset_, data_mask_, channels,\n            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,\n            stride_w, dilation_h, dilation_w, channel_per_deformable_group,\n            batch_size, deformable_group, height_col, width_col, grad_im_);\n      }));\n}\n\nvoid modulated_deformable_col2im_coord_cpu(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask) {\n  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *\n                          kernel_w * deformable_group;\n  const int channel_per_deformable_group =\n      channels * kernel_h * kernel_w / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), 
\"modulated_deformable_col2im_coord_cpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();\n        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();\n\n        modulated_deformable_col2im_coord_cpu_kernel(\n            num_kernels, data_col_, data_im_, data_offset_, data_mask_,\n            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,\n            stride_h, stride_w, dilation_h, dilation_w,\n            channel_per_deformable_group, batch_size,\n            2 * kernel_h * kernel_w * deformable_group, deformable_group,\n            height_col, width_col, grad_offset_, grad_mask_);\n      }));\n}\n\nvoid modulated_deformable_im2col_impl(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col);\n\nvoid modulated_deformable_col2im_impl(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_impl(\n    const Tensor data_col, const Tensor data_im, const Tensor 
data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nREGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CPU,\n                     modulated_deformable_im2col_cpu);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CPU,\n                     modulated_deformable_col2im_cpu);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CPU,\n                     modulated_deformable_col2im_coord_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/nms.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nTensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  if (boxes.numel() == 0) {\n    return at::empty({0}, boxes.options().dtype(at::kLong));\n  }\n  auto x1_t = boxes.select(1, 0).contiguous();\n  auto y1_t = boxes.select(1, 1).contiguous();\n  auto x2_t = boxes.select(1, 2).contiguous();\n  auto y2_t = boxes.select(1, 3).contiguous();\n\n  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);\n\n  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));\n\n  auto nboxes = boxes.size(0);\n  Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));\n\n  auto select = select_t.data_ptr<bool>();\n  auto order = order_t.data_ptr<int64_t>();\n  auto x1 = x1_t.data_ptr<float>();\n  auto y1 = y1_t.data_ptr<float>();\n  auto x2 = x2_t.data_ptr<float>();\n  auto y2 = y2_t.data_ptr<float>();\n  auto areas = areas_t.data_ptr<float>();\n\n  for (int64_t _i = 0; _i < nboxes; _i++) {\n    if (select[_i] == false) continue;\n    auto i = order[_i];\n    auto ix1 = x1[i];\n    auto iy1 = y1[i];\n    auto ix2 = x2[i];\n    auto iy2 = y2[i];\n    auto iarea = areas[i];\n\n    for (int64_t _j = _i + 1; _j < nboxes; _j++) {\n      if (select[_j] == false) continue;\n      auto j = order[_j];\n      auto xx1 = std::max(ix1, x1[j]);\n      auto yy1 = std::max(iy1, y1[j]);\n      auto xx2 = std::min(ix2, x2[j]);\n      auto yy2 = std::min(iy2, y2[j]);\n\n      auto w = std::max(0.f, xx2 - xx1 + offset);\n      auto h = std::max(0.f, yy2 - yy1 + offset);\n      auto inter = w * h;\n      auto ovr = inter / (iarea + areas[j] - inter);\n      if (ovr > iou_threshold) select[_j] = false;\n    }\n  }\n  return order_t.masked_select(select_t);\n}\n\nTensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);\nREGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu);\n\nTensor 
softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,\n                   float iou_threshold, float sigma, float min_score,\n                   int method, int offset) {\n  if (boxes.numel() == 0) {\n    return at::empty({0}, boxes.options().dtype(at::kLong));\n  }\n\n  auto x1_t = boxes.select(1, 0).contiguous();\n  auto y1_t = boxes.select(1, 1).contiguous();\n  auto x2_t = boxes.select(1, 2).contiguous();\n  auto y2_t = boxes.select(1, 3).contiguous();\n  auto scores_t = scores.clone();\n\n  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);\n\n  auto nboxes = boxes.size(0);\n  auto x1 = x1_t.data_ptr<float>();\n  auto y1 = y1_t.data_ptr<float>();\n  auto x2 = x2_t.data_ptr<float>();\n  auto y2 = y2_t.data_ptr<float>();\n  auto sc = scores_t.data_ptr<float>();\n  auto areas = areas_t.data_ptr<float>();\n  auto de = dets.data_ptr<float>();\n\n  int64_t pos = 0;\n  Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));\n  auto inds = inds_t.data_ptr<int64_t>();\n\n  for (int64_t i = 0; i < nboxes; i++) {\n    auto max_score = sc[i];\n    auto max_pos = i;\n\n    pos = i + 1;\n    // get max box\n    while (pos < nboxes) {\n      if (max_score < sc[pos]) {\n        max_score = sc[pos];\n        max_pos = pos;\n      }\n      pos = pos + 1;\n    }\n    // swap\n    auto ix1 = de[i * 5 + 0] = x1[max_pos];\n    auto iy1 = de[i * 5 + 1] = y1[max_pos];\n    auto ix2 = de[i * 5 + 2] = x2[max_pos];\n    auto iy2 = de[i * 5 + 3] = y2[max_pos];\n    auto iscore = de[i * 5 + 4] = sc[max_pos];\n    auto iarea = areas[max_pos];\n    auto iind = inds[max_pos];\n    x1[max_pos] = x1[i];\n    y1[max_pos] = y1[i];\n    x2[max_pos] = x2[i];\n    y2[max_pos] = y2[i];\n    sc[max_pos] = sc[i];\n    areas[max_pos] = areas[i];\n    inds[max_pos] = inds[i];\n    x1[i] = ix1;\n    y1[i] = iy1;\n    x2[i] = ix2;\n    y2[i] = iy2;\n    sc[i] = iscore;\n    areas[i] = iarea;\n    inds[i] = iind;\n\n    pos = i + 1;\n    while (pos < nboxes) {\n      auto xx1 = 
std::max(ix1, x1[pos]);\n      auto yy1 = std::max(iy1, y1[pos]);\n      auto xx2 = std::min(ix2, x2[pos]);\n      auto yy2 = std::min(iy2, y2[pos]);\n\n      auto w = std::max(0.f, xx2 - xx1 + offset);\n      auto h = std::max(0.f, yy2 - yy1 + offset);\n      auto inter = w * h;\n      auto ovr = inter / (iarea + areas[pos] - inter);\n\n      float weight = 1.;\n      if (method == 0) {\n        if (ovr >= iou_threshold) weight = 0;\n      } else if (method == 1) {\n        if (ovr >= iou_threshold) weight = 1 - ovr;\n      } else if (method == 2) {\n        weight = std::exp(-(ovr * ovr) / sigma);\n      }\n      sc[pos] *= weight;\n      // if box score falls below threshold, discard the box by\n      // swapping with last box update N\n      if (sc[pos] < min_score) {\n        x1[pos] = x1[nboxes - 1];\n        y1[pos] = y1[nboxes - 1];\n        x2[pos] = x2[nboxes - 1];\n        y2[pos] = y2[nboxes - 1];\n        sc[pos] = sc[nboxes - 1];\n        areas[pos] = areas[nboxes - 1];\n        inds[pos] = inds[nboxes - 1];\n        nboxes = nboxes - 1;\n        pos = pos - 1;\n      }\n      pos = pos + 1;\n    }\n  }\n  return inds_t.slice(0, 0, nboxes);\n}\n\nTensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,\n                    float iou_threshold, float sigma, float min_score,\n                    int method, int offset);\nREGISTER_DEVICE_IMPL(softnms_impl, CPU, softnms_cpu);\n\nstd::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {\n  auto x1_t = dets.select(1, 0).contiguous();\n  auto y1_t = dets.select(1, 1).contiguous();\n  auto x2_t = dets.select(1, 2).contiguous();\n  auto y2_t = dets.select(1, 3).contiguous();\n  auto scores = dets.select(1, 4).contiguous();\n\n  at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);\n\n  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));\n\n  auto ndets = dets.size(0);\n  at::Tensor suppressed_t =\n      at::zeros({ndets}, 
dets.options().dtype(at::kByte).device(at::kCPU));\n\n  auto suppressed = suppressed_t.data_ptr<uint8_t>();\n  auto order = order_t.data_ptr<int64_t>();\n  auto x1 = x1_t.data_ptr<float>();\n  auto y1 = y1_t.data_ptr<float>();\n  auto x2 = x2_t.data_ptr<float>();\n  auto y2 = y2_t.data_ptr<float>();\n  auto areas = areas_t.data_ptr<float>();\n\n  std::vector<int> keep;\n  std::vector<std::vector<int> > matched;\n\n  for (int64_t _i = 0; _i < ndets; _i++) {\n    auto i = order[_i];\n    if (suppressed[i] == 1) continue;\n    keep.push_back(i);\n    std::vector<int> v_i;\n    auto ix1 = x1[i];\n    auto iy1 = y1[i];\n    auto ix2 = x2[i];\n    auto iy2 = y2[i];\n    auto iarea = areas[i];\n\n    for (int64_t _j = _i + 1; _j < ndets; _j++) {\n      auto j = order[_j];\n      if (suppressed[j] == 1) continue;\n      auto xx1 = std::max(ix1, x1[j]);\n      auto yy1 = std::max(iy1, y1[j]);\n      auto xx2 = std::min(ix2, x2[j]);\n      auto yy2 = std::min(iy2, y2[j]);\n\n      auto w = std::max(static_cast<float>(0), xx2 - xx1);\n      auto h = std::max(static_cast<float>(0), yy2 - yy1);\n      auto inter = w * h;\n      auto ovr = inter / (iarea + areas[j] - inter);\n      if (ovr >= iou_threshold) {\n        suppressed[j] = 1;\n        v_i.push_back(j);\n      }\n    }\n    matched.push_back(v_i);\n  }\n  for (size_t i = 0; i < keep.size(); i++)\n    matched[i].insert(matched[i].begin(), keep[i]);\n  return matched;\n}\n\nstd::vector<std::vector<int> > nms_match_impl(Tensor dets, float iou_threshold);\nREGISTER_DEVICE_IMPL(nms_match_impl, CPU, nms_match_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/nms_quadri.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"box_iou_rotated_utils.hpp\"\n#include \"pytorch_cpp_helper.hpp\"\n\ntemplate <typename scalar_t>\nTensor nms_quadri_cpu_kernel(const Tensor dets, const Tensor scores,\n                             const float iou_threshold) {\n  // nms_quadri_cpu_kernel is modified from torchvision's nms_cpu_kernel,\n  // however, the code in this function is much shorter because\n  // we delegate the IoU computation for quadri boxes to\n  // the single_box_iou_quadri function in box_iou_rotated_utils.h\n  AT_ASSERTM(!dets.is_cuda(), \"dets must be a CPU tensor\");\n  AT_ASSERTM(!scores.is_cuda(), \"scores must be a CPU tensor\");\n  AT_ASSERTM(dets.scalar_type() == scores.scalar_type(),\n             \"dets should have the same type as scores\");\n\n  if (dets.numel() == 0) {\n    return at::empty({0}, dets.options().dtype(at::kLong));\n  }\n\n  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));\n\n  auto ndets = dets.size(0);\n  Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));\n  Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));\n\n  auto suppressed = suppressed_t.data_ptr<uint8_t>();\n  auto keep = keep_t.data_ptr<int64_t>();\n  auto order = order_t.data_ptr<int64_t>();\n\n  int64_t num_to_keep = 0;\n\n  for (int64_t _i = 0; _i < ndets; _i++) {\n    auto i = order[_i];\n    if (suppressed[i] == 1) {\n      continue;\n    }\n\n    keep[num_to_keep++] = i;\n\n    for (int64_t _j = _i + 1; _j < ndets; _j++) {\n      auto j = order[_j];\n      if (suppressed[j] == 1) {\n        continue;\n      }\n\n      auto ovr = single_box_iou_quadri<scalar_t>(\n          dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);\n      if (ovr >= iou_threshold) {\n        suppressed[j] = 1;\n      }\n    }\n  }\n  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);\n}\n\nTensor nms_quadri_cpu(const Tensor dets, 
const Tensor scores,\n                      const float iou_threshold) {\n  auto result = at::empty({0}, dets.options());\n  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), \"nms_quadri\", [&] {\n    result = nms_quadri_cpu_kernel<scalar_t>(dets, scores, iou_threshold);\n  });\n  return result;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp\n#include \"box_iou_rotated_utils.hpp\"\n#include \"pytorch_cpp_helper.hpp\"\n\ntemplate <typename scalar_t>\nTensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,\n                              const float iou_threshold) {\n  // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,\n  // however, the code in this function is much shorter because\n  // we delegate the IoU computation for rotated boxes to\n  // the single_box_iou_rotated function in box_iou_rotated_utils.h\n  AT_ASSERTM(!dets.is_cuda(), \"dets must be a CPU tensor\");\n  AT_ASSERTM(!scores.is_cuda(), \"scores must be a CPU tensor\");\n  AT_ASSERTM(dets.scalar_type() == scores.scalar_type(),\n             \"dets should have the same type as scores\");\n\n  if (dets.numel() == 0) {\n    return at::empty({0}, dets.options().dtype(at::kLong));\n  }\n\n  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));\n\n  auto ndets = dets.size(0);\n  Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));\n  Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));\n\n  auto suppressed = suppressed_t.data_ptr<uint8_t>();\n  auto keep = keep_t.data_ptr<int64_t>();\n  auto order = order_t.data_ptr<int64_t>();\n\n  int64_t num_to_keep = 0;\n\n  for (int64_t _i = 0; _i < ndets; _i++) {\n    auto i = order[_i];\n    if (suppressed[i] == 1) {\n      continue;\n    }\n\n    keep[num_to_keep++] = i;\n\n    for (int64_t _j = _i + 1; _j < ndets; _j++) {\n      auto j = order[_j];\n      if (suppressed[j] == 1) {\n        continue;\n      }\n\n      auto ovr = single_box_iou_rotated<scalar_t>(\n          dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);\n      if (ovr >= iou_threshold) {\n        suppressed[j] = 1;\n  
    }\n    }\n  }\n  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);\n}\n\nTensor nms_rotated_cpu(const Tensor dets, const Tensor scores,\n                       const float iou_threshold) {\n  auto result = at::empty({0}, dets.options());\n  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), \"nms_rotated\", [&] {\n    result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);\n  });\n  return result;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// It is modified from https://github.com/WenmuZhou/PAN.pytorch\n\n#include <queue>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nstd::vector<std::vector<float>> estimate_confidence(int32_t* label,\n                                                    float* score, int label_num,\n                                                    int height, int width) {\n  std::vector<std::vector<float>> point_vector;\n  for (int i = 0; i < label_num; i++) {\n    std::vector<float> point;\n    point.push_back(0);\n    point.push_back(0);\n    point_vector.push_back(point);\n  }\n  for (int y = 0; y < height; y++) {\n    auto label_tmp = label + y * width;\n    auto score_tmp = score + y * width;\n    for (int x = 0; x < width; x++) {\n      auto l = label_tmp[x];\n      if (l > 0) {\n        float confidence = score_tmp[x];\n        point_vector[l].push_back(x);\n        point_vector[l].push_back(y);\n        point_vector[l][0] += confidence;\n        point_vector[l][1] += 1;\n      }\n    }\n  }\n  for (size_t l = 0; l < point_vector.size(); l++)\n    if (point_vector[l][1] > 0) {\n      point_vector[l][0] /= point_vector[l][1];\n    }\n  return point_vector;\n}\nstd::vector<std::vector<float>> pixel_group_cpu(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {\n  assert(score.dim() == 2);\n  assert(mask.dim() == 2);\n  assert(embedding.dim() == 3);\n  int height = score.size(0);\n  int width = score.size(1);\n  assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1));\n  assert(width == mask.size(1) == embedding.size(2) == kernel_label.size(2));\n\n  auto threshold_square = dis_threshold * dis_threshold;\n  auto ptr_score = score.data_ptr<float>();\n  auto ptr_mask = mask.data_ptr<bool>();\n  auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();\n  auto 
ptr_embedding = embedding.data_ptr<float>();\n  auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();\n  std::queue<std::tuple<int, int, int32_t>> contour_pixels;\n  auto embedding_dim = embedding.size(2);\n  std::vector<std::vector<float>> kernel_vector(\n      kernel_region_num, std::vector<float>(embedding_dim + 1, 0));\n\n  Tensor text_label;\n  text_label = kernel_label.clone();\n  auto ptr_text_label = text_label.data_ptr<int32_t>();\n\n  for (int i = 0; i < height; i++) {\n    auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;\n    auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;\n    auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;\n\n    for (int j = 0, k = 0; j < width && k < width * embedding_dim;\n         j++, k += embedding_dim) {\n      int32_t label = ptr_kernel_label_tmp[j];\n      if (label > 0) {\n        for (int d = 0; d < embedding_dim; d++)\n          kernel_vector[label][d] += ptr_embedding_tmp[k + d];\n        kernel_vector[label][embedding_dim] += 1;\n        // kernel pixel number\n        if (ptr_kernel_contour_tmp[j]) {\n          contour_pixels.push(std::make_tuple(i, j, label));\n        }\n      }\n    }\n  }\n  for (int i = 0; i < kernel_region_num; i++) {\n    for (int j = 0; j < embedding_dim; j++) {\n      kernel_vector[i][j] /= kernel_vector[i][embedding_dim];\n    }\n  }\n  int dx[4] = {-1, 1, 0, 0};\n  int dy[4] = {0, 0, -1, 1};\n  while (!contour_pixels.empty()) {\n    auto query_pixel = contour_pixels.front();\n    contour_pixels.pop();\n    int y = std::get<0>(query_pixel);\n    int x = std::get<1>(query_pixel);\n    int32_t l = std::get<2>(query_pixel);\n    auto kernel_cv = kernel_vector[l];\n    for (int idx = 0; idx < 4; idx++) {\n      int tmpy = y + dy[idx];\n      int tmpx = x + dx[idx];\n      auto ptr_text_label_tmp = ptr_text_label + tmpy * width;\n      if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;\n      if (!ptr_mask[tmpy * width + tmpx] || 
ptr_text_label_tmp[tmpx] > 0)\n        continue;\n\n      float dis = 0;\n      auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;\n      for (size_t i = 0; i < size_t(embedding_dim); i++) {\n        dis +=\n            pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);\n        // ignore further computing if dis is big enough\n        if (dis >= threshold_square) break;\n      }\n      if (dis >= threshold_square) continue;\n      contour_pixels.push(std::make_tuple(tmpy, tmpx, l));\n      ptr_text_label_tmp[tmpx] = l;\n    }\n  }\n\n  return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,\n                             height, width);\n}\nstd::vector<std::vector<float>> pixel_group_impl(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float dis_threshold);\nREGISTER_DEVICE_IMPL(pixel_group_impl, CPU, pixel_group_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/points_in_boxes.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n\ninline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,\n                                      float &local_x, float &local_y) {\n  float cosa = cos(-rz), sina = sin(-rz);\n  local_x = shift_x * cosa + shift_y * (-sina);\n  local_y = shift_x * sina + shift_y * cosa;\n}\n\ninline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,\n                                 float &local_x, float &local_y) {\n  // param pt: (x, y, z)\n  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,\n  // cz in the bottom center\n  float x = pt[0], y = pt[1], z = pt[2];\n  float cx = box3d[0], cy = box3d[1], cz = box3d[2];\n  float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];\n  cz += z_size /\n        2.0;  // shift to the center since cz in box3d is the bottom center\n\n  if (fabsf(z - cz) > z_size / 2.0) return 0;\n  lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);\n  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &\n                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);\n  return in_flag;\n}\n\nvoid points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor pts_indices_tensor) {\n  // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box DO NOT overlaps params pts:\n  // (npoints, 3) [x, y, z] in LiDAR coordinate params pts_indices: (N, npoints)\n\n  CHECK_CONTIGUOUS(boxes_tensor);\n  CHECK_CONTIGUOUS(pts_tensor);\n  CHECK_CONTIGUOUS(pts_indices_tensor);\n\n  int boxes_num = boxes_tensor.size(0);\n  int pts_num = pts_tensor.size(0);\n\n  const float *boxes = boxes_tensor.data_ptr<float>();\n  const float *pts = pts_tensor.data_ptr<float>();\n  int *pts_indices = pts_indices_tensor.data_ptr<int>();\n\n  float local_x = 0, local_y = 0;\n  for (int i = 0; i < boxes_num; i++) {\n    
for (int j = 0; j < pts_num; j++) {\n      int cur_in_flag =\n          check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);\n      pts_indices[i * pts_num + j] = cur_in_flag;\n    }\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/psamask.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/hszhao/semseg/blob/master/lib/psa/src\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\n#ifndef min\n#define min(a, b) (((a) < (b)) ? (a) : (b))\n#endif\n#ifndef max\n#define max(a, b) (((a) > (b)) ? (a) : (b))\n#endif\n\nvoid psamask_collect_forward(const int num_, const int h_feature,\n                             const int w_feature, const int h_mask,\n                             const int w_mask, const int half_h_mask,\n                             const int half_w_mask, const Tensor mask_data,\n                             Tensor buffer_data) {\n  for (int n = 0; n < num_; n++) {\n    for (int h = 0; h < h_feature; h++) {\n      for (int w = 0; w < w_feature; w++) {\n        // effective mask region : [hstart, hend) x [wstart, wend) with\n        // mask-indexed\n        const int hstart = max(0, half_h_mask - h);\n        const int hend = min(h_mask, h_feature + half_h_mask - h);\n        const int wstart = max(0, half_w_mask - w);\n        const int wend = min(w_mask, w_feature + half_w_mask - w);\n        // (hidx,                    widx                   ) with mask-indexed\n        // (hidx + h - half_h_mask, widx + w - half_w_mask) with\n        // feature-indexed\n        for (int hidx = hstart; hidx < hend; hidx++) {\n          for (int widx = wstart; widx < wend; widx++) {\n            buffer_data.view({-1})[(n * h_feature * w_feature +\n                                    (hidx + h - half_h_mask) * w_feature +\n                                    (widx + w - half_w_mask)) *\n                                       h_feature * w_feature +\n                                   h * w_feature + w] =\n                mask_data.view(\n                    {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *\n                               h_feature +\n                           h) *\n                              w_feature 
+\n                          w];\n          }\n        }\n      }\n    }\n  }\n}\n\nvoid psamask_distribute_forward(const int num_, const int h_feature,\n                                const int w_feature, const int h_mask,\n                                const int w_mask, const int half_h_mask,\n                                const int half_w_mask, const Tensor mask_data,\n                                Tensor buffer_data) {\n  for (int n = 0; n < num_; n++) {\n    for (int h = 0; h < h_feature; h++) {\n      for (int w = 0; w < w_feature; w++) {\n        // effective mask region : [hstart, hend) x [wstart, wend) with\n        // mask-indexed\n        const int hstart = max(0, half_h_mask - h);\n        const int hend = min(h_mask, h_feature + half_h_mask - h);\n        const int wstart = max(0, half_w_mask - w);\n        const int wend = min(w_mask, w_feature + half_w_mask - w);\n        // (hidx,                    widx                   ) with mask-indexed\n        // (hidx + h - half_h_mask, widx + w - half_w_mask) with\n        // feature-indexed\n        for (int hidx = hstart; hidx < hend; hidx++) {\n          for (int widx = wstart; widx < wend; widx++) {\n            buffer_data.view(\n                {-1})[(n * h_feature * w_feature + h * w_feature + w) *\n                          h_feature * w_feature +\n                      (hidx + h - half_h_mask) * w_feature +\n                      (widx + w - half_w_mask)] =\n                mask_data.view(\n                    {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *\n                               h_feature +\n                           h) *\n                              w_feature +\n                          w];\n          }\n        }\n      }\n    }\n  }\n}\n\nvoid psamask_collect_backward(const int num_, const int h_feature,\n                              const int w_feature, const int h_mask,\n                              const int w_mask, const int half_h_mask,\n                         
     const int half_w_mask, const Tensor buffer_diff,\n                              Tensor mask_diff) {\n  for (int n = 0; n < num_; n++) {\n    for (int h = 0; h < h_feature; h++) {\n      for (int w = 0; w < w_feature; w++) {\n        // effective mask region : [hstart, hend) x [wstart, wend) with\n        // mask-indexed\n        const int hstart = max(0, half_h_mask - h);\n        const int hend = min(h_mask, h_feature + half_h_mask - h);\n        const int wstart = max(0, half_w_mask - w);\n        const int wend = min(w_mask, w_feature + half_w_mask - w);\n        // (hidx,                    widx                   ) with mask-indexed\n        // (hidx + h - half_h_mask, widx + w - half_w_mask) with\n        // feature-indexed\n        for (int hidx = hstart; hidx < hend; hidx++) {\n          for (int widx = wstart; widx < wend; widx++) {\n            mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *\n                                      h_feature +\n                                  h) *\n                                     w_feature +\n                                 w] =\n                buffer_diff.view({-1})[(n * h_feature * w_feature +\n                                        (hidx + h - half_h_mask) * w_feature +\n                                        (widx + w - half_w_mask)) *\n                                           h_feature * w_feature +\n                                       h * w_feature + w];\n          }\n        }\n      }\n    }\n  }\n}\n\nvoid psamask_distribute_backward(const int num_, const int h_feature,\n                                 const int w_feature, const int h_mask,\n                                 const int w_mask, const int half_h_mask,\n                                 const int half_w_mask,\n                                 const Tensor buffer_diff, Tensor mask_diff) {\n  for (int n = 0; n < num_; n++) {\n    for (int h = 0; h < h_feature; h++) {\n      for (int w = 0; w < w_feature; w++) {\n   
     // effective mask region : [hstart, hend) x [wstart, wend) with\n        // mask-indexed\n        const int hstart = max(0, half_h_mask - h);\n        const int hend = min(h_mask, h_feature + half_h_mask - h);\n        const int wstart = max(0, half_w_mask - w);\n        const int wend = min(w_mask, w_feature + half_w_mask - w);\n        // (hidx,                    widx                   ) with mask-indexed\n        // (hidx + h - half_h_mask, widx + w - half_w_mask) with\n        // feature-indexed\n        for (int hidx = hstart; hidx < hend; hidx++) {\n          for (int widx = wstart; widx < wend; widx++) {\n            mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *\n                                      h_feature +\n                                  h) *\n                                     w_feature +\n                                 w] =\n                buffer_diff.view(\n                    {-1})[(n * h_feature * w_feature + h * w_feature + w) *\n                              h_feature * w_feature +\n                          (hidx + h - half_h_mask) * w_feature +\n                          (widx + w - half_w_mask)];\n          }\n        }\n      }\n    }\n  }\n}\n\nvoid psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,\n                         const int num_, const int h_feature,\n                         const int w_feature, const int h_mask,\n                         const int w_mask, const int half_h_mask,\n                         const int half_w_mask) {\n  if (psa_type == 0)\n    psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,\n                            half_h_mask, half_w_mask, input, output);\n  else\n    psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,\n                               half_h_mask, half_w_mask, input, output);\n}\n\nvoid psamask_backward_cpu(const int psa_type, const Tensor grad_output,\n                          Tensor grad_input, 
const int num_,\n                          const int h_feature, const int w_feature,\n                          const int h_mask, const int w_mask,\n                          const int half_h_mask, const int half_w_mask) {\n  if (psa_type == 0)\n    psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,\n                             half_h_mask, half_w_mask, grad_output, grad_input);\n  else\n    psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,\n                                half_h_mask, half_w_mask, grad_output,\n                                grad_input);\n}\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask);\nREGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu);\nREGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/roi_align.cpp",
    "content": "// Modified from\n// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign\n// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include <ATen/ATen.h>\n#include <ATen/TensorUtils.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\n// implementation taken from Caffe2\ntemplate <typename T>\nstruct PreCalc {\n  int pos1;\n  int pos2;\n  int pos3;\n  int pos4;\n  T w1;\n  T w2;\n  T w3;\n  T w4;\n};\n\ntemplate <typename T>\nvoid pre_calc_for_bilinear_interpolate(\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int iy_upper, const int ix_upper,\n    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,\n    int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc<T>>& pre_calc) {\n  int pre_calc_index = 0;\n  for (int ph = 0; ph < pooled_height; ph++) {\n    for (int pw = 0; pw < pooled_width; pw++) {\n      for (int iy = 0; iy < iy_upper; iy++) {\n        const T yy = roi_start_h + ph * bin_size_h +\n                     static_cast<T>(iy + .5f) * bin_size_h /\n                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n        for (int ix = 0; ix < ix_upper; ix++) {\n          const T xx = roi_start_w + pw * bin_size_w +\n                       static_cast<T>(ix + .5f) * bin_size_w /\n                           static_cast<T>(roi_bin_grid_w);\n\n          T x = xx;\n          T y = yy;\n          // deal with: inverse elements are out of feature map boundary\n          if (y < -1.0 || y > height || x < -1.0 || x > width) {\n            // empty\n            PreCalc<T> pc;\n            pc.pos1 = 0;\n            pc.pos2 = 0;\n            pc.pos3 = 0;\n            pc.pos4 = 0;\n            pc.w1 = 0;\n            pc.w2 = 0;\n            pc.w3 = 0;\n            pc.w4 = 0;\n            pre_calc[pre_calc_index] = pc;\n            pre_calc_index += 1;\n            continue;\n          }\n\n  
        if (y <= 0) {\n            y = 0;\n          }\n          if (x <= 0) {\n            x = 0;\n          }\n\n          int y_low = (int)y;\n          int x_low = (int)x;\n          int y_high;\n          int x_high;\n\n          if (y_low >= height - 1) {\n            y_high = y_low = height - 1;\n            y = (T)y_low;\n          } else {\n            y_high = y_low + 1;\n          }\n\n          if (x_low >= width - 1) {\n            x_high = x_low = width - 1;\n            x = (T)x_low;\n          } else {\n            x_high = x_low + 1;\n          }\n\n          T ly = y - y_low;\n          T lx = x - x_low;\n          T hy = 1. - ly, hx = 1. - lx;\n          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n          // save weights and indices\n          PreCalc<T> pc;\n          pc.pos1 = y_low * width + x_low;\n          pc.pos2 = y_low * width + x_high;\n          pc.pos3 = y_high * width + x_low;\n          pc.pos4 = y_high * width + x_high;\n          pc.w1 = w1;\n          pc.w2 = w2;\n          pc.w3 = w3;\n          pc.w4 = w4;\n          pre_calc[pre_calc_index] = pc;\n\n          pre_calc_index += 1;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid ROIAlignForward(const int nthreads, const T* input, const T* rois,\n                     T* output, T* argmax_y, T* argmax_x,\n                     const int pooled_height, const int pooled_width,\n                     const T spatial_scale, const int sampling_ratio,\n                     const int pool_mode,  // 0 - max pool, 1 - avg pool\n                     const bool aligned, const int channels, const int height,\n                     const int width) {\n  int n_rois = nthreads / channels / pooled_width / pooled_height;\n  // (n, c, ph, pw) is an element in the pooled output\n  // can be parallelized using omp\n  // #pragma omp parallel for num_threads(32)\n  for (int n = 0; n < n_rois; n++) {\n    int index_n = n * channels * pooled_width * pooled_height;\n\n   
 const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? (T)0.5 : (T)0.0;\n    T roi_start_w = offset_rois[1] * spatial_scale - offset;\n    T roi_start_h = offset_rois[2] * spatial_scale - offset;\n    T roi_end_w = offset_rois[3] * spatial_scale - offset;\n    T roi_end_h = offset_rois[4] * spatial_scale - offset;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n    if (aligned) {\n      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,\n                 \"ROIs in ROIAlign cannot have non-negative size!\");\n    } else {  // for backward-compatibility only\n      roi_width = std::max(roi_width, (T)1.);\n      roi_height = std::max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // When the grid is empty, output zeros == 0/1, instead of NaN.\n    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    // we want to precalculate indices and weights shared by all channels,\n    // this is the key point of optimization\n    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *\n                                     pooled_width * pooled_height);\n    pre_calc_for_bilinear_interpolate(\n        height, width, pooled_height, pooled_width, roi_bin_grid_h,\n        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,\n        roi_bin_grid_h, roi_bin_grid_w, pre_calc);\n\n    for (int c = 0; c < channels; c++) {\n      int index_n_c = index_n + c * pooled_width * pooled_height;\n      const T* offset_input =\n          input + (roi_batch_ind * channels + c) * height * width;\n      int pre_calc_index = 0;\n\n      for (int ph = 0; ph < pooled_height; ph++) {\n        for (int pw = 0; pw < pooled_width; pw++) {\n          int index = index_n_c + ph * pooled_width + pw;\n\n          T output_val = 0.;\n          T maxval = -10000;\n          T maxidx_y = -1.f, maxidx_x = -1.f;\n          for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n            const T y = roi_start_h + ph * bin_size_h +\n                        static_cast<T>(iy + .5f) * bin_size_h /\n                            static_cast<T>(roi_bin_grid_h);\n            for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n              const T x = roi_start_w + pw * bin_size_w +\n                          static_cast<T>(ix + .5f) * bin_size_w /\n                              static_cast<T>(roi_bin_grid_w);\n              PreCalc<T> pc = pre_calc[pre_calc_index];\n              T val = pc.w1 * offset_input[pc.pos1] +\n                      pc.w2 * offset_input[pc.pos2] +\n                      pc.w3 * offset_input[pc.pos3] +\n                      pc.w4 * offset_input[pc.pos4];\n              if (val > maxval) {\n                maxval = val;\n                maxidx_y = y;\n                maxidx_x = x;\n              }\n              output_val += val;\n              pre_calc_index += 
1;\n            }\n          }\n          if (pool_mode == 0) {\n            // We do max pooling inside a bin\n            output[index] = maxval;\n            argmax_y[index] = maxidx_y;\n            argmax_x[index] = maxidx_x;\n          } else if (pool_mode == 1) {\n            // We do average (integral) pooling inside a bin\n            output[index] = output_val / count;\n          }  // if\n        }  // for pw\n      }  // for ph\n    }  // for c\n  }  // for n\n}\n\ntemplate <typename T>\nvoid bilinear_interpolate_gradient(const int height, const int width, T y, T x,\n                                   T& w1, T& w2, T& w3, T& w4, int& x_low,\n                                   int& x_high, int& y_low, int& y_high,\n                                   const int index /* index for debug only*/) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) {\n    // empty\n    w1 = w2 = w3 = w4 = 0.;\n    x_low = x_high = y_low = y_high = -1;\n    return;\n  }\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  y_low = (int)y;\n  x_low = (int)x;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. 
- lx;\n\n  // reference in forward\n  // T v1 = input[y_low * width + x_low];\n  // T v2 = input[y_low * width + x_high];\n  // T v3 = input[y_high * width + x_low];\n  // T v4 = input[y_high * width + x_high];\n  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  return;\n}\n\ntemplate <class T>\ninline void add(T* address, const T& val) {\n  *address += val;\n}\n\ntemplate <typename T>\nvoid ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,\n                      const T* argmax_y, const T* argmax_x, T* grad_input,\n                      const int pooled_height, const int pooled_width,\n                      const T spatial_scale, const int sampling_ratio,\n                      const int pool_mode,  // 0 - max pool, 1 - avg pool\n                      const bool aligned, const int channels, const int height,\n                      const int width, const int n_stride, const int c_stride,\n                      const int h_stride, const int w_stride) {\n  for (int index = 0; index < nthreads; index++) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* offset_rois = rois + n * 5;\n    int roi_batch_ind = offset_rois[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    T roi_start_w = offset_rois[1] * spatial_scale - offset;\n    T roi_start_h = offset_rois[2] * spatial_scale - offset;\n    T roi_end_w = offset_rois[3] * spatial_scale - offset;\n    T roi_end_h = offset_rois[4] * spatial_scale - offset;\n\n    T roi_width = roi_end_w - roi_start_w;\n    T roi_height = roi_end_h - roi_start_h;\n    if (aligned) {\n      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,\n                 \"ROIs in ROIAlign do not have non-negative size!\");\n    } else {  // for backward-compatibility only\n      roi_width = std::max(roi_width, (T)1.);\n      roi_height = std::max(roi_height, (T)1.);\n    }\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    T* offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    int output_offset = n * n_stride + c * c_stride;\n    const T* offset_grad_output = grad_output + output_offset;\n    const T grad_output_this_bin =\n        offset_grad_output[ph * h_stride + pw * w_stride];\n\n    if (pool_mode == 0) {\n      // We do max pooling inside a bin\n      T y = argmax_y[index], x = argmax_x[index];\n      if (y != -1.f) {\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high, index);\n\n        T g1 = grad_output_this_bin * w1;\n        T g2 = grad_output_this_bin * w2;\n        T g3 = grad_output_this_bin * w3;\n        T g4 = grad_output_this_bin * w4;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          // atomic add is not needed for now since it is single threaded\n          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));\n          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));\n          
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));\n          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));\n        }  // if\n      }  // mode\n    } else if (pool_mode == 1) {\n      // We do average (integral) pooling inside a bin\n      // We use roi_bin_grid to sample the grid and mimic integral\n      int roi_bin_grid_h =\n          (sampling_ratio > 0)\n              ? sampling_ratio\n              : ceilf(roi_height / pooled_height);  // e.g., = 2\n      int roi_bin_grid_w = (sampling_ratio > 0)\n                               ? sampling_ratio\n                               : ceilf(roi_width / pooled_width);\n\n      const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4\n      for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n        const T y = roi_start_h + ph * bin_size_h +\n                    static_cast<T>(iy + .5f) * bin_size_h /\n                        static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n        for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n          const T x = roi_start_w + pw * bin_size_w +\n                      static_cast<T>(ix + .5f) * bin_size_w /\n                          static_cast<T>(roi_bin_grid_w);\n\n          T w1, w2, w3, w4;\n          int x_low, x_high, y_low, y_high;\n\n          bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                        x_low, x_high, y_low, y_high, index);\n\n          T g1 = grad_output_this_bin * w1 / count;\n          T g2 = grad_output_this_bin * w2 / count;\n          T g3 = grad_output_this_bin * w3 / count;\n          T g4 = grad_output_this_bin * w4 / count;\n\n          if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n            // atomic add is not needed for now since it is single threaded\n            add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));\n            add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));\n            
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));\n            add(offset_grad_input + y_high * width + x_high,\n                static_cast<T>(g4));\n          }  // if\n        }  // ix\n      }  // iy\n    }  // mode\n  }  // for\n}  // ROIAlignBackward\n\nvoid ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,\n                                Tensor argmax_y, Tensor argmax_x,\n                                int aligned_height, int aligned_width,\n                                float spatial_scale, int sampling_ratio,\n                                int pool_mode, bool aligned) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"ROIAlign_forward\", [&] {\n        ROIAlignForward<scalar_t>(\n            output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),\n            output.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),\n            argmax_x.data_ptr<scalar_t>(), aligned_height, aligned_width,\n            static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,\n            aligned, channels, height, width);\n      });\n}\n\nvoid ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,\n                                 Tensor argmax_y, Tensor argmax_x,\n                                 Tensor grad_input, int aligned_height,\n                                 int aligned_width, float spatial_scale,\n                                 int sampling_ratio, int pool_mode,\n                                 bool aligned) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  // get stride values to ensure indexing into gradients is correct.\n  int n_stride = grad_output.stride(0);\n  int c_stride = grad_output.stride(1);\n  int h_stride = 
grad_output.stride(2);\n  int w_stride = grad_output.stride(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"ROIAlign_backward\", [&] {\n        ROIAlignBackward<scalar_t>(\n            output_size, grad_output.data_ptr<scalar_t>(),\n            rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),\n            argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n            aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),\n            sampling_ratio, pool_mode, aligned, channels, height, width,\n            n_stride, c_stride, h_stride, w_stride);\n      });\n}\n\nvoid roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                           int aligned_width, float spatial_scale,\n                           int sampling_ratio, int pool_mode, bool aligned) {\n  ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,\n                             aligned_height, aligned_width, spatial_scale,\n                             sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                            Tensor argmax_x, Tensor grad_input,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,\n                              aligned_height, aligned_width, spatial_scale,\n                              sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int 
sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu);\nREGISTER_DEVICE_IMPL(roi_align_backward_impl, CPU, roi_align_backward_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp",
    "content": "// Modified from\n// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated\n// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include <ATen/ATen.h>\n#include <ATen/TensorUtils.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\n// implementation taken from Caffe2\ntemplate <typename T>\nstruct PreCalc {\n  int pos1;\n  int pos2;\n  int pos3;\n  int pos4;\n  T w1;\n  T w2;\n  T w3;\n  T w4;\n};\n\ntemplate <typename T>\nvoid pre_calc_for_bilinear_interpolate(\n    const int height, const int width, const int pooled_height,\n    const int pooled_width, const int iy_upper, const int ix_upper,\n    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,\n    int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w,\n    T cos_theta, T sin_theta, std::vector<PreCalc<T>>& pre_calc) {\n  int pre_calc_index = 0;\n  for (int ph = 0; ph < pooled_height; ph++) {\n    for (int pw = 0; pw < pooled_width; pw++) {\n      for (int iy = 0; iy < iy_upper; iy++) {\n        const T yy = roi_start_h + ph * bin_size_h +\n                     static_cast<T>(iy + .5f) * bin_size_h /\n                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n        for (int ix = 0; ix < ix_upper; ix++) {\n          const T xx = roi_start_w + pw * bin_size_w +\n                       static_cast<T>(ix + .5f) * bin_size_w /\n                           static_cast<T>(roi_bin_grid_w);\n\n          // Rotate by theta around the center and translate\n          // In image space, (y, x) is the order for Right Handed System,\n          // and this is essentially multiplying the point by a rotation matrix\n          // to rotate it counterclockwise through angle theta.\n          T y = yy * cos_theta - xx * sin_theta + roi_center_h;\n          T x = yy * sin_theta + xx * cos_theta + roi_center_w;\n          // deal with: inverse elements are out of feature map 
boundary\n          if (y < -1.0 || y > height || x < -1.0 || x > width) {\n            // empty\n            PreCalc<T> pc;\n            pc.pos1 = 0;\n            pc.pos2 = 0;\n            pc.pos3 = 0;\n            pc.pos4 = 0;\n            pc.w1 = 0;\n            pc.w2 = 0;\n            pc.w3 = 0;\n            pc.w4 = 0;\n            pre_calc[pre_calc_index] = pc;\n            pre_calc_index += 1;\n            continue;\n          }\n\n          if (y < 0) {\n            y = 0;\n          }\n          if (x < 0) {\n            x = 0;\n          }\n\n          int y_low = (int)y;\n          int x_low = (int)x;\n          int y_high;\n          int x_high;\n\n          if (y_low >= height - 1) {\n            y_high = y_low = height - 1;\n            y = (T)y_low;\n          } else {\n            y_high = y_low + 1;\n          }\n\n          if (x_low >= width - 1) {\n            x_high = x_low = width - 1;\n            x = (T)x_low;\n          } else {\n            x_high = x_low + 1;\n          }\n\n          T ly = y - y_low;\n          T lx = x - x_low;\n          T hy = 1. - ly, hx = 1. 
- lx;\n          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n          // save weights and indices\n          PreCalc<T> pc;\n          pc.pos1 = y_low * width + x_low;\n          pc.pos2 = y_low * width + x_high;\n          pc.pos3 = y_high * width + x_low;\n          pc.pos4 = y_high * width + x_high;\n          pc.w1 = w1;\n          pc.w2 = w2;\n          pc.w3 = w3;\n          pc.w4 = w4;\n          pre_calc[pre_calc_index] = pc;\n\n          pre_calc_index += 1;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename T>\nvoid ROIAlignRotatedForward(const int nthreads, const T* input,\n                            const T& spatial_scale, const bool aligned,\n                            const bool clockwise, const int channels,\n                            const int height, const int width,\n                            const int pooled_height, const int pooled_width,\n                            const int sampling_ratio, const T* rois,\n                            T* output) {\n  int n_rois = nthreads / channels / pooled_width / pooled_height;\n  // (n, c, ph, pw) is an element in the pooled output\n  // can be parallelized using omp\n  // #pragma omp parallel for num_threads(32)\n  for (int n = 0; n < n_rois; n++) {\n    int index_n = n * channels * pooled_width * pooled_height;\n\n    const T* current_roi = rois + n * 6;\n    int roi_batch_ind = current_roi[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? 
(T)0.5 : (T)0.0;\n    T roi_center_w = current_roi[1] * spatial_scale - offset;\n    T roi_center_h = current_roi[2] * spatial_scale - offset;\n    T roi_width = current_roi[3] * spatial_scale;\n    T roi_height = current_roi[4] * spatial_scale;\n    T theta = current_roi[5];\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    T cos_theta = cos(theta);\n    T sin_theta = sin(theta);\n\n    if (aligned) {\n      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,\n                 \"ROIs in ROIAlignRotated do not have non-negative size!\");\n    } else {  // for backward-compatibility only\n      roi_width = std::max(roi_width, (T)1.);\n      roi_height = std::max(roi_height, (T)1.);\n    }\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // We do average (integral) pooling inside a bin\n    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. 
= 4\n\n    // we want to precalculate indices and weights shared by all channels,\n    // this is the key point of optimization\n    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *\n                                     pooled_width * pooled_height);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    T roi_start_h = -roi_height / 2.0;\n    T roi_start_w = -roi_width / 2.0;\n\n    pre_calc_for_bilinear_interpolate(\n        height, width, pooled_height, pooled_width, roi_bin_grid_h,\n        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,\n        roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,\n        sin_theta, pre_calc);\n\n    for (int c = 0; c < channels; c++) {\n      int index_n_c = index_n + c * pooled_width * pooled_height;\n      const T* offset_input =\n          input + (roi_batch_ind * channels + c) * height * width;\n      int pre_calc_index = 0;\n\n      for (int ph = 0; ph < pooled_height; ph++) {\n        for (int pw = 0; pw < pooled_width; pw++) {\n          int index = index_n_c + ph * pooled_width + pw;\n\n          T output_val = 0.;\n          for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n            for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n              PreCalc<T> pc = pre_calc[pre_calc_index];\n              output_val += pc.w1 * offset_input[pc.pos1] +\n                            pc.w2 * offset_input[pc.pos2] +\n                            pc.w3 * offset_input[pc.pos3] +\n                            pc.w4 * offset_input[pc.pos4];\n\n              pre_calc_index += 1;\n            }\n          }\n          output_val /= count;\n\n          output[index] = output_val;\n        }  // for pw\n      }  // for ph\n    }  // for c\n  }  // for n\n}\n\ntemplate <typename T>\nvoid bilinear_interpolate_gradient(const int height, const int width, T y, T x,\n                                   
T& w1, T& w2, T& w3, T& w4, int& x_low,\n                                   int& x_high, int& y_low, int& y_high) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) {\n    // empty\n    w1 = w2 = w3 = w4 = 0.;\n    x_low = x_high = y_low = y_high = -1;\n    return;\n  }\n\n  if (y < 0) {\n    y = 0;\n  }\n\n  if (x < 0) {\n    x = 0;\n  }\n\n  y_low = (int)y;\n  x_low = (int)x;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. - lx;\n\n  // reference in forward\n  // T v1 = input[y_low * width + x_low];\n  // T v2 = input[y_low * width + x_high];\n  // T v3 = input[y_high * width + x_low];\n  // T v4 = input[y_high * width + x_high];\n  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);\n\n  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  return;\n}\n\ntemplate <class T>\ninline void add(T* address, const T& val) {\n  *address += val;\n}\n\ntemplate <typename T>\nvoid ROIAlignRotatedBackward(\n    const int nthreads,\n    // may not be contiguous. 
should index using n_stride, etc\n    const T* grad_output, const T& spatial_scale, const bool aligned,\n    const bool clockwise, const int channels, const int height, const int width,\n    const int pooled_height, const int pooled_width, const int sampling_ratio,\n    T* grad_input, const T* rois, const int n_stride, const int c_stride,\n    const int h_stride, const int w_stride) {\n  for (int index = 0; index < nthreads; index++) {\n    // (n, c, ph, pw) is an element in the pooled output\n    int pw = index % pooled_width;\n    int ph = (index / pooled_width) % pooled_height;\n    int c = (index / pooled_width / pooled_height) % channels;\n    int n = index / pooled_width / pooled_height / channels;\n\n    const T* current_roi = rois + n * 6;\n    int roi_batch_ind = current_roi[0];\n\n    // Do not use rounding; this implementation detail is critical\n    T offset = aligned ? (T)0.5 : (T)0.0;\n    T roi_center_w = current_roi[1] * spatial_scale - offset;\n    T roi_center_h = current_roi[2] * spatial_scale - offset;\n    T roi_width = current_roi[3] * spatial_scale;\n    T roi_height = current_roi[4] * spatial_scale;\n    T theta = current_roi[5];\n    if (clockwise) {\n      theta = -theta;  // If clockwise, the angle needs to be reversed.\n    }\n    T cos_theta = cos(theta);\n    T sin_theta = sin(theta);\n\n    if (aligned) {\n      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,\n                 \"ROIs in ROIAlignRotated do not have non-negative size!\");\n    } else {  // for backward-compatibility only\n      roi_width = std::max(roi_width, (T)1.);\n      roi_height = std::max(roi_height, (T)1.);\n    }\n\n    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);\n\n    T* offset_grad_input =\n        grad_input + ((roi_batch_ind * channels + c) * height * width);\n\n    int output_offset = n * n_stride + c * c_stride;\n    const T* offset_grad_output = 
grad_output + output_offset;\n    const T grad_output_this_bin =\n        offset_grad_output[ph * h_stride + pw * w_stride];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid_h = (sampling_ratio > 0)\n                             ? sampling_ratio\n                             : ceilf(roi_height / pooled_height);  // e.g., = 2\n    int roi_bin_grid_w =\n        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);\n\n    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).\n    // Appropriate translation needs to be applied after.\n    T roi_start_h = -roi_height / 2.0;\n    T roi_start_w = -roi_width / 2.0;\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4\n\n    for (int iy = 0; iy < roi_bin_grid_h; iy++) {\n      const T yy = roi_start_h + ph * bin_size_h +\n                   static_cast<T>(iy + .5f) * bin_size_h /\n                       static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5\n      for (int ix = 0; ix < roi_bin_grid_w; ix++) {\n        const T xx = roi_start_w + pw * bin_size_w +\n                     static_cast<T>(ix + .5f) * bin_size_w /\n                         static_cast<T>(roi_bin_grid_w);\n\n        // Rotate by theta around the center and translate\n        T y = yy * cos_theta - xx * sin_theta + roi_center_h;\n        T x = yy * sin_theta + xx * cos_theta + roi_center_w;\n\n        T w1, w2, w3, w4;\n        int x_low, x_high, y_low, y_high;\n\n        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,\n                                      x_low, x_high, y_low, y_high);\n\n        T g1 = grad_output_this_bin * w1 / count;\n        T g2 = grad_output_this_bin * w2 / count;\n        T g3 = grad_output_this_bin * w3 / count;\n        T g4 = grad_output_this_bin * w4 / count;\n\n        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n          // atomic add is 
not needed for now since it is single threaded\n          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));\n          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));\n          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));\n          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));\n        }  // if\n      }  // ix\n    }  // iy\n  }  // for\n}  // ROIAlignRotatedBackward\n\nvoid ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       bool aligned, bool clockwise) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"ROIAlignRotated_forward\", [&] {\n        ROIAlignRotatedForward<scalar_t>(\n            output_size, input.data_ptr<scalar_t>(),\n            static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,\n            height, width, aligned_height, aligned_width, sampling_ratio,\n            rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());\n      });\n}\n\nvoid ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,\n                                        Tensor grad_input, int aligned_height,\n                                        int aligned_width, float spatial_scale,\n                                        int sampling_ratio, bool aligned,\n                                        bool clockwise) {\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  // get stride values to ensure indexing into gradients is correct.\n  int n_stride = grad_output.stride(0);\n  int c_stride = grad_output.stride(1);\n  int h_stride = 
grad_output.stride(2);\n  int w_stride = grad_output.stride(3);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"ROIAlignRotated_backward\", [&] {\n        ROIAlignRotatedBackward<scalar_t>(\n            grad_output.numel(), grad_output.data_ptr<scalar_t>(),\n            static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,\n            height, width, aligned_height, aligned_width, sampling_ratio,\n            grad_input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),\n            n_stride, c_stride, h_stride, w_stride);\n      });\n}\n\nvoid roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,\n                                   int aligned_height, int aligned_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   bool aligned, bool clockwise) {\n  ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,\n                                    aligned_width, spatial_scale,\n                                    sampling_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,\n                                    Tensor bottom_grad, int aligned_height,\n                                    int aligned_width, float spatial_scale,\n                                    int sampling_ratio, bool aligned,\n                                    bool clockwise) {\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  ROIAlignRotatedBackwardCPULauncher(\n      top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,\n      sampling_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool 
aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise);\nREGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,\n                     roi_align_rotated_forward_cpu);\nREGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CPU,\n                     roi_align_rotated_backward_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/rotated_feature_align.cpp",
    "content": "// modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T>\nT bilinear_interpolate(const T* input, const int height, const int width, T y,\n                       T x, const int index /* index for debug only*/) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  int y_low = (int)y;\n  int x_low = (int)x;\n  int y_high;\n  int x_high;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width - 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  // do bilinear interpolation\n  T v1 = input[y_low * width + x_low];\n  T v2 = input[y_low * width + x_high];\n  T v3 = input[y_high * width + x_low];\n  T v4 = input[y_high * width + x_high];\n  const T v_low = fma(v2 - v1, lx, v1);\n  const T v_high = fma(v4 - v3, lx, v3);\n  const T val = fma(v_high - v_low, ly, v_low);\n\n  return val;\n}\n\ntemplate <typename scalar_t>\nvoid rotated_feature_align_forward_cpu_kernel(\n    const int nthreads, const int points, const scalar_t* bottom_data,\n    const scalar_t* best_bboxes, const scalar_t spatial_scale,\n    const int channels, const int height, const int width, scalar_t* top_data) {\n  for (int index = 0; index < nthreads; index++) {\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    const scalar_t* bbox_offset =\n        best_bboxes + ((n * height + h) * width + w) * 5;\n    scalar_t roi_y = bbox_offset[0] * spatial_scale;\n    scalar_t 
roi_x = bbox_offset[1] * spatial_scale;\n\n    scalar_t px[5] = {roi_x, 0, 0, 0, 0};\n    scalar_t py[5] = {roi_y, 0, 0, 0, 0};\n\n    if (points > 1) {\n      scalar_t roi_w = bbox_offset[2] * spatial_scale;\n      scalar_t roi_h = bbox_offset[3] * spatial_scale;\n      scalar_t roi_a = bbox_offset[4];\n\n      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;\n      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);\n      scalar_t wx = cosa * w_2, wy = sina * w_2;\n      scalar_t hx = -sina * h_2, hy = cosa * h_2;\n\n      px[1] = roi_x + wx + hx;\n      py[1] = roi_y + wy + hy;\n      px[2] = roi_x - wx + hx;\n      py[2] = roi_y - wy + hy;\n      px[3] = roi_x - wx - hx;\n      py[3] = roi_y - wy - hy;\n      px[4] = roi_x + wx - hx;\n      py[4] = roi_y + wy - hy;\n    }\n\n    const scalar_t* offset_bottom_data =\n        bottom_data + (n * channels + c) * height * width;\n\n    scalar_t output_val = bottom_data[index];\n    for (int i = 0; i < points; i++) {\n      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,\n                                                   width, py[i], px[i], i);\n    }\n    top_data[index] = output_val;\n  }\n}\n\ntemplate <typename T>\nvoid bilinear_interpolate_gradient(const int height, const int width, T y, T x,\n                                   T& w1, T& w2, T& w3, T& w4, int& x_low,\n                                   int& x_high, int& y_low, int& y_high,\n                                   const int index) {\n  // deal with cases that inverse elements are out of feature map boundary\n  if (y < -1.0 || y > height || x < -1.0 || x > width) {\n    // empty\n    w1 = w2 = w3 = w4 = 0.;\n    x_low = x_high = y_low = y_high = -1;\n    return;\n  }\n\n  if (y <= 0) y = 0;\n  if (x <= 0) x = 0;\n\n  y_low = (int)y;\n  x_low = (int)x;\n\n  if (y_low >= height - 1) {\n    y_high = y_low = height - 1;\n    y = (T)y_low;\n  } else {\n    y_high = y_low + 1;\n  }\n\n  if (x_low >= width - 1) {\n    x_high = x_low = width 
- 1;\n    x = (T)x_low;\n  } else {\n    x_high = x_low + 1;\n  }\n\n  T ly = y - y_low;\n  T lx = x - x_low;\n  T hy = 1. - ly, hx = 1. - lx;\n\n  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;\n\n  return;\n}\n\ntemplate <typename scalar_t>\ninline void valueAdd(scalar_t* address, scalar_t val) {\n  scalar_t old = *address;\n  *address = (old + val);\n}\n\ntemplate <typename scalar_t>\nvoid rotated_feature_align_backward_cpu_kernel(\n    const int nthreads, const int points, const scalar_t* top_diff,\n    const scalar_t* best_bboxes, const scalar_t spatial_scale,\n    const int channels, const int height, const int width,\n    scalar_t* bottom_diff) {\n  for (int index = 0; index < nthreads; index++) {\n    int w = index % width;\n    int h = (index / width) % height;\n    int c = (index / width / height) % channels;\n    int n = index / width / height / channels;\n\n    const scalar_t* bbox_offset =\n        best_bboxes + ((n * height + h) * width + w) * 5;\n    scalar_t roi_y = bbox_offset[0] * spatial_scale;\n    scalar_t roi_x = bbox_offset[1] * spatial_scale;\n\n    scalar_t px[5] = {roi_x, 0, 0, 0, 0};\n    scalar_t py[5] = {roi_y, 0, 0, 0, 0};\n\n    if (points > 1) {\n      scalar_t roi_w = bbox_offset[2] * spatial_scale;\n      scalar_t roi_h = bbox_offset[3] * spatial_scale;\n      scalar_t roi_a = bbox_offset[4];\n\n      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;\n      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);\n      scalar_t wx = cosa * w_2, wy = sina * w_2;\n      scalar_t hx = -sina * h_2, hy = cosa * h_2;\n\n      px[1] = roi_x + wx + hx;\n      py[1] = roi_y + wy + hy;\n      px[2] = roi_x - wx + hx;\n      py[2] = roi_y - wy + hy;\n      px[3] = roi_x - wx - hx;\n      py[3] = roi_y - wy - hy;\n      px[4] = roi_x + wx - hx;\n      py[4] = roi_y + wy - hy;\n    }\n\n    scalar_t* offset_bottom_diff =\n        bottom_diff + (n * channels + c) * height * width;\n    scalar_t value_top_diff = top_diff[index];\n\n    
valueAdd(bottom_diff + index, value_top_diff);\n    for (int i = 0; i < points; i++) {\n      scalar_t w1, w2, w3, w4;\n      int x_low, x_high, y_low, y_high;\n\n      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,\n                                              w2, w3, w4, x_low, x_high, y_low,\n                                              y_high, i);\n      scalar_t g1 = value_top_diff * w1;\n      scalar_t g2 = value_top_diff * w2;\n      scalar_t g3 = value_top_diff * w3;\n      scalar_t g4 = value_top_diff * w4;\n      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {\n        valueAdd(offset_bottom_diff + y_low * width + x_low, g1);\n        valueAdd(offset_bottom_diff + y_low * width + x_high, g2);\n        valueAdd(offset_bottom_diff + y_high * width + x_low, g3);\n        valueAdd(offset_bottom_diff + y_high * width + x_high, g4);\n      }\n    }\n  }\n}\n\nvoid rotated_feature_align_forward_cpu(const Tensor features,\n                                       const Tensor best_bboxes,\n                                       const float spatial_scale,\n                                       const int points, Tensor output) {\n  const int output_size = features.numel();\n  AT_DISPATCH_FLOATING_TYPES(\n      features.scalar_type(), \"rotated_feature_align_forward_cpu_kernel\", [&] {\n        const scalar_t* bottom_data = features.data_ptr<scalar_t>();\n        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();\n        scalar_t* top_data = output.data_ptr<scalar_t>();\n\n        rotated_feature_align_forward_cpu_kernel<scalar_t>(\n            output_size, points, bottom_data, bboxes_data,\n            scalar_t(spatial_scale), features.size(1), features.size(2),\n            features.size(3), top_data);\n      });\n}\n\nvoid rotated_feature_align_backward_cpu(const Tensor top_grad,\n                                        const Tensor best_bboxes,\n                                        const float 
spatial_scale,\n                                        const int points, Tensor bottom_grad) {\n  const int output_size = top_grad.numel();\n  AT_DISPATCH_FLOATING_TYPES(\n      top_grad.scalar_type(), \"rotated_feature_align_backward_cpu_kernel\", [&] {\n        const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();\n        scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();\n\n        rotated_feature_align_backward_cpu_kernel<scalar_t>(\n            output_size, points, top_diff, bboxes_data, scalar_t(spatial_scale),\n            top_grad.size(1), top_grad.size(2), top_grad.size(3), bottom_diff);\n      });\n}\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output);\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad);\n\nREGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CPU,\n                     rotated_feature_align_forward_cpu);\n\nREGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CPU,\n                     rotated_feature_align_backward_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/sparse_indice.cpp",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <torch/script.h>\n#include <utils/spconv/spconv/geometry.h>\n#include <utils/spconv/spconv/indice.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nnamespace functor {\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid) {\n    if (transpose)\n      return getIndicePairsDeConv<Index, IndexGrid, NDim>(\n          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,\n          kernelSize.data(), stride.data(), padding.data(), dilation.data(),\n          outSpatialShape.data());\n    else\n      return getIndicePairsConv<Index, IndexGrid, NDim>(\n          indicesIn, indicesOut, gridsOut, indicePairs, 
indiceNum,\n          kernelSize.data(), stride.data(), padding.data(), dilation.data(),\n          outSpatialShape.data());\n  }\n};\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid) {\n    return getIndicePairsSubM<Index, IndexGrid, NDim>(\n        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),\n        stride.data(), padding.data(), dilation.data(), outSpatialShape.data());\n  }\n};\n\n}  // namespace functor\n\n#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                           \\\n  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \\\n                                                       NDIM>;               \\\n  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \\\n                                                       NDIM>;\n\n#define DECLARE_CPU_INDEX(Index)          \\\n  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \\\n  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \\\n  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \\\n  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);\n\nDECLARE_CPU_INDEX(int);\nDECLARE_CPU_INDEX(long);\n\n#undef DECLARE_CPU_INDEX\n#undef DECLARE_CPU_SPECS_INDEX_NDIM\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/sparse_maxpool.cpp",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <torch/script.h>\n#include <utils/spconv/spconv/maxpool.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nnamespace functor {\ntemplate <typename scalar_t, typename Index>\nstruct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {\n  void operator()(const tv::CPU &d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const Index> indices, int size) {\n    int stride = outFeatures.dim(1);\n    auto outFeaturesData = outFeatures.data();\n    auto inFeaturesData = inFeatures.data();\n    auto indicesIn = indices.subview(0).data();\n    auto indicesOut = indices.subview(1).data();\n    Index idxi, idxo;\n    for (int row = 0; row < size; row++) {\n      idxi = indicesIn[row] * stride;\n      idxo = indicesOut[row] * stride;\n      for (int plane = 0; plane < stride; ++plane)\n        if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])\n          outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];\n    }\n  }\n};\n\ntemplate <typename scalar_t, typename Index>\nstruct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {\n  void operator()(const tv::CPU &d, tv::TensorView<const scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const scalar_t> fout,\n                  
tv::TensorView<scalar_t> fin,\n                  tv::TensorView<const Index> indices, int size) {\n    int stride = outFeatures.dim(1);\n    auto outFeaturesData = outFeatures.data();\n    auto inFeaturesData = inFeatures.data();\n    auto foutData = fout.data();\n    auto finData = fin.data();\n    auto indicesIn = indices.subview(0).data();\n    auto indicesOut = indices.subview(1).data();\n    Index idxi, idxo;\n    for (int row = 0; row < size; row++) {\n      idxi = indicesIn[row] * stride;\n      idxo = indicesOut[row] * stride;\n      for (int plane = 0; plane < stride; ++plane)\n        if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])\n          finData[idxi + plane] += foutData[idxo + plane];\n    }\n  }\n};\n\n}  // namespace functor\n\n#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                \\\n  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \\\n  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;\n\n#define DECLARE_CPU_SPECS(T)         \\\n  DECLARE_CPU_SPECS_T_INDEX(T, int); \\\n  DECLARE_CPU_SPECS_T_INDEX(T, long);\n\nDECLARE_CPU_SPECS(float);\nDECLARE_CPU_SPECS(double);\nDECLARE_CPU_SPECS(at::Half);\n\n#undef DECLARE_CPU_SPECS\n#undef DECLARE_CPU_SPECS_T_INDEX\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/sparse_reordering.cpp",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <torch/script.h>\n#include <utils/spconv/spconv/reordering.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nnamespace functor {\ntemplate <typename scalar_t, typename Index>\nstruct SparseGatherFunctor<tv::CPU, scalar_t, Index> {\n  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,\n                  tv::TensorView<const scalar_t> features,\n                  tv::TensorView<const Index> indices, int size) {\n    int numPlanes = features.dim(1);\n    for (int i = 0; i < size; ++i) {\n      std::memcpy(buffer.data() + i * numPlanes,\n                  features.data() + indices[i] * numPlanes,\n                  sizeof(scalar_t) * numPlanes);\n    }\n  }\n};\n\ntemplate <typename scalar_t, typename Index>\nstruct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {\n  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> buffer,\n                  tv::TensorView<const Index> indices, int size, bool stable) {\n    int numPlanes = outFeatures.dim(1);\n    const scalar_t* buf = buffer.data();\n    scalar_t* out = outFeatures.data();\n    for (int i = 0; i < size; ++i) {\n      buf = buffer.data() + i * numPlanes;\n      out = outFeatures.data() + indices[i] * numPlanes;\n      for (int j = 0; j < numPlanes; ++j) {\n        out[j] += buf[j];\n      }\n    }\n  }\n};\n\n}  // 
namespace functor\n\n#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index)                        \\\n  template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>; \\\n  template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;\n\n#define DECLARE_CPU_SPECS(scalar_t)         \\\n  DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \\\n  DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);\n\nDECLARE_CPU_SPECS(float);\nDECLARE_CPU_SPECS(double);\nDECLARE_CPU_SPECS(at::Half);\n\n#undef DECLARE_CPU_SPECS\n#undef DECLARE_CPU_SPECS_T_INDEX\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cpu/voxelization.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <typename T, typename T_int>\nvoid dynamic_voxelize_forward_cpu_kernel(\n    const torch::TensorAccessor<T, 2> points,\n    torch::TensorAccessor<T_int, 2> coors, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const std::vector<int> grid_size,\n    const int num_points, const int num_features, const int NDim) {\n  const int ndim_minus_1 = NDim - 1;\n  bool failed = false;\n  // int coor[NDim];\n  int* coor = new int[NDim]();\n  int c;\n\n  for (int i = 0; i < num_points; ++i) {\n    failed = false;\n    for (int j = 0; j < NDim; ++j) {\n      c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);\n      // necessary to rm points out of range\n      if ((c < 0 || c >= grid_size[j])) {\n        failed = true;\n        break;\n      }\n      coor[ndim_minus_1 - j] = c;\n    }\n\n    // memcpy and memset will cause problem because of the memory distribution\n    // discontinuity of TensorAccessor, so here using loops to replace memcpy\n    // or memset\n    if (failed) {\n      for (int k = 0; k < NDim; ++k) {\n        coors[i][k] = -1;\n      }\n    } else {\n      for (int k = 0; k < NDim; ++k) {\n        coors[i][k] = coor[k];\n      }\n    }\n  }\n\n  delete[] coor;\n  return;\n}\n\ntemplate <typename T, typename T_int>\nvoid hard_voxelize_forward_cpu_kernel(\n    const torch::TensorAccessor<T, 2> points,\n    torch::TensorAccessor<T, 3> voxels, torch::TensorAccessor<T_int, 2> coors,\n    torch::TensorAccessor<T_int, 1> num_points_per_voxel,\n    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,\n    const std::vector<float> voxel_size, const std::vector<float> coors_range,\n    const std::vector<int> grid_size, const int max_points,\n    const int max_voxels, const int num_points, const int num_features,\n    const int NDim) {\n  // declare a temp coors\n  
at::Tensor temp_coors = at::zeros(\n      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));\n\n  // First use dynamic voxelization to get coors,\n  // then check max points/voxels constraints\n  dynamic_voxelize_forward_cpu_kernel<T, int>(\n      points, temp_coors.accessor<int, 2>(), voxel_size, coors_range, grid_size,\n      num_points, num_features, NDim);\n\n  int voxelidx, num;\n  auto coor = temp_coors.accessor<int, 2>();\n\n  for (int i = 0; i < num_points; ++i) {\n    // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;\n\n    if (coor[i][0] == -1) continue;\n\n    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];\n\n    // record voxel\n    if (voxelidx == -1) {\n      voxelidx = voxel_num;\n      if (max_voxels != -1 && voxel_num >= max_voxels) continue;\n      voxel_num += 1;\n\n      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;\n      // memcpy will cause problem because of the memory distribution\n      // discontinuity of TensorAccessor, so here using loops to replace memcpy\n      for (int k = 0; k < NDim; ++k) {\n        coors[voxelidx][k] = coor[i][k];\n      }\n    }\n\n    // put points into voxel\n    num = num_points_per_voxel[voxelidx];\n    if (max_points == -1 || num < max_points) {\n      // memcpy will cause problem because of the memory distribution\n      // discontinuity of TensorAccessor, so here using loops to replace memcpy\n      for (int k = 0; k < num_features; ++k) {\n        voxels[voxelidx][num][k] = points[i][k];\n      }\n      num_points_per_voxel[voxelidx] += 1;\n    }\n  }\n\n  return;\n}\n\nvoid dynamic_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& coors,\n                                  const std::vector<float> voxel_size,\n                                  const std::vector<float> coors_range,\n                                  const int NDim = 3) {\n  // check device\n  AT_ASSERTM(points.device().is_cpu(), \"points must be a CPU 
tensor\");\n\n  std::vector<int> grid_size(NDim);\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  }\n\n  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"dynamic_voxelize_forward_cpu_kernel\", [&] {\n        dynamic_voxelize_forward_cpu_kernel<scalar_t, int>(\n            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),\n            voxel_size, coors_range, grid_size, num_points, num_features, NDim);\n      });\n}\n\nint hard_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& voxels,\n                              at::Tensor& coors,\n                              at::Tensor& num_points_per_voxel,\n                              const std::vector<float> voxel_size,\n                              const std::vector<float> coors_range,\n                              const int max_points, const int max_voxels,\n                              const int NDim = 3) {\n  // current version takes about 0.02s-0.03s for one frame on cpu\n  // check device\n  AT_ASSERTM(points.device().is_cpu(), \"points must be a CPU tensor\");\n\n  std::vector<int> grid_size(NDim);\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  for (int i = 0; i < NDim; ++i) {\n    grid_size[i] =\n        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);\n  }\n\n  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor\n  // printf(\"cpu coor_to_voxelidx size: [%d, %d, %d]\\n\", grid_size[2],\n  // grid_size[1], grid_size[0]);\n  at::Tensor coor_to_voxelidx =\n      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());\n\n  int voxel_num = 0;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"hard_voxelize_forward_cpu_kernel\", [&] {\n    
    hard_voxelize_forward_cpu_kernel<scalar_t, int>(\n            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),\n            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),\n            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,\n            coors_range, grid_size, max_points, max_voxels, num_points,\n            num_features, NDim);\n      });\n\n  return voxel_num;\n}\n\nint hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,\n                               at::Tensor& coors,\n                               at::Tensor& num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim);\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim);\nREGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CPU,\n                     hard_voxelize_forward_cpu);\nREGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CPU,\n                     dynamic_voxelize_forward_cpu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu\n#include \"active_rotated_filter_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,\n                                                  const Tensor indices,\n                                                  Tensor output) {\n  int num_output_planes = input.size(0);\n  int num_input_planes = input.size(1);\n  int num_orientations = input.size(2);\n  int kH = input.size(3);\n  int kW = input.size(4);\n  int num_rotations = indices.size(3);\n  int nEntry = num_orientations * kH * kW;\n  int output_size = input.numel();\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"active_rotated_filter_forward_cuda_kernel\", [&] {\n        active_rotated_filter_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                indices.data_ptr<int>(), num_input_planes, num_output_planes,\n                num_orientations, num_rotations, nEntry,\n                output.data_ptr<scalar_t>());\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,\n                                                   const Tensor indices,\n                                                   Tensor grad_in) {\n  int num_orientations = indices.size(0);\n  int kH = indices.size(1);\n  int kW = indices.size(2);\n  int num_rotations = indices.size(3);\n  int num_output_planes = grad_out.size(0) / num_rotations;\n  int num_input_planes = grad_out.size(1) / num_orientations;\n  int nEntry = num_orientations * kH * kW;\n  int output_size 
= grad_in.numel();\n\n  at::cuda::CUDAGuard device_guard(indices.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"active_rotated_filter_backward_cuda_kernel\",\n      [&] {\n        active_rotated_filter_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_out.data_ptr<scalar_t>(),\n                indices.data_ptr<int>(), num_input_planes, num_output_planes,\n                num_orientations, num_rotations, nEntry,\n                grad_in.data_ptr<scalar_t>());\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu",
    "content": "// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"assign_score_withk_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid AssignScoreWithKForwardCUDAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& points, const Tensor& centers, const Tensor& scores,\n    const Tensor& knn_idx, Tensor& output) {\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"assign_score_withk_forward_cuda_kernel\", [&] {\n        assign_score_withk_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                B, N0, N1, M, K, O, aggregate, points.data_ptr<scalar_t>(),\n                centers.data_ptr<scalar_t>(), scores.data_ptr<scalar_t>(),\n                knn_idx.data_ptr<int64_t>(), output.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid AssignScoreWithKBackwardCUDAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores) {\n  at::cuda::CUDAGuard device_guard(grad_out.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));\n  dim3 threads1(THREADS_PER_BLOCK);\n  dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));\n  dim3 threads2(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"assign_score_withk_points_backward_cuda_kernel\",\n      [&] {\n        
assign_score_withk_points_backward_cuda_kernel<scalar_t>\n            <<<blocks1, threads1, 0, stream>>>(\n                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),\n                scores.data_ptr<scalar_t>(), knn_idx.data_ptr<int64_t>(),\n                grad_points.data_ptr<scalar_t>(),\n                grad_centers.data_ptr<scalar_t>());\n      });\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"assign_score_withk_scores_backward_cuda_kernel\",\n      [&] {\n        assign_score_withk_scores_backward_cuda_kernel<scalar_t>\n            <<<blocks2, threads2, 0, stream>>>(\n                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),\n                points.data_ptr<scalar_t>(), centers.data_ptr<scalar_t>(),\n                knn_idx.data_ptr<int64_t>(), grad_scores.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"ball_query_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,\n                                        float max_radius, int nsample,\n                                        const Tensor new_xyz, const Tensor xyz,\n                                        Tensor idx) {\n  // new_xyz: (B, M, 3)\n  // xyz: (B, N, 3)\n  // output:\n  //      idx: (B, M, nsample)\n\n  at::cuda::CUDAGuard device_guard(new_xyz.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      new_xyz.scalar_type(), \"ball_query_forward_cuda_kernel\", [&] {\n        ball_query_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, n, m, min_radius, max_radius, nsample,\n                new_xyz.data_ptr<scalar_t>(), xyz.data_ptr<scalar_t>(),\n                idx.data_ptr<int>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"bbox_overlaps_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\n// Disable fp16 on ROCm device\n#ifndef MMCV_WITH_HIP\n#if __CUDA_ARCH__ >= 530\ntemplate <>\n__global__ void bbox_overlaps_cuda_kernel<at::Half>(\n    const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,\n    const int num_bbox1, const int num_bbox2, const int mode,\n    const bool aligned, const int offset) {\n  bbox_overlaps_cuda_kernel_half(reinterpret_cast<const __half*>(bbox1),\n                                 reinterpret_cast<const __half*>(bbox2),\n                                 reinterpret_cast<__half*>(ious), num_bbox1,\n                                 num_bbox2, mode, aligned, offset);\n}\n\n#endif  // __CUDA_ARCH__ >= 530\n#endif  // MMCV_WITH_HIP\n\nvoid BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,\n                                    Tensor ious, const int mode,\n                                    const bool aligned, const int offset) {\n  int output_size = ious.numel();\n  int num_bbox1 = bboxes1.size(0);\n  int num_bbox2 = bboxes2.size(0);\n\n  at::cuda::CUDAGuard device_guard(bboxes1.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      bboxes1.scalar_type(), \"bbox_overlaps_cuda_kernel\", ([&] {\n        bbox_overlaps_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),\n                ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,\n                offset);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/bezier_align_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"bezier_align_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid BezierAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                          Tensor output, int aligned_height,\n                                          int aligned_width,\n                                          float spatial_scale,\n                                          int sampling_ratio, bool aligned) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"bezier_align_forward_cuda_kernel\", [&] {\n        bezier_align_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,\n                channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid BezierAlignBackwardCUDAKernelLauncher(\n    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,\n    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"bezier_align_backward_cuda_kernel\", [&] {\n        bezier_align_backward_cuda_kernel<scalar_t>\n            
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,\n                channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu",
    "content": "// Modified from\n// https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.cpp\n\n// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n#include <c10/util/Half.h>\n#include <cuda_runtime.h>\n#include <torch/types.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n\nstruct bias_act_kernel_params {\n  const void *x;     // [sizeX]\n  const void *b;     // [sizeB] or NULL\n  const void *xref;  // [sizeX] or NULL\n  const void *yref;  // [sizeX] or NULL\n  const void *dy;    // [sizeX] or NULL\n  void *y;           // [sizeX]\n\n  int grad;\n  int act;\n  float alpha;\n  float gain;\n  float clamp;\n\n  int sizeX;\n  int sizeB;\n  int stepB;\n  int loopX;\n};\n\n// CUDA kernel selection.\n\ntemplate <class T>\nvoid *choose_bias_act_kernel(const bias_act_kernel_params &p);\n//------------------------------------------------------------------------\n// Helpers.\n\ntemplate <class T>\nstruct InternalType;\ntemplate <>\nstruct InternalType<double> {\n  typedef double scalar_t;\n};\ntemplate <>\nstruct InternalType<float> {\n  typedef float scalar_t;\n};\ntemplate <>\nstruct InternalType<c10::Half> {\n  typedef float scalar_t;\n};\n\n//------------------------------------------------------------------------\n// CUDA kernel.\n\ntemplate <class T, int A>\n__global__ void bias_act_kernel(bias_act_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n  int G = p.grad;\n  scalar_t alpha = (scalar_t)p.alpha;\n  scalar_t gain = (scalar_t)p.gain;\n  scalar_t clamp = (scalar_t)p.clamp;\n  scalar_t one = (scalar_t)1;\n  scalar_t two = 
(scalar_t)2;\n  scalar_t expRange = (scalar_t)80;\n  scalar_t halfExpRange = (scalar_t)40;\n  scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946;\n  scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717;\n\n  // Loop over elements.\n  int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;\n  for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX;\n       loopIdx++, xi += blockDim.x) {\n    // Load.\n    scalar_t x = (scalar_t)((const T *)p.x)[xi];\n    scalar_t b =\n        (p.b) ? (scalar_t)((const T *)p.b)[(xi / p.stepB) % p.sizeB] : 0;\n    scalar_t xref = (p.xref) ? (scalar_t)((const T *)p.xref)[xi] : 0;\n    scalar_t yref = (p.yref) ? (scalar_t)((const T *)p.yref)[xi] : 0;\n    scalar_t dy = (p.dy) ? (scalar_t)((const T *)p.dy)[xi] : one;\n    scalar_t yy = (gain != 0) ? yref / gain : 0;\n    scalar_t y = 0;\n\n    // Apply bias.\n    ((G == 0) ? x : xref) += b;\n\n    // linear\n    if (A == 1) {\n      if (G == 0) y = x;\n      if (G == 1) y = x;\n    }\n\n    // relu\n    if (A == 2) {\n      if (G == 0) y = (x > 0) ? x : 0;\n      if (G == 1) y = (yy > 0) ? x : 0;\n    }\n\n    // lrelu\n    if (A == 3) {\n      if (G == 0) y = (x > 0) ? x : x * alpha;\n      if (G == 1) y = (yy > 0) ? x : x * alpha;\n    }\n\n    // tanh\n    if (A == 4) {\n      if (G == 0) {\n        scalar_t c = exp(x);\n        scalar_t d = one / c;\n        y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d);\n      }\n      if (G == 1) y = x * (one - yy * yy);\n      if (G == 2) y = x * (one - yy * yy) * (-two * yy);\n    }\n\n    // sigmoid\n    if (A == 5) {\n      if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);\n      if (G == 1) y = x * yy * (one - yy);\n      if (G == 2) y = x * yy * (one - yy) * (one - two * yy);\n    }\n\n    // elu\n    if (A == 6) {\n      if (G == 0) y = (x >= 0) ? x : exp(x) - one;\n      if (G == 1) y = (yy >= 0) ? x : x * (yy + one);\n      if (G == 2) y = (yy >= 0) ? 
0 : x * (yy + one);\n    }\n\n    // selu\n    if (A == 7) {\n      if (G == 0)\n        y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);\n      if (G == 1)\n        y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);\n      if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);\n    }\n\n    // softplus\n    if (A == 8) {\n      if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);\n      if (G == 1) y = x * (one - exp(-yy));\n      if (G == 2) {\n        scalar_t c = exp(-yy);\n        y = x * c * (one - c);\n      }\n    }\n\n    // swish\n    if (A == 9) {\n      if (G == 0)\n        y = (x < -expRange) ? 0 : x / (exp(-x) + one);\n      else {\n        scalar_t c = exp(xref);\n        scalar_t d = c + one;\n        if (G == 1)\n          y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);\n        else\n          y = (xref > halfExpRange)\n                  ? 0\n                  : x * c * (xref * (two - d) + two * d) / (d * d * d);\n        yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;\n      }\n    }\n\n    // Apply gain.\n    y *= gain * dy;\n\n    // Clamp.\n    if (clamp >= 0) {\n      if (G == 0)\n        y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;\n      else\n        y = (yref > -clamp & yref < clamp) ? 
y : 0;\n    }\n\n    // Store.\n    ((T *)p.y)[xi] = (T)y;\n  }\n}\n\n//------------------------------------------------------------------------\n// CUDA kernel selection.\n\ntemplate <class T>\nvoid *choose_bias_act_kernel(const bias_act_kernel_params &p) {\n  if (p.act == 1) return (void *)bias_act_kernel<T, 1>;\n  if (p.act == 2) return (void *)bias_act_kernel<T, 2>;\n  if (p.act == 3) return (void *)bias_act_kernel<T, 3>;\n  if (p.act == 4) return (void *)bias_act_kernel<T, 4>;\n  if (p.act == 5) return (void *)bias_act_kernel<T, 5>;\n  if (p.act == 6) return (void *)bias_act_kernel<T, 6>;\n  if (p.act == 7) return (void *)bias_act_kernel<T, 7>;\n  if (p.act == 8) return (void *)bias_act_kernel<T, 8>;\n  if (p.act == 9) return (void *)bias_act_kernel<T, 9>;\n  return NULL;\n}\n\n//------------------------------------------------------------------------\n\nstatic bool has_same_layout(torch::Tensor x, torch::Tensor y) {\n  if (x.dim() != y.dim()) return false;\n  for (int64_t i = 0; i < x.dim(); i++) {\n    if (x.size(i) != y.size(i)) return false;\n    if (x.size(i) >= 2 && x.stride(i) != y.stride(i)) return false;\n  }\n  return true;\n}\n\n//------------------------------------------------------------------------\ntorch::Tensor bias_act_op(const torch::Tensor &x, const torch::Tensor &b,\n                          const torch::Tensor &xref, const torch::Tensor &yref,\n                          const torch::Tensor &dy, int grad, int dim, int act,\n                          float alpha, float gain, float clamp) {\n  // Validate arguments.\n  TORCH_CHECK(x.is_cuda(), \"x must reside on CUDA device\");\n  TORCH_CHECK(\n      b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()),\n      \"b must have the same dtype and device as x\");\n  TORCH_CHECK(xref.numel() == 0 ||\n                  (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() &&\n                   xref.device() == x.device()),\n              \"xref must have the same shape, dtype, 
and device as x\");\n  TORCH_CHECK(yref.numel() == 0 ||\n                  (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() &&\n                   yref.device() == x.device()),\n              \"yref must have the same shape, dtype, and device as x\");\n  TORCH_CHECK(\n      dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() &&\n                          dy.device() == x.device()),\n      \"dy must have the same dtype and device as x\");\n  TORCH_CHECK(x.numel() <= INT_MAX, \"x is too large\");\n  TORCH_CHECK(b.dim() == 1, \"b must have rank 1\");\n  TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()),\n              \"dim is out of bounds\");\n  TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim),\n              \"b has wrong number of elements\");\n  TORCH_CHECK(grad >= 0, \"grad must be non-negative\");\n\n  // Validate layout.\n  TORCH_CHECK(x.is_non_overlapping_and_dense(),\n              \"x must be non-overlapping and dense\");\n  TORCH_CHECK(b.is_contiguous(), \"b must be contiguous\");\n  TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x),\n              \"xref must have the same layout as x\");\n  TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x),\n              \"yref must have the same layout as x\");\n  TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x),\n              \"dy must have the same layout as x\");\n\n  // Create output tensor.\n  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));\n  torch::Tensor y = torch::empty_like(x);\n  TORCH_CHECK(has_same_layout(y, x), \"y must have the same layout as x\");\n\n  // Initialize CUDA kernel parameters.\n  bias_act_kernel_params p;\n  p.x = x.data_ptr();\n  p.b = (b.numel()) ? b.data_ptr() : NULL;\n  p.xref = (xref.numel()) ? xref.data_ptr() : NULL;\n  p.yref = (yref.numel()) ? yref.data_ptr() : NULL;\n  p.dy = (dy.numel()) ? 
dy.data_ptr() : NULL;\n  p.y = y.data_ptr();\n  p.grad = grad;\n  p.act = act;\n  p.alpha = alpha;\n  p.gain = gain;\n  p.clamp = clamp;\n  p.sizeX = (int)x.numel();\n  p.sizeB = (int)b.numel();\n  p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;\n\n  // Choose CUDA kernel.\n  void *kernel;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), \"upfirdn2d_cuda\", [&] {\n    kernel = choose_bias_act_kernel<scalar_t>(p);\n  });\n  TORCH_CHECK(kernel, \"no CUDA kernel found for the specified activation func\");\n\n  // Launch CUDA kernel.\n  p.loopX = 4;\n  int blockSize = 4 * 32;\n  int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;\n  void *args[] = {&p};\n#ifdef MMCV_WITH_HIP\n  AT_CUDA_CHECK(hipLaunchKernel(kernel, gridSize, blockSize, args, 0,\n                                at::cuda::getCurrentCUDAStream()));\n#else\n  AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0,\n                                 at::cuda::getCurrentCUDAStream()));\n#endif\n\n  return y;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"border_align_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid BorderAlignForwardCUDAKernelLauncher(const Tensor &input,\n                                          const Tensor &boxes, Tensor output,\n                                          Tensor argmax_idx,\n                                          const int pool_size) {\n  // shape assertion\n  AT_ASSERTM(input.ndimension() == 4,\n             \"non-empty 4D(batch mode) tensor expected for input feature\");\n  AT_ASSERTM(boxes.ndimension() == 3,\n             \"boxes must be 3D tensor with size of [B, H*W, 4]\");\n\n  int batch_size = input.size(0);\n  int feat_channels = input.size(1);\n  int channels = feat_channels / 4;\n  int height = input.size(2);\n  int width = input.size(3);\n  // shape [N, box_size, 4] for boxes. (x1, y1, x2, y2) format\n  int box_size = boxes.size(1);\n  // shape [N, channels, box_size, 4] for output\n  int nthreads = batch_size * channels * box_size;\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  dim3 block(128, 4);\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"border_align_forward_cuda_kernel\", [&] {\n        border_align_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(\n                nthreads, input.data_ptr<scalar_t>(),\n                boxes.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                argmax_idx.data_ptr<int>(), channels, box_size, height, width,\n                pool_size);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid BorderAlignBackwardCUDAKernelLauncher(const Tensor &grad_output,\n                                           const Tensor &boxes,\n                                           const Tensor &argmax_idx,\n                                           Tensor grad_input,\n                                      
     const int pool_size) {\n  int batch_size = grad_input.size(0);\n  int feat_channels = grad_input.size(1);\n  int channels = feat_channels / 4;\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n  int box_size = boxes.size(1);\n  int nthreads = batch_size * channels * box_size;\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  dim3 block(128, 4);\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"border_align_backward_cuda_kernel\", [&] {\n        border_align_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(\n                nthreads, grad_output.data_ptr<scalar_t>(),\n                boxes.data_ptr<scalar_t>(), argmax_idx.data_ptr<int>(),\n                grad_input.data_ptr<scalar_t>(), channels, box_size, height,\n                width, pool_size);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/box_iou_quadri_cuda.cu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"box_iou_quadri_cuda.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid box_iou_quadri_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned) {\n  using scalar_t = float;\n  AT_ASSERTM(boxes1.is_cuda(), \"boxes1 must be a CUDA tensor\");\n  AT_ASSERTM(boxes2.is_cuda(), \"boxes2 must be a CUDA tensor\");\n\n  int output_size = ious.numel();\n  int num_boxes1 = boxes1.size(0);\n  int num_boxes2 = boxes2.size(0);\n\n  at::cuda::CUDAGuard device_guard(boxes1.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  box_iou_quadri_cuda_kernel<scalar_t>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),\n          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),\n          mode_flag, aligned);\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu\n#include \"box_iou_rotated_cuda.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned) {\n  using scalar_t = float;\n  AT_ASSERTM(boxes1.is_cuda(), \"boxes1 must be a CUDA tensor\");\n  AT_ASSERTM(boxes2.is_cuda(), \"boxes2 must be a CUDA tensor\");\n\n  int output_size = ious.numel();\n  int num_boxes1 = boxes1.size(0);\n  int num_boxes2 = boxes2.size(0);\n\n  at::cuda::CUDAGuard device_guard(boxes1.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  box_iou_rotated_cuda_kernel<scalar_t>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),\n          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),\n          mode_flag, aligned);\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"carafe_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,\n                                     Tensor rfeatures, Tensor routput,\n                                     Tensor rmasks, Tensor output,\n                                     const int kernel_size,\n                                     const int group_size,\n                                     const int scale_factor) {\n  const int batch_size = output.size(0);\n  const int channels = output.size(1);\n  const int output_height = output.size(2);\n  const int output_width = output.size(3);\n\n  const int input_height = features.size(2);\n  const int input_width = features.size(3);\n\n  const int mask_channels = masks.size(1);\n\n  rfeatures.resize_({batch_size, input_height, input_width, channels});\n  routput.resize_({batch_size, output_height, output_width, channels});\n  rmasks.resize_({batch_size, output_height, output_width, mask_channels});\n\n  // one warp per pixel\n  at::cuda::CUDAGuard device_guard(features.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"NCHW2NHWC_Feature\", ([&] {\n        const scalar_t *bottom_data = features.data_ptr<scalar_t>();\n        scalar_t *top_data = rfeatures.data_ptr<scalar_t>();\n        const int dh = divideUP(channels, kTileDim);\n        const int dw = divideUP(input_height * input_width, kTileDim);\n        BatchTranspose2DCUDAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, channels, input_height * input_width, dh, dw,\n                bottom_data, top_data);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"NCHW2NHWC_Masks\", ([&] {\n        const scalar_t *bottom_data = 
masks.data_ptr<scalar_t>();\n        scalar_t *top_data = rmasks.data_ptr<scalar_t>();\n        const int dh = divideUP(mask_channels, kTileDim);\n        const int dw = divideUP(output_height * output_width, kTileDim);\n        BatchTranspose2DCUDAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, mask_channels, output_height * output_width, dh, dw,\n                bottom_data, top_data);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"CARAFELaucherForward\", ([&] {\n        const int num_kernels =\n            batch_size * output_height * output_width * THREADS_PER_PIXEL;\n        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();\n        const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();\n        scalar_t *top_data = routput.data_ptr<scalar_t>();\n\n        CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),\n                                  THREADS_PER_BLOCK, 0, stream>>>(\n            num_kernels, bottom_data, bottom_masks, kernel_size, group_size,\n            scale_factor, channels, input_height, input_width, output_height,\n            output_width, mask_channels, top_data);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"NHWC2NCHW\", ([&] {\n        const scalar_t *bottom_data = routput.data_ptr<scalar_t>();\n        scalar_t *top_data = output.data_ptr<scalar_t>();\n        const int dh = divideUP(output_height * output_width, kTileDim);\n        const int dw = divideUP(channels, kTileDim);\n        BatchTranspose2DCUDAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, output_height * output_width, channels, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid CARAFEBackwardCUDAKernelLauncher(\n    const Tensor top_grad, const Tensor 
rfeatures, const Tensor masks,\n    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,\n    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,\n    const int kernel_size, const int group_size, const int scale_factor) {\n  const int batch_size = top_grad.size(0);\n  const int channels = top_grad.size(1);\n  const int output_height = top_grad.size(2);\n  const int output_width = top_grad.size(3);\n\n  const int input_height = bottom_grad.size(2);\n  const int input_width = bottom_grad.size(3);\n\n  const int mask_channels = masks.size(1);\n\n  rtop_grad.resize_({batch_size, output_height, output_width, channels});\n  rbottom_grad.resize_({batch_size, input_height, input_width, channels});\n  rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});\n  rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});\n\n  at::cuda::CUDAGuard device_guard(top_grad.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"NCHW2NHWC_Top_Grad\", ([&] {\n        const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();\n        scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();\n        const int dh = divideUP(channels, kTileDim);\n        const int dw = divideUP(output_height * output_width, kTileDim);\n        BatchTranspose2DCUDAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, channels, output_height * output_width, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"CARAFELaucherBackward_Feature\", ([&] {\n        const int num_kernels =\n            batch_size * output_height * output_width * THREADS_PER_PIXEL;\n        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();\n        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();\n        scalar_t 
*bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();\n\n        CARAFEBackward_Feature<scalar_t>\n            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,\n               stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,\n                         group_size, scale_factor, channels, input_height,\n                         input_width, output_height, output_width,\n                         mask_channels, bottom_diff);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"FeatureSum\", ([&] {\n        const int num_kernels =\n            batch_size * input_height * input_width * THREADS_PER_PIXEL;\n        const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();\n        scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();\n\n        FeatureSum<scalar_t>\n            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,\n               stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,\n                         input_height, input_width, bottom_diff);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"NHWC2NCHW_Bottom_Grad\", ([&] {\n        const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();\n        scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();\n        const int dh = divideUP(input_height * input_width, kTileDim);\n        const int dw = divideUP(channels, kTileDim);\n        BatchTranspose2DCUDAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, input_height * input_width, channels, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"CARAFELaucherBackward_Mask\", ([&] {\n        const int num_kernels = batch_size * output_height * output_width *\n                                mask_channels * WARP_SIZE;\n        const scalar_t *top_diff = 
rtop_grad.data_ptr<scalar_t>();\n        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();\n        scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();\n\n        CARAFEBackward_Mask<scalar_t>\n            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,\n               stream>>>(num_kernels, top_diff, bottom_data, kernel_size,\n                         group_size, scale_factor, channels, input_height,\n                         input_width, output_height, output_width,\n                         mask_channels, mask_diff);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"NHWC2NCHW_Mask_Grad\", ([&] {\n        const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();\n        scalar_t *top_data = mask_grad.data_ptr<scalar_t>();\n        const int dh = divideUP(output_height * output_width, kTileDim);\n        const int dw = divideUP(mask_channels, kTileDim);\n        BatchTranspose2DCUDAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, output_height * output_width, mask_channels, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"carafe_naive_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,\n                                          const Tensor masks, Tensor output,\n                                          const int kernel_size,\n                                          const int group_size,\n                                          const int scale_factor) {\n  int output_size = output.numel();\n  int channels = output.size(1);\n  int height = output.size(2);\n  int width = output.size(3);\n\n  at::cuda::CUDAGuard device_guard(features.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"CARAFENAIVEForward\", ([&] {\n        carafe_naive_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, features.data_ptr<scalar_t>(),\n                masks.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                kernel_size, group_size, scale_factor, channels, height, width);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid CARAFENAIVEBackwardCUDAKernelLauncher(\n    const Tensor top_grad, const Tensor features, const Tensor masks,\n    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,\n    const int group_size, const int scale_factor) {\n  int output_size = top_grad.numel();\n  int channels = top_grad.size(1);\n  int height = top_grad.size(2);\n  int width = top_grad.size(3);\n\n  at::cuda::CUDAGuard device_guard(top_grad.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"CARAFENAIVEBackward\", ([&] {\n        carafe_naive_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, 
top_grad.data_ptr<scalar_t>(),\n                features.data_ptr<scalar_t>(), masks.data_ptr<scalar_t>(),\n                bottom_grad.data_ptr<scalar_t>(),\n                mask_grad.data_ptr<scalar_t>(), kernel_size, group_size,\n                scale_factor, channels, height, width);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/chamfer_distance_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp\n#include \"chamfer_distance_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid ChamferDistanceForwardCUDAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,\n    const Tensor dist2, const Tensor idx1, const Tensor idx2) {\n  int batch_size = xyz1.size(0);\n  int n = xyz1.size(1);\n  int m = xyz2.size(1);\n\n  at::cuda::CUDAGuard device_guard(xyz1.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz1.scalar_type(), \"chamfer_distance_forward_cuda_kernel\", [&] {\n        chamfer_distance_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(\n                batch_size, n, xyz1.data_ptr<scalar_t>(), m,\n                xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),\n                idx1.data_ptr<int>());\n      });\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz1.scalar_type(), \"chamfer_distance_forward_cuda_kernel\", [&] {\n        chamfer_distance_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(\n                batch_size, m, xyz2.data_ptr<scalar_t>(), n,\n                xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),\n                idx2.data_ptr<int>());\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ChamferDistanceBackwardCUDAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,\n    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2) {\n  int batch_size = xyz1.size(0);\n  int n = xyz1.size(1);\n  int m = xyz2.size(1);\n\n  at::cuda::CUDAGuard device_guard(xyz1.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n     
 xyz1.scalar_type(), \"chamfer_distance_backward_cuda_kernel\", [&] {\n        chamfer_distance_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                batch_size, m, xyz1.data_ptr<scalar_t>(), n,\n                xyz2.data_ptr<scalar_t>(), grad_dist1.data_ptr<scalar_t>(),\n                idx1.data_ptr<int>(), grad_xyz1.data_ptr<scalar_t>(),\n                grad_xyz2.data_ptr<scalar_t>());\n      });\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz1.scalar_type(), \"chamfer_distance_backward_cuda_kernel\", [&] {\n        chamfer_distance_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                batch_size, n, xyz2.data_ptr<scalar_t>(), m,\n                xyz1.data_ptr<scalar_t>(), grad_dist2.data_ptr<scalar_t>(),\n                idx2.data_ptr<int>(), grad_xyz2.data_ptr<scalar_t>(),\n                grad_xyz1.data_ptr<scalar_t>());\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/convex_iou.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu\n#include \"convex_iou_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                 Tensor ious) {\n  int output_size = ious.numel();\n  int num_pointsets = pointsets.size(0);\n  int num_polygons = polygons.size(0);\n\n  at::cuda::CUDAGuard device_guard(pointsets.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      pointsets.scalar_type(), \"convex_iou_cuda_kernel\", ([&] {\n        convex_iou_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),\n                polygons.data_ptr<scalar_t>(), ious.data_ptr<scalar_t>());\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                  Tensor output) {\n  int output_size = output.numel();\n  int num_pointsets = pointsets.size(0);\n  int num_polygons = polygons.size(0);\n\n  at::cuda::CUDAGuard device_guard(pointsets.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      pointsets.scalar_type(), \"convex_giou_cuda_kernel\", ([&] {\n        convex_giou_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),\n                polygons.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu\n// Original licence: Under MIT License\n\n#include \"correlation_cuda.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,\n                                          Tensor output, int kH, int kW,\n                                          int patchH, int patchW, int padH,\n                                          int padW, int dilationH,\n                                          int dilationW, int dilation_patchH,\n                                          int dilation_patchW, int dH, int dW) {\n  const int batch_size = input1.size(0);\n  const int iH = input1.size(2);\n  const int iW = input1.size(3);\n  const int dilatedKH = (kH - 1) * dilationH + 1;\n  const int dilatedKW = (kW - 1) * dilationW + 1;\n\n  const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;\n  const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;\n\n  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();\n  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();\n\n  const dim3 threads(WARP_SIZE, 4, 4);\n  const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);\n\n  at::cuda::CUDAGuard device_guard(input1.device());\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input1.scalar_type(), \"correlation_forward_cuda\", ([&] {\n        TensorAcc4R trInput1_acc =\n            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R trInput2_acc =\n            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc5R output_acc =\n            output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();\n\n        correlation_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(\n                trInput1_acc, 
trInput2_acc, output_acc, kH, kW, patchH, patchW,\n                padH, padW, dilationH, dilationW, dilation_patchH,\n                dilation_patchW, dH, dW, oH, oW);\n      }));\n}\n\nvoid CorrelationBackwardCUDAKernelLauncher(\n    Tensor grad_output, Tensor input1, Tensor input2, Tensor grad_input1,\n    Tensor grad_input2, int kH, int kW, int patchH, int patchW, int padH,\n    int padW, int dilationH, int dilationW, int dilation_patchH,\n    int dilation_patchW, int dH, int dW) {\n  const int batch_size = input1.size(0);\n  const int iH = input1.size(2);\n  const int iW = input1.size(3);\n  const int C = input1.size(1);\n\n  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();\n  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();\n  const dim3 blocks(batch_size, iH, iW);\n  const dim3 threads(THREADS_PER_BLOCK);\n\n  at::cuda::CUDAGuard device_guard(input1.device());\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input1.scalar_type(), \"correlation_backward_cuda\", ([&] {\n        const int grad_cache_size = patchH * patchW * sizeof(scalar_t);\n        TensorAcc4R input1_acc =\n            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R input2_acc =\n            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R grad_input1_acc =\n            grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R grad_input2_acc =\n            grad_input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc5R grad_output_acc =\n            grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();\n\n        correlation_backward_cuda_kernel_input1<scalar_t>\n            <<<blocks, threads, grad_cache_size,\n               at::cuda::getCurrentCUDAStream()>>>(\n                grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,\n                patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n                
dilation_patchW, dH, dW);\n\n        correlation_backward_cuda_kernel_input2<scalar_t>\n            <<<blocks, threads, grad_cache_size,\n               at::cuda::getCurrentCUDAStream()>>>(\n                grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,\n                patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n                dilation_patchW, dH, dW);\n      }));\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/cudabind.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid AssignScoreWithKForwardCUDAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &points, const Tensor &centers, const Tensor &scores,\n    const Tensor &knn_idx, Tensor &output);\n\nvoid AssignScoreWithKBackwardCUDAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &grad_out, const Tensor &points, const Tensor &centers,\n    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,\n    Tensor &grad_centers, Tensor &grad_scores);\n\nvoid assign_score_withk_forward_cuda(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor &points,\n                                     const Tensor &centers,\n                                     const Tensor &scores,\n                                     const Tensor &knn_idx, Tensor &output) {\n  AssignScoreWithKForwardCUDAKernelLauncher(\n      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);\n};\n\nvoid assign_score_withk_backward_cuda(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &grad_out, const Tensor &points, const Tensor &centers,\n    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,\n    Tensor &grad_centers, Tensor &grad_scores) {\n  AssignScoreWithKBackwardCUDAKernelLauncher(\n      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,\n      grad_points, grad_centers, grad_scores);\n};\n\nvoid assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor &points,\n                                     const Tensor &centers,\n                                     const Tensor &scores,\n                                     const Tensor &knn_idx, Tensor &output);\n\nvoid 
assign_score_withk_backward_impl(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &grad_out, const Tensor &points, const Tensor &centers,\n    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,\n    Tensor &grad_centers, Tensor &grad_scores);\n\nREGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, CUDA,\n                     assign_score_withk_forward_cuda);\nREGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, CUDA,\n                     assign_score_withk_backward_cuda);\n\nvoid BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,\n                                        float max_radius, int nsample,\n                                        const Tensor new_xyz, const Tensor xyz,\n                                        Tensor idx);\n\nvoid ball_query_forward_cuda(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx) {\n  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,\n                                     new_xyz, xyz, idx);\n};\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx);\nREGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda);\n\nvoid StackBallQueryForwardCUDAKernelLauncher(float max_radius, int nsample,\n                                             const Tensor new_xyz,\n                                             const Tensor new_xyz_batch_cnt,\n                                             const Tensor xyz,\n                                             const Tensor xyz_batch_cnt,\n                                             Tensor idx);\n\nvoid stack_ball_query_forward_cuda(float 
max_radius, int nsample,\n                                   const Tensor new_xyz,\n                                   const Tensor new_xyz_batch_cnt,\n                                   const Tensor xyz, const Tensor xyz_batch_cnt,\n                                   Tensor idx) {\n  StackBallQueryForwardCUDAKernelLauncher(\n      max_radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);\n};\n\nvoid stack_ball_query_forward_impl(float max_radius, int nsample,\n                                   const Tensor new_xyz,\n                                   const Tensor new_xyz_batch_cnt,\n                                   const Tensor xyz, const Tensor xyz_batch_cnt,\n                                   Tensor idx);\nREGISTER_DEVICE_IMPL(stack_ball_query_forward_impl, CUDA,\n                     stack_ball_query_forward_cuda);\n\nvoid BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,\n                                    Tensor ious, const int mode,\n                                    const bool aligned, const int offset);\n\nvoid bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset) {\n  BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);\n}\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\nREGISTER_DEVICE_IMPL(bbox_overlaps_impl, CUDA, bbox_overlaps_cuda);\n\nvoid BorderAlignForwardCUDAKernelLauncher(const Tensor &input,\n                                          const Tensor &boxes, Tensor output,\n                                          Tensor argmax_idx,\n                                          const int pool_size);\n\nvoid BorderAlignBackwardCUDAKernelLauncher(const Tensor &grad_output,\n                                           const Tensor &boxes,\n                             
              const Tensor &argmax_idx,\n                                           Tensor grad_input,\n                                           const int pool_size);\n\nvoid border_align_forward_cuda(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size) {\n  BorderAlignForwardCUDAKernelLauncher(input, boxes, output, argmax_idx,\n                                       pool_size);\n}\n\nvoid border_align_backward_cuda(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size) {\n  BorderAlignBackwardCUDAKernelLauncher(grad_output, boxes, argmax_idx,\n                                        grad_input, pool_size);\n}\n\nvoid border_align_forward_impl(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size);\n\nvoid border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size);\n\nREGISTER_DEVICE_IMPL(border_align_forward_impl, CUDA,\n                     border_align_forward_cuda);\nREGISTER_DEVICE_IMPL(border_align_backward_impl, CUDA,\n                     border_align_backward_cuda);\n\nvoid box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_rotated_impl, CUDA, box_iou_rotated_cuda);\n\nvoid box_iou_quadri_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool 
aligned);\n\nvoid box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_quadri_impl, CUDA, box_iou_quadri_cuda);\n\nvoid CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,\n                                     Tensor rfeatures, Tensor routput,\n                                     Tensor rmasks, Tensor output,\n                                     const int kernel_size,\n                                     const int group_size,\n                                     const int scale_factor);\n\nvoid CARAFEBackwardCUDAKernelLauncher(\n    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,\n    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,\n    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,\n    const int kernel_size, const int group_size, const int scale_factor);\n\nvoid carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor) {\n  CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,\n                                  output, kernel_size, group_size,\n                                  scale_factor);\n}\n\nvoid carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor) {\n  CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,\n                                   rbottom_grad_hs, rbottom_grad, rmask_grad,\n                                   bottom_grad, mask_grad, kernel_size,\n                          
         group_size, scale_factor);\n}\n\nvoid carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_forward_impl, CUDA, carafe_forward_cuda);\nREGISTER_DEVICE_IMPL(carafe_backward_impl, CUDA, carafe_backward_cuda);\n\nvoid CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,\n                                          const Tensor masks, Tensor output,\n                                          const int kernel_size,\n                                          const int group_size,\n                                          const int scale_factor);\n\nvoid CARAFENAIVEBackwardCUDAKernelLauncher(\n    const Tensor top_grad, const Tensor features, const Tensor masks,\n    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,\n    const int group_size, const int scale_factor);\n\nvoid carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor) {\n  CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,\n                                       group_size, scale_factor);\n}\n\nvoid carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor) {\n  
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,\n                                        mask_grad, kernel_size, group_size,\n                                        scale_factor);\n}\nvoid carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor);\n\nvoid carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_naive_forward_impl, CUDA,\n                     carafe_naive_forward_cuda);\nREGISTER_DEVICE_IMPL(carafe_naive_backward_impl, CUDA,\n                     carafe_naive_backward_cuda);\n\nvoid CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,\n                                          Tensor output, int kH, int kW,\n                                          int patchH, int patchW, int padH,\n                                          int padW, int dilationH,\n                                          int dilationW, int dilation_patchH,\n                                          int dilation_patchW, int dH, int dW);\n\nvoid CorrelationBackwardCUDAKernelLauncher(Tensor grad_output, Tensor input1,\n                                           Tensor input2, Tensor grad_input1,\n                                           Tensor grad_input2, int kH, int kW,\n                                           int patchH, int patchW, int padH,\n                                           int padW, int dilationH,\n                                           int dilationW, int dilation_patchH,\n                                           int dilation_patchW, int dH, int dW);\n\nvoid correlation_forward_cuda(Tensor input1, Tensor input2, Tensor output,\n                        
      int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW) {\n  CorrelationForwardCUDAKernelLauncher(\n      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,\n      dilationW, dilation_patchH, dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_cuda(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW) {\n  CorrelationBackwardCUDAKernelLauncher(\n      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,\n      patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW);\n}\n\nvoid correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW);\n\nvoid correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW);\n\nREGISTER_DEVICE_IMPL(correlation_forward_impl, CUDA, 
correlation_forward_cuda);\nREGISTER_DEVICE_IMPL(correlation_backward_impl, CUDA,\n                     correlation_backward_cuda);\n\nvoid deformable_im2col_cuda(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_cuda(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_cuda(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nvoid deformable_im2col_impl(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n            
                const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_impl(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_impl(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nREGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);\nREGISTER_DEVICE_IMPL(deformable_col2im_impl, CUDA, deformable_col2im_cuda);\nREGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CUDA,\n                     deformable_col2im_coord_cuda);\n\nvoid DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                            Tensor offset, Tensor output,\n                                            int pooled_height, int pooled_width,\n                                            float spatial_scale,\n                                            int sampling_ratio, float gamma);\n\nvoid DeformRoIPoolBackwardCUDAKernelLauncher(\n    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,\n    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,\n    float 
spatial_scale, int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,\n                                         pooled_height, pooled_width,\n                                         spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma) {\n  DeformRoIPoolBackwardCUDAKernelLauncher(\n      grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,\n      pooled_width, spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma);\n\nREGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, CUDA,\n                     
deform_roi_pool_forward_cuda);\nREGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, CUDA,\n                     deform_roi_pool_backward_cuda);\n\nvoid SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha);\n\nvoid SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,\n                                                Tensor weight,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha);\n\nvoid SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha);\n\nvoid SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,\n                                                Tensor weight, Tensor buff,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha);\n\nvoid sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,\n                                            gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha) {\n  
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,\n                                             gamma, alpha);\n}\n\nvoid softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,\n                                            gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward_cuda(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha) {\n  SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,\n                                             grad_input, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha);\n\nvoid softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid softmax_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha);\n\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA,\n                     sigmoid_focal_loss_forward_cuda);\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, CUDA,\n                     sigmoid_focal_loss_backward_cuda);\nREGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, CUDA,\n                     
softmax_focal_loss_forward_cuda);\nREGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, CUDA,\n                     softmax_focal_loss_backward_cuda);\n\nvoid FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,\n                                                    const float *dataset,\n                                                    float *temp, int *idxs);\n\nvoid FurthestPointSamplingWithDistForwardCUDAKernelLauncher(\n    int b, int n, int m, const float *dataset, float *temp, int *idxs);\n\nvoid furthest_point_sampling_forward_cuda(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m) {\n  const float *dataset = points_tensor.data_ptr<float>();\n  float *temp = temp_tensor.data_ptr<float>();\n  int *idxs = idx_tensor.data_ptr<int>();\n  FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);\n}\n\nvoid furthest_point_sampling_with_dist_forward_cuda(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m) {\n  const float *dataset = points_tensor.data_ptr<float>();\n  float *temp = temp_tensor.data_ptr<float>();\n  int *idxs = idx_tensor.data_ptr<int>();\n  FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp,\n                                                         idxs);\n}\n\nvoid furthest_point_sampling_forward_impl(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m);\n\nvoid furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor 
idx_tensor, int b,\n                                                    int n, int m);\n\nREGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA,\n                     furthest_point_sampling_forward_cuda);\nREGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, CUDA,\n                     furthest_point_sampling_with_dist_forward_cuda);\n\ntorch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input,\n                                      const torch::Tensor &bias,\n                                      const torch::Tensor &refer, int act,\n                                      int grad, float alpha, float scale);\n\ntorch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor &input,\n                                           const torch::Tensor &bias,\n                                           const torch::Tensor &refer, int act,\n                                           int grad, float alpha, float scale);\nREGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,\n                     fused_bias_leakyrelu_op);\n\ntorch::Tensor bias_act_op_impl(const torch::Tensor &input,\n                               const torch::Tensor &bias,\n                               const torch::Tensor &xref,\n                               const torch::Tensor &yref,\n                               const torch::Tensor &dy, int grad, int dim,\n                               int act, float alpha, float gain, float clamp);\n\ntorch::Tensor bias_act_op(const torch::Tensor &input, const torch::Tensor &bias,\n                          const torch::Tensor &xref, const torch::Tensor &yref,\n                          const torch::Tensor &dy, int grad, int dim, int act,\n                          float alpha, float gain, float clamp);\n\nREGISTER_DEVICE_IMPL(bias_act_op_impl, CUDA, bias_act_op);\n\ntorch::Tensor filtered_lrelu_act_op_impl(torch::Tensor x, torch::Tensor si,\n                                         int sx, int sy, float gain,\n                   
                      float slope, float clamp,\n                                         bool writeSigns);\n\ntorch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,\n                                    int sy, float gain, float slope,\n                                    float clamp, bool writeSigns);\n\nREGISTER_DEVICE_IMPL(filtered_lrelu_act_op_impl, CUDA, filtered_lrelu_act_op);\n\nvoid GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                           const Tensor points,\n                                           const Tensor idx, Tensor out);\n\nvoid GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                            const Tensor grad_out,\n                                            const Tensor idx,\n                                            Tensor grad_points);\n\nvoid gather_points_forward_cuda(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out) {\n  GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);\n};\n\nvoid gather_points_backward_cuda(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points) {\n  GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,\n                                         grad_points);\n};\n\nvoid gather_points_forward_impl(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out);\n\nvoid gather_points_backward_impl(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points);\n\nREGISTER_DEVICE_IMPL(gather_points_forward_impl, CUDA,\n                     
gather_points_forward_cuda);\nREGISTER_DEVICE_IMPL(gather_points_backward_impl, CUDA,\n                     gather_points_backward_cuda);\n\nvoid GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                          int nsample, const Tensor points,\n                                          const Tensor idx, Tensor out);\n\nvoid GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                           int nsample, const Tensor grad_out,\n                                           const Tensor idx,\n                                           Tensor grad_points);\n\nvoid group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out) {\n  GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,\n                                       out);\n};\n\nvoid group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points) {\n  GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,\n                                        idx, grad_points);\n};\n\nvoid group_points_forward_impl(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out);\n\nvoid group_points_backward_impl(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points);\n\nREGISTER_DEVICE_IMPL(group_points_forward_impl, CUDA,\n                     group_points_forward_cuda);\nREGISTER_DEVICE_IMPL(group_points_backward_impl, CUDA,\n                     group_points_backward_cuda);\n\nvoid 
StackGroupPointsForwardCUDAKernelLauncher(\n    int b, int c, int m, int nsample, const Tensor features_tensor,\n    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,\n    const Tensor idx_batch_cnt_tensor, Tensor out_tensor);\nvoid StackGroupPointsBackwardCUDAKernelLauncher(\n    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,\n    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,\n    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor);\n\nvoid stack_group_points_forward_cuda(int b, int c, int m, int nsample,\n                                     const Tensor features_tensor,\n                                     const Tensor features_batch_cnt_tensor,\n                                     const Tensor idx_tensor,\n                                     const Tensor idx_batch_cnt_tensor,\n                                     Tensor out_tensor) {\n  StackGroupPointsForwardCUDAKernelLauncher(\n      b, c, m, nsample, features_tensor, features_batch_cnt_tensor, idx_tensor,\n      idx_batch_cnt_tensor, out_tensor);\n};\n\nvoid stack_group_points_backward_cuda(int b, int c, int m, int n, int nsample,\n                                      const Tensor grad_out_tensor,\n                                      const Tensor idx_tensor,\n                                      const Tensor idx_batch_cnt_tensor,\n                                      const Tensor features_batch_cnt_tensor,\n                                      Tensor grad_features_tensor) {\n  StackGroupPointsBackwardCUDAKernelLauncher(\n      b, c, m, n, nsample, grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,\n      features_batch_cnt_tensor, grad_features_tensor);\n};\n\nvoid stack_group_points_forward_impl(int b, int c, int m, int nsample,\n                                     const Tensor features_tensor,\n                                     const Tensor features_batch_cnt_tensor,\n                                     const Tensor 
idx_tensor,\n                                     const Tensor idx_batch_cnt_tensor,\n                                     Tensor out_tensor);\n\nvoid stack_group_points_backward_impl(int b, int c, int m, int n, int nsample,\n                                      const Tensor grad_out_tensor,\n                                      const Tensor idx_tensor,\n                                      const Tensor idx_batch_cnt_tensor,\n                                      const Tensor features_batch_cnt_tensor,\n                                      Tensor grad_features_tensor);\n\nREGISTER_DEVICE_IMPL(stack_group_points_forward_impl, CUDA,\n                     stack_group_points_forward_cuda);\nREGISTER_DEVICE_IMPL(stack_group_points_backward_impl, CUDA,\n                     stack_group_points_backward_cuda);\n\nvoid IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,\n                                                   const Tensor boxes_a,\n                                                   const int num_b,\n                                                   const Tensor boxes_b,\n                                                   Tensor ans_overlap);\n\nvoid IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor &keep,\n                                         Tensor &keep_num,\n                                         float nms_overlap_thresh);\n\nvoid IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor &keep,\n                                               Tensor &keep_num,\n                                               float nms_overlap_thresh);\n\nvoid iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap) {\n  IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,\n                                                ans_overlap);\n};\n\nvoid 
iou3d_nms3d_forward_cuda(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh) {\n  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, keep, keep_num,\n                                      nms_overlap_thresh);\n};\n\nvoid iou3d_nms3d_normal_forward_cuda(const Tensor boxes, Tensor &keep,\n                                     Tensor &keep_num,\n                                     float nms_overlap_thresh) {\n  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, keep, keep_num,\n                                            nms_overlap_thresh);\n};\n\nvoid iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap);\n\nvoid iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh);\n\nvoid iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,\n                                     Tensor &keep_num,\n                                     float nms_overlap_thresh);\n\nREGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,\n                     iou3d_boxes_overlap_bev_forward_cuda);\nREGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, CUDA, iou3d_nms3d_forward_cuda);\nREGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, CUDA,\n                     iou3d_nms3d_normal_forward_cuda);\n\nvoid KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,\n                                  const Tensor xyz, const Tensor new_xyz,\n                                  Tensor idx, Tensor dist2);\n\nvoid knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2) {\n  KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);\n}\n\nvoid knn_forward_impl(int b, int n, int m, int nsample, const 
Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2);\nREGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);\n\nvoid MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int kernel_h,\n                                           const int kernel_w, const int pad_h,\n                                           const int pad_w);\n\nvoid MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int height,\n                                           const int width, const int channels);\n\nvoid masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,\n                                        kernel_h, kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,\n                                        width, channels);\n}\n\nvoid 
masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w);\n\nvoid masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels);\n\nREGISTER_DEVICE_IMPL(masked_im2col_forward_impl, CUDA,\n                     masked_im2col_forward_cuda);\nREGISTER_DEVICE_IMPL(masked_col2im_forward_impl, CUDA,\n                     masked_col2im_forward_cuda);\n\nvoid modulated_deformable_im2col_cuda(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col);\n\nvoid modulated_deformable_col2im_cuda(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_cuda(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    
const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nvoid modulated_deformable_im2col_impl(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col);\n\nvoid modulated_deformable_col2im_impl(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_impl(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nREGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CUDA,\n                     modulated_deformable_im2col_cuda);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CUDA,\n                     modulated_deformable_col2im_cuda);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CUDA,\n                   
  modulated_deformable_col2im_coord_cuda);\n\nTensor ms_deform_attn_cuda_forward(const Tensor &value,\n                                   const Tensor &spatial_shapes,\n                                   const Tensor &level_start_index,\n                                   const Tensor &sampling_loc,\n                                   const Tensor &attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_cuda_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);\n\nTensor ms_deform_attn_impl_forward(const Tensor &value,\n                                   const Tensor &spatial_shapes,\n                                   const Tensor &level_start_index,\n                                   const Tensor &sampling_loc,\n                                   const Tensor &attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);\n\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, CUDA,\n                     ms_deform_attn_cuda_forward);\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, CUDA,\n                     ms_deform_attn_cuda_backward);\n\nTensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,\n                             int offset);\n\nTensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);\n}\n\nTensor nms_impl(Tensor boxes, Tensor scores, 
float iou_threshold, int offset);\nREGISTER_DEVICE_IMPL(nms_impl, CUDA, nms_cuda);\n\nvoid PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,\n                                                int pts_num, const Tensor boxes,\n                                                const Tensor pts,\n                                                Tensor box_idx_of_points);\n\nvoid PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,\n                                               int pts_num, const Tensor boxes,\n                                               const Tensor pts,\n                                               Tensor box_idx_of_points);\n\nvoid points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points) {\n  PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,\n                                             boxes, pts, box_idx_of_points);\n};\n\nvoid points_in_boxes_all_forward_cuda(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points) {\n  PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,\n                                            boxes, pts, box_idx_of_points);\n};\n\nvoid points_in_boxes_part_forward_impl(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points);\n\nvoid points_in_boxes_all_forward_impl(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                 
     const Tensor pts,\n                                      Tensor box_idx_of_points);\nREGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, CUDA,\n                     points_in_boxes_part_forward_cuda);\nREGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, CUDA,\n                     points_in_boxes_all_forward_cuda);\n\nvoid PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,\n                                      Tensor output, const int num_,\n                                      const int h_feature, const int w_feature,\n                                      const int h_mask, const int w_mask,\n                                      const int half_h_mask,\n                                      const int half_w_mask);\n\nvoid PSAMaskBackwardCUDAKernelLauncher(\n    const int psa_type, const Tensor grad_output, Tensor grad_input,\n    const int num_, const int h_feature, const int w_feature, const int h_mask,\n    const int w_mask, const int half_h_mask, const int half_w_mask);\n\nvoid psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask) {\n  PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,\n                                   w_feature, h_mask, w_mask, half_h_mask,\n                                   half_w_mask);\n}\n\nvoid psamask_backward_cuda(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask) {\n  PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,\n      
                              h_feature, w_feature, h_mask, w_mask,\n                                    half_h_mask, half_w_mask);\n}\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask);\nREGISTER_DEVICE_IMPL(psamask_forward_impl, CUDA, psamask_forward_cuda);\nREGISTER_DEVICE_IMPL(psamask_backward_impl, CUDA, psamask_backward_cuda);\n\nvoid ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned);\n\nvoid ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                        Tensor argmax_y, Tensor argmax_x,\n                                        Tensor grad_input, int aligned_height,\n                                        int aligned_width, float spatial_scale,\n                                        int sampling_ratio, int pool_mode,\n                                        bool aligned);\n\nvoid roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int 
aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignForwardCUDAKernelLauncher(\n      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,\n      spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned) {\n  ROIAlignBackwardCUDAKernelLauncher(\n      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,\n      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);\nREGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);\n\nvoid ROIAlignRotatedForwardCUDAKernelLauncher(\n    const at::Tensor input, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int 
pooled_height, const int pooled_width, at::Tensor output);\n\nvoid ROIAlignRotatedBackwardCUDAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);\n\nvoid roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n\n  int num_channels = input.size(1);\n  int data_height = input.size(2);\n  int data_width = input.size(3);\n  ROIAlignRotatedForwardCUDAKernelLauncher(\n      input, rois, spatial_scale, sampling_ratio, aligned, clockwise,\n      num_channels, data_height, data_width, num_rois, aligned_height,\n      aligned_width, output);\n}\n\nvoid roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n\n  int num_channels = bottom_grad.size(1);\n  int data_height = bottom_grad.size(2);\n  int data_width = bottom_grad.size(3);\n  ROIAlignRotatedBackwardCUDAKernelLauncher(\n      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,\n      num_channels, data_height, 
data_width, num_rois, aligned_height,\n      aligned_width, bottom_grad);\n}\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise);\nREGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,\n                     roi_align_rotated_forward_cuda);\nREGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,\n                     roi_align_rotated_backward_cuda);\n\nvoid RiROIAlignRotatedForwardCUDAKernelLauncher(\n    const at::Tensor features, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor output);\n\nvoid RiROIAlignRotatedBackwardCUDAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor bottom_grad);\n\nvoid riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int 
num_orientations,\n                                      bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  CHECK_CONTIGUOUS(features);\n  CHECK_CONTIGUOUS(rois);\n  int num_channels = features.size(1) / num_orientations;\n  int data_height = features.size(2);\n  int data_width = features.size(3);\n  RiROIAlignRotatedForwardCUDAKernelLauncher(\n      features, rois, spatial_scale, num_samples, clockwise, num_channels,\n      data_height, data_width, num_rois, pooled_height, pooled_width,\n      num_orientations, output);\n}\n\nvoid riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  CHECK_CONTIGUOUS(top_grad);\n  CHECK_CONTIGUOUS(rois);\n  int num_channels = bottom_grad.size(1) / num_orientations;\n  int data_height = bottom_grad.size(2);\n  int data_width = bottom_grad.size(3);\n  RiROIAlignRotatedBackwardCUDAKernelLauncher(\n      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,\n      data_height, data_width, num_rois, pooled_height, pooled_width,\n      num_orientations, bottom_grad);\n}\n\nvoid riroi_align_rotated_forward_impl(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int num_orientations,\n                                      bool clockwise);\n\nvoid 
riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise);\n\nREGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,\n                     riroi_align_rotated_forward_cuda);\nREGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,\n                     riroi_align_rotated_backward_cuda);\n\nvoid RoiawarePool3dForwardCUDAKernelLauncher(\n    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,\n    int out_y, int out_z, const Tensor rois, const Tensor pts,\n    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,\n    Tensor pooled_features, int pool_method);\n\nvoid RoiawarePool3dBackwardCUDAKernelLauncher(\n    int boxes_num, int out_x, int out_y, int out_z, int channels,\n    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,\n    const Tensor grad_out, Tensor grad_in, int pool_method);\n\nvoid roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method) {\n  RoiawarePool3dForwardCUDAKernelLauncher(\n      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,\n      rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features,\n      pool_method);\n};\n\nvoid roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n        
                           int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method) {\n  RoiawarePool3dBackwardCUDAKernelLauncher(\n      boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,\n      pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method);\n};\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method);\n\nvoid roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method);\n\nREGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, CUDA,\n                     roiaware_pool3d_forward_cuda);\nREGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, CUDA,\n                     roiaware_pool3d_backward_cuda);\n\nvoid RoIPointPool3dForwardCUDAKernelLauncher(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,\n    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);\n\nvoid roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                        
          const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag) {\n  RoIPointPool3dForwardCUDAKernelLauncher(\n      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,\n      boxes3d, pts_feature, pooled_features, pooled_empty_flag);\n};\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag);\nREGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, CUDA,\n                     roipoint_pool3d_forward_cuda);\n\nvoid ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                      Tensor argmax, int pooled_height,\n                                      int pooled_width, float spatial_scale);\n\nvoid ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                       Tensor argmax, Tensor grad_input,\n                                       int pooled_height, int pooled_width,\n                                       float spatial_scale);\n\nvoid roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale) {\n  ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,\n                                   pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            
int pooled_width, float spatial_scale) {\n  ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,\n                                    pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale);\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale);\nREGISTER_DEVICE_IMPL(roi_pool_forward_impl, CUDA, roi_pool_forward_cuda);\nREGISTER_DEVICE_IMPL(roi_pool_backward_impl, CUDA, roi_pool_backward_cuda);\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\n\nstd::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(\n    const at::Tensor &feats, const at::Tensor &coors,\n    const reduce_t reduce_type);\n\nvoid DynamicPointToVoxelBackwardCUDAKernelLauncher(\n    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,\n    const at::Tensor &feats, const at::Tensor &reduced_feats,\n    const at::Tensor &coors_map, const at::Tensor &reduce_count,\n    const reduce_t reduce_type);\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_cuda(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const reduce_t reduce_type) {\n  return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors,\n                                                      reduce_type);\n};\n\nvoid dynamic_point_to_voxel_backward_cuda(\n    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,\n    const torch::Tensor &feats, const torch::Tensor &reduced_feats,\n    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,\n    const reduce_t reduce_type) {\n  DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats,\n                                             
   feats, reduced_feats, coors_idx,\n                                                reduce_count, reduce_type);\n};\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const reduce_t reduce_type);\n\nvoid dynamic_point_to_voxel_backward_impl(\n    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,\n    const torch::Tensor &feats, const torch::Tensor &reduced_feats,\n    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,\n    const reduce_t reduce_type);\n\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA,\n                     dynamic_point_to_voxel_forward_cuda);\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA,\n                     dynamic_point_to_voxel_backward_cuda);\n\nvoid SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);\n\nvoid SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,\n                                        Tensor var);\n\nvoid SyncBNForwardOutputCUDAKernelLauncher(\n    const Tensor input, const Tensor mean, const Tensor var,\n    Tensor running_mean, Tensor running_var, const Tensor weight,\n    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,\n    float momentum, int group_size);\n\nvoid SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,\n                                           const Tensor norm,\n                                           Tensor grad_weight,\n                                           Tensor grad_bias);\n\nvoid SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,\n                                          const Tensor weight,\n                                          const Tensor grad_weight,\n                                          const Tensor grad_bias,\n                                          const Tensor norm, const Tensor std,\n                                          Tensor 
grad_input);\n\nvoid sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {\n  SyncBNForwardMeanCUDAKernelLauncher(input, mean);\n}\n\nvoid sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,\n                              Tensor var) {\n  SyncBNForwardVarCUDAKernelLauncher(input, mean, var);\n}\n\nvoid sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size) {\n  SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,\n                                        running_var, weight, bias, norm, std,\n                                        output, eps, momentum, group_size);\n}\n\nvoid sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias) {\n  SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,\n                                        grad_bias);\n}\n\nvoid sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input) {\n  SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,\n                                       grad_bias, norm, std, grad_input);\n}\n\nvoid sync_bn_forward_mean_impl(const Tensor input, Tensor mean);\n\nvoid sync_bn_forward_var_impl(const Tensor input, const Tensor mean,\n                              Tensor var);\n\nvoid sync_bn_forward_output_impl(const Tensor input, const Tensor mean,\n                                 const 
Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size);\n\nvoid sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias);\n\nvoid sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input);\n\nREGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, CUDA,\n                     sync_bn_forward_mean_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, CUDA, sync_bn_forward_var_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, CUDA,\n                     sync_bn_forward_output_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, CUDA,\n                     sync_bn_backward_param_cuda);\nREGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, CUDA,\n                     sync_bn_backward_data_cuda);\n\nvoid ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,\n                                               const Tensor points,\n                                               const Tensor idx,\n                                               const Tensor weight, Tensor out);\n\nvoid ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,\n                                                const Tensor grad_out,\n                                                const Tensor idx,\n                                                const Tensor weight,\n                                                Tensor grad_points);\n\nvoid three_interpolate_forward_cuda(int b, int c, int m, int n,\n           
                         const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out) {\n  ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight,\n                                            out);\n};\n\nvoid three_interpolate_backward_cuda(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points) {\n  ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, idx, weight,\n                                             grad_points);\n};\n\nvoid three_interpolate_forward_impl(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out);\n\nvoid three_interpolate_backward_impl(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points);\nREGISTER_DEVICE_IMPL(three_interpolate_forward_impl, CUDA,\n                     three_interpolate_forward_cuda);\nREGISTER_DEVICE_IMPL(three_interpolate_backward_impl, CUDA,\n                     three_interpolate_backward_cuda);\n\nvoid ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,\n                                      const Tensor known, Tensor dist2,\n                                      Tensor idx);\n\nvoid three_nn_forward_cuda(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx) {\n  ThreeNNForwardCUDAKernelLauncher(b, n, m, unknown, known, dist2, idx);\n};\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx);\nREGISTER_DEVICE_IMPL(three_nn_forward_impl, CUDA, three_nn_forward_cuda);\n\nvoid 
TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,\n                                       Tensor output);\n\nvoid TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,\n                                        Tensor grad_input);\n\nvoid tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) {\n  TINShiftForwardCUDAKernelLauncher(input, shift, output);\n}\n\nvoid tin_shift_backward_cuda(Tensor grad_output, Tensor shift,\n                             Tensor grad_input) {\n  TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input);\n}\n\nvoid tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);\nvoid tin_shift_backward_impl(Tensor grad_output, Tensor shift,\n                             Tensor grad_input);\nREGISTER_DEVICE_IMPL(tin_shift_forward_impl, CUDA, tin_shift_forward_cuda);\nREGISTER_DEVICE_IMPL(tin_shift_backward_impl, CUDA, tin_shift_backward_cuda);\n\ntorch::Tensor upfirdn2d_op(torch::Tensor input, torch::Tensor filter, int upx,\n                           int upy, int downx, int downy, int padx0, int padx1,\n                           int pady0, int pady1, bool flip, float gain);\n\ntorch::Tensor upfirdn2d_op_impl(torch::Tensor input, torch::Tensor filter,\n                                int upx, int upy, int downx, int downy,\n                                int padx0, int padx1, int pady0, int pady1,\n                                bool flip, float gain);\nREGISTER_DEVICE_IMPL(upfirdn2d_op_impl, CUDA, upfirdn2d_op);\n\nint HardVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3);\n\nint NondeterministicHardVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const 
std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3);\n\nvoid DynamicVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor &points, at::Tensor &coors,\n    const std::vector<float> voxel_size, const std::vector<float> coors_range,\n    const int NDim = 3);\n\nint hard_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim) {\n  return HardVoxelizeForwardCUDAKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n};\n\nint nondeterministic_hard_voxelize_forward_cuda(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim) {\n  return NondeterministicHardVoxelizeForwardCUDAKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n};\n\nvoid dynamic_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim) {\n  DynamicVoxelizeForwardCUDAKernelLauncher(points, coors, voxel_size,\n                                           coors_range, NDim);\n};\n\nint hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,\n                        
       at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim);\n\nint nondeterministic_hard_voxelize_forward_impl(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim);\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim);\n\nREGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,\n                     hard_voxelize_forward_cuda);\nREGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,\n                     nondeterministic_hard_voxelize_forward_cuda);\nREGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,\n                     dynamic_voxelize_forward_cuda);\n\nvoid RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,\n                                                  const Tensor best_bboxes,\n                                                  const float spatial_scale,\n                                                  const int points,\n                                                  Tensor output);\n\nvoid RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,\n                                                   const Tensor best_bboxes,\n                                                   const float spatial_scale,\n                                                   const int points,\n                   
                                Tensor bottom_grad);\n\nvoid rotated_feature_align_forward_cuda(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output) {\n  RotatedFeatureAlignForwardCUDAKernelLauncher(features, best_bboxes,\n                                               spatial_scale, points, output);\n};\n\nvoid rotated_feature_align_backward_cuda(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad) {\n  RotatedFeatureAlignBackwardCUDAKernelLauncher(\n      top_grad, best_bboxes, spatial_scale, points, bottom_grad);\n};\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output);\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad);\n\nREGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,\n                     rotated_feature_align_forward_cuda);\nREGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,\n                     rotated_feature_align_backward_cuda);\n\nvoid PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,\n                                               const at::Tensor polygons,\n                                               const int rows, const int cols,\n                                               at::Tensor 
output);\n\nvoid points_in_polygons_forward_cuda(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols) {\n  PointsInPolygonsForwardCUDAKernelLauncher(points, polygons, rows, cols,\n                                            output);\n};\n\nvoid points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols);\n\nREGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,\n                     points_in_polygons_forward_cuda);\n\ntorch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,\n                                                     torch::Tensor indicePairs,\n                                                     torch::Tensor indiceNum,\n                                                     int64_t numAct);\n\ntorch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,\n                                          torch::Tensor indicePairs,\n                                          torch::Tensor indiceNum,\n                                          int64_t numAct) {\n  return IndiceMaxpoolForwardCUDAKernelLauncher(features, indicePairs,\n                                                indiceNum, numAct);\n};\n\ntorch::Tensor indice_maxpool_forward_impl(torch::Tensor features,\n                                          torch::Tensor indicePairs,\n                                          torch::Tensor indiceNum,\n                                          int64_t numAct);\nREGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,\n                     indice_maxpool_forward_cuda);\n\ntorch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,\n                                                      torch::Tensor outFeatures,\n                                                      
torch::Tensor outGrad,\n                                                      torch::Tensor indicePairs,\n                                                      torch::Tensor indiceNum);\n\ntorch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,\n                                           torch::Tensor outFeatures,\n                                           torch::Tensor outGrad,\n                                           torch::Tensor indicePairs,\n                                           torch::Tensor indiceNum) {\n  return IndiceMaxpoolBackwardCUDAKernelLauncher(features, outFeatures, outGrad,\n                                                 indicePairs, indiceNum);\n};\n\ntorch::Tensor indice_maxpool_backward_impl(torch::Tensor features,\n                                           torch::Tensor outFeatures,\n                                           torch::Tensor outGrad,\n                                           torch::Tensor indicePairs,\n                                           torch::Tensor indiceNum);\n\nREGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,\n                     indice_maxpool_backward_cuda)\n\ntorch::Tensor IndiceConvForwardCUDAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,\n    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,\n    int64_t _subM);\n\ntorch::Tensor indice_conv_forward_cuda(torch::Tensor features,\n                                       torch::Tensor filters,\n                                       torch::Tensor indicePairs,\n                                       torch::Tensor indiceNum,\n                                       int64_t numActOut, int64_t _inverse,\n                                       int64_t _subM) {\n  return IndiceConvForwardCUDAKernelLauncher(\n      features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);\n};\n\ntorch::Tensor indice_conv_forward_impl(torch::Tensor features,\n                             
          torch::Tensor filters,\n                                       torch::Tensor indicePairs,\n                                       torch::Tensor indiceNum,\n                                       int64_t numActOut, int64_t _inverse,\n                                       int64_t _subM);\n\nREGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);\n\nstd::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM);\n\nstd::vector<torch::Tensor> indice_conv_backward_cuda(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  return IndiceConvBackwardCUDAKernelLauncher(\n      features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);\n};\n\nstd::vector<torch::Tensor> indice_conv_backward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM);\n\nREGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,\n                     indice_conv_backward_cuda);\n\ntorch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM);\n\ntorch::Tensor fused_indice_conv_batchnorm_forward_cuda(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM) {\n  return FusedIndiceConvBatchnormCUDAKernelLauncher(features, filters, bias,\n                                                    indicePairs, indiceNum,\n                                    
                numActOut, _inverse, _subM);\n};\n\ntorch::Tensor fused_indice_conv_batchnorm_forward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM);\n\nREGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,\n                     fused_indice_conv_batchnorm_forward_cuda)\n\nvoid MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);\n\nvoid min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {\n  MinAreaPolygonsCUDAKernelLauncher(pointsets, polygons);\n}\n\nvoid min_area_polygons_impl(const Tensor pointsets, Tensor polygons);\n\nREGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);\n\nvoid ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,\n                                                  const Tensor indices,\n                                                  Tensor output);\n\nvoid ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,\n                                                   const Tensor indices,\n                                                   Tensor grad_in);\n\nvoid active_rotated_filter_forward_cuda(const Tensor input,\n                                        const Tensor indices, Tensor output) {\n  ActiveRotatedFilterForwardCUDAKernelLauncher(input, indices, output);\n};\n\nvoid active_rotated_filter_backward_cuda(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in) {\n  ActiveRotatedFilterBackwardCUDAKernelLauncher(grad_out, indices, grad_in);\n};\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output);\n\nvoid active_rotated_filter_backward_impl(const Tensor grad_out,\n                                         const Tensor indices, Tensor 
grad_in);\n\nREGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,\n                     active_rotated_filter_forward_cuda);\nREGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,\n                     active_rotated_filter_backward_cuda);\n\nvoid ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                 Tensor ious);\n\nvoid ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                  Tensor output);\n\nvoid convex_iou_cuda(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious) {\n  ConvexIoUCUDAKernelLauncher(pointsets, polygons, ious);\n}\n\nvoid convex_giou_cuda(const Tensor pointsets, const Tensor polygons,\n                      Tensor output) {\n  ConvexGIoUCUDAKernelLauncher(pointsets, polygons, output);\n}\n\nvoid convex_iou_impl(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious);\n\nvoid convex_giou_impl(const Tensor pointsets, const Tensor polygons,\n                      Tensor output);\n\nREGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);\nREGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);\n\nTensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,\n                                                    Tensor mask,\n                                                    Tensor num_valid);\n\nTensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid) {\n  return DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask,\n                                                      num_valid);\n}\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid);\n\nREGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,\n                     
diff_iou_rotated_sort_vertices_forward_cuda);\n\nvoid ChamferDistanceForwardCUDAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,\n    const Tensor dist2, const Tensor idx1, const Tensor idx2);\n\nvoid ChamferDistanceBackwardCUDAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,\n    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2);\n\nvoid chamfer_distance_forward_cuda(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2) {\n  ChamferDistanceForwardCUDAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,\n                                           idx2);\n};\n\nvoid chamfer_distance_backward_cuda(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2) {\n  ChamferDistanceBackwardCUDAKernelLauncher(xyz1, xyz2, idx1, idx2, graddist1,\n                                            graddist2, gradxyz1, gradxyz2);\n};\n\nvoid chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2);\n\nvoid chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2);\n\nREGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,\n                     chamfer_distance_forward_cuda);\nREGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,\n                     chamfer_distance_backward_cuda);\n\nvoid 
PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                        Tensor output, int pooled_height,\n                                        int pooled_width, float spatial_scale);\n\nvoid PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                         Tensor grad_input, int pooled_height,\n                                         int pooled_width, float spatial_scale);\n\nvoid PrROIPoolCoorBackwardCUDAKernelLauncher(\n    Tensor output, Tensor grad_output, Tensor input, Tensor rois,\n    Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);\n\nvoid prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale) {\n  PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,\n                                     pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,\n                                      pooled_height, pooled_width,\n                                      spatial_scale);\n}\n\nvoid prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale) {\n  PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,\n                                          grad_rois, pooled_height,\n                                          pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                    
         int pooled_height, int pooled_width,\n                             float spatial_scale);\nvoid prroi_pool_backward_impl(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale);\nvoid prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale);\nREGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);\nREGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);\nREGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,\n                     prroi_pool_coor_backward_cuda);\n\nvoid BezierAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                          Tensor output, int aligned_height,\n                                          int aligned_width,\n                                          float spatial_scale,\n                                          int sampling_ratio, bool aligned);\n\nvoid BezierAlignBackwardCUDAKernelLauncher(\n    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,\n    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned);\n\nvoid bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                               int aligned_height, int aligned_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned);\n\nvoid bezier_align_backward_impl(Tensor grad_output, Tensor rois,\n                                Tensor grad_input, int aligned_height,\n                                int aligned_width, float spatial_scale,\n                                int sampling_ratio, bool 
aligned);\n\nREGISTER_DEVICE_IMPL(bezier_align_forward_impl, CUDA,\n                     BezierAlignForwardCUDAKernelLauncher);\nREGISTER_DEVICE_IMPL(bezier_align_backward_impl, CUDA,\n                     BezierAlignBackwardCUDAKernelLauncher);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"deform_conv_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid deformable_im2col_cuda(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col) {\n  // num_axes should be smaller than block size\n  // todo: check parallel_imgs is correctly passed in\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels = channels * height_col * width_col * parallel_imgs;\n  int channel_per_deformable_group = channels / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_im.scalar_type(), \"deformable_im2col_gpu\", ([&] {\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n\n        deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels),\n                                       THREADS_PER_BLOCK, 0,\n                                       at::cuda::getCurrentCUDAStream()>>>(\n            num_kernels, data_im_, data_offset_, height, width, ksize_h,\n            ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,\n            channel_per_deformable_group, parallel_imgs, channels,\n            deformable_group, height_col, width_col, data_col_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid 
deformable_col2im_cuda(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im) {\n  // todo: make sure parallel_imgs is passed in correctly\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels =\n      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;\n  int channel_per_deformable_group = channels / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"deformable_col2im_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();\n\n        deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels),\n                                       THREADS_PER_BLOCK, 0,\n                                       at::cuda::getCurrentCUDAStream()>>>(\n            num_kernels, data_col_, data_offset_, channels, height, width,\n            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,\n            dilation_w, channel_per_deformable_group, parallel_imgs,\n            deformable_group, height_col, width_col, grad_im_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid deformable_col2im_coord_cuda(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, 
const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset) {\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *\n                    deformable_group * parallel_imgs;\n  int channel_per_deformable_group =\n      channels * ksize_h * ksize_w / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"deformable_col2im_coord_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();\n\n        deformable_col2im_coord_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            at::cuda::getCurrentCUDAStream()>>>(\n            num_kernels, data_col_, data_im_, data_offset_, channels, height,\n            width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,\n            dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,\n            2 * ksize_h * ksize_w * deformable_group, deformable_group,\n            height_col, width_col, grad_offset_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"deform_roi_pool_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                            Tensor offset, Tensor output,\n                                            int pooled_height, int pooled_width,\n                                            float spatial_scale,\n                                            int sampling_ratio, float gamma) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"deform_roi_pool_forward_cuda_kernel\", [&] {\n        deform_roi_pool_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),\n                output.data_ptr<scalar_t>(), pooled_height, pooled_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio,\n                static_cast<scalar_t>(gamma), channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid DeformRoIPoolBackwardCUDAKernelLauncher(\n    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,\n    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,\n    float spatial_scale, int sampling_ratio, float gamma) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      
grad_output.scalar_type(), \"deform_roi_pool_backward_cuda_kernel\", [&] {\n        deform_roi_pool_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),\n                offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n                grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio,\n                static_cast<scalar_t>(gamma), channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/diff_iou_rotated_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Adapted from\n// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa\n#include \"diff_iou_rotated_cuda_kernel.cuh\"\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_cuda_helper.hpp\"\n\nat::Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(at::Tensor vertices,\n                                                        at::Tensor mask,\n                                                        at::Tensor num_valid) {\n  at::cuda::CUDAGuard device_guard(vertices.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  CHECK_CONTIGUOUS(vertices);\n  CHECK_CONTIGUOUS(mask);\n  CHECK_CONTIGUOUS(num_valid);\n  CHECK_CUDA(vertices);\n  CHECK_CUDA(mask);\n  CHECK_CUDA(num_valid);\n\n  int b = vertices.size(0);\n  int n = vertices.size(1);\n  int m = vertices.size(2);\n  at::Tensor idx =\n      torch::zeros({b, n, MAX_NUM_VERT_IDX},\n                   at::device(vertices.device()).dtype(at::ScalarType::Int));\n\n  diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n), 0,\n                                                       stream>>>(\n      b, n, m, vertices.data_ptr<float>(), mask.data_ptr<bool>(),\n      num_valid.data_ptr<int>(), idx.data_ptr<int>());\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  return idx;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu",
    "content": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n#include <c10/util/Half.h>\n#include <cuda_runtime.h>\n#include <torch/types.h>\n\n#include <cstdint>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\n//------------------------------------------------------------------------\n// CUDA kernel parameters.\n\nstruct filtered_lrelu_kernel_params {\n  // These parameters decide which kernel to use.\n  int up;        // upsampling ratio (1, 2, 4)\n  int down;      // downsampling ratio (1, 2, 4)\n  int2 fuShape;  // [size, 1] | [size, size]\n  int2 fdShape;  // [size, 1] | [size, size]\n\n  int _dummy;  // Alignment.\n\n  // Rest of the parameters.\n  const void *x;     // Input tensor.\n  void *y;           // Output tensor.\n  const void *b;     // Bias tensor.\n  unsigned char *s;  // Sign tensor in/out. 
NULL if unused.\n  const float *fu;   // Upsampling filter.\n  const float *fd;   // Downsampling filter.\n\n  int2 pad0;    // Left/top padding.\n  float gain;   // Additional gain factor.\n  float slope;  // Leaky ReLU slope on negative side.\n  float clamp;  // Clamp after nonlinearity.\n  int flip;     // Filter kernel flip for gradient computation.\n\n  int tilesXdim;  // Original number of horizontal output tiles.\n  int tilesXrep;  // Number of horizontal tiles per CTA.\n  int blockZofs;  // Block z offset to support large minibatch, channel\n                  // dimensions.\n\n  int4 xShape;  // [width, height, channel, batch]\n  int4 yShape;  // [width, height, channel, batch]\n  int2 sShape;  // [width, height] - width is in bytes. Contiguous. Zeros if\n                // unused.\n  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.\n  int swLimit;  // Active width of sign tensor in bytes.\n\n  longlong4 xStride;   // Strides of all tensors except signs, same component\n                       // order as shapes.\n  longlong4 yStride;   //\n  int64_t bStride;     //\n  longlong3 fuStride;  //\n  longlong3 fdStride;  //\n};\n\nstruct filtered_lrelu_act_kernel_params {\n  void *x;           // Input/output, modified in-place.\n  unsigned char *s;  // Sign tensor in/out. NULL if unused.\n\n  float gain;   // Additional gain factor.\n  float slope;  // Leaky ReLU slope on negative side.\n  float clamp;  // Clamp after nonlinearity.\n\n  int4 xShape;        // [width, height, channel, batch]\n  longlong4 xStride;  // Input/output tensor strides, same order as in shape.\n  int2 sShape;  // [width, height] - width is in elements. Contiguous. 
Zeros if\n                // unused.\n  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.\n};\n\n//------------------------------------------------------------------------\n// CUDA kernel specialization.\n\nstruct filtered_lrelu_kernel_spec {\n  void *setup;   // Function for filter kernel setup.\n  void *exec;    // Function for main operation.\n  int2 tileOut;  // Width/height of launch tile.\n  int numWarps;  // Number of warps per thread block, determines launch block\n                 // size.\n  int xrep;      // For processing multiple horizontal tiles per thread block.\n  int dynamicSharedKB;  // How much dynamic shared memory the exec kernel wants.\n};\n\n//------------------------------------------------------------------------\n// CUDA kernel selection.\n\ntemplate <class T, class index_t, bool signWrite, bool signRead>\nfiltered_lrelu_kernel_spec choose_filtered_lrelu_kernel(\n    const filtered_lrelu_kernel_params &p, int sharedKB);\ntemplate <class T, bool signWrite, bool signRead>\nvoid *choose_filtered_lrelu_act_kernel(void);\n\n//------------------------------------------------------------------------\n// Helpers.\n\nenum  // Filter modes.\n{\n  MODE_SUSD = 0,  // Separable upsampling, separable downsampling.\n  MODE_FUSD = 1,  // Full upsampling, separable downsampling.\n  MODE_SUFD = 2,  // Separable upsampling, full downsampling.\n  MODE_FUFD = 3,  // Full upsampling, full downsampling.\n};\n\ntemplate <class T>\nstruct InternalType;\ntemplate <>\nstruct InternalType<double> {\n  typedef double scalar_t;\n  typedef double2 vec2_t;\n  typedef double4 vec4_t;\n  __device__ __forceinline__ static vec2_t zero_vec2(void) {\n    return make_double2(0, 0);\n  }\n  __device__ __forceinline__ static vec4_t zero_vec4(void) {\n    return make_double4(0, 0, 0, 0);\n  }\n  __device__ __forceinline__ static double clamp(double x, double c) {\n    return fmin(fmax(x, -c), c);\n  }\n};\ntemplate <>\nstruct InternalType<float> {\n  
typedef float scalar_t;\n  typedef float2 vec2_t;\n  typedef float4 vec4_t;\n  __device__ __forceinline__ static vec2_t zero_vec2(void) {\n    return make_float2(0, 0);\n  }\n  __device__ __forceinline__ static vec4_t zero_vec4(void) {\n    return make_float4(0, 0, 0, 0);\n  }\n  __device__ __forceinline__ static float clamp(float x, float c) {\n    return fminf(fmaxf(x, -c), c);\n  }\n};\ntemplate <>\nstruct InternalType<c10::Half> {\n  typedef float scalar_t;\n  typedef float2 vec2_t;\n  typedef float4 vec4_t;\n  __device__ __forceinline__ static vec2_t zero_vec2(void) {\n    return make_float2(0, 0);\n  }\n  __device__ __forceinline__ static vec4_t zero_vec4(void) {\n    return make_float4(0, 0, 0, 0);\n  }\n  __device__ __forceinline__ static float clamp(float x, float c) {\n    return fminf(fmaxf(x, -c), c);\n  }\n};\n\n#define MIN(A, B) ((A) < (B) ? (A) : (B))\n#define MAX(A, B) ((A) > (B) ? (A) : (B))\n#define CEIL_DIV(A, B)                  \\\n  (((B) == 1)   ? (A)                   \\\n   : ((B) == 2) ? ((int)((A) + 1) >> 1) \\\n   : ((B) == 4) ? ((int)((A) + 3) >> 2) \\\n                : (((A) + ((A) > 0 ? 
(B) - 1 : 0)) / (B)))\n\n// This works only up to blocks of size 256 x 256 and for all N that are powers\n// of two.\ntemplate <int N>\n__device__ __forceinline__ void fast_div_mod(int &x, int &y, unsigned int i) {\n  if ((N & (N - 1)) && N <= 256)\n    y = (i * ((1 << 24) / N + 1)) >> 24;  // Assumes N <= 256, i < N*256.\n  else\n    y = i / N;\n\n  x = i - y * N;\n}\n\n// Type cast stride before reading it.\ntemplate <class T>\n__device__ __forceinline__ T get_stride(const int64_t &x) {\n  return *reinterpret_cast<const T *>(&x);\n}\n\n//------------------------------------------------------------------------\n// Filters, setup kernel, copying function.\n\n#define MAX_FILTER_SIZE 32\n\n// Combined up/down filter buffers so that transfer can be done with one copy.\n__device__ float\n    g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE];  // Filters in global memory,\n                                                    // written by setup kernel.\n__device__ __constant__ float\n    c_fbuf[2 * MAX_FILTER_SIZE *\n           MAX_FILTER_SIZE];  // Filters in constant memory, read by main\n                              // kernel.\n\n// Accessors to combined buffers to index up/down filters individually.\n#define c_fu (c_fbuf)\n#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)\n#define g_fu (g_fbuf)\n#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)\n\n// Set up filters into global memory buffer.\nstatic __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p) {\n  for (int idx = threadIdx.x; idx < MAX_FILTER_SIZE * MAX_FILTER_SIZE;\n       idx += blockDim.x) {\n    int x, y;\n    fast_div_mod<MAX_FILTER_SIZE>(x, y, idx);\n\n    int fu_x = p.flip ? x : (p.fuShape.x - 1 - x);\n    int fu_y = p.flip ? y : (p.fuShape.y - 1 - y);\n    if (p.fuShape.y > 0)\n      g_fu[idx] = (x >= p.fuShape.x || y >= p.fuShape.y)\n                      ? 
0.0f\n                      : p.fu[fu_x * p.fuStride.x + fu_y * p.fuStride.y];\n    else\n      g_fu[idx] =\n          (x >= p.fuShape.x || y > 0) ? 0.0f : p.fu[fu_x * p.fuStride.x];\n\n    int fd_x = p.flip ? x : (p.fdShape.x - 1 - x);\n    int fd_y = p.flip ? y : (p.fdShape.y - 1 - y);\n    if (p.fdShape.y > 0)\n      g_fd[idx] = (x >= p.fdShape.x || y >= p.fdShape.y)\n                      ? 0.0f\n                      : p.fd[fd_x * p.fdStride.x + fd_y * p.fdStride.y];\n    else\n      g_fd[idx] =\n          (x >= p.fdShape.x || y > 0) ? 0.0f : p.fd[fd_x * p.fdStride.x];\n  }\n}\n\n// Host function to copy filters written by setup kernel into constant buffer\n// for main kernel.\nstatic cudaError_t copy_filters(cudaStream_t stream) {\n  void *src = 0;\n  cudaError_t err = cudaGetSymbolAddress(&src, g_fbuf);\n  if (err) return err;\n  return cudaMemcpyToSymbolAsync(\n      c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0,\n      cudaMemcpyDeviceToDevice, stream);\n}\n\n//------------------------------------------------------------------------\n// Coordinate spaces:\n// - Relative to input tensor:      inX, inY, tileInX, tileInY\n// - Relative to input tile:        relInX, relInY, tileInW, tileInH\n// - Relative to upsampled tile:    relUpX, relUpY, tileUpW, tileUpH\n// - Relative to output tile:       relOutX, relOutY, tileOutW, tileOutH\n// - Relative to output tensor:     outX, outY, tileOutX, tileOutY\n//\n// Relationships between coordinate spaces:\n// - inX = tileInX + relInX\n// - inY = tileInY + relInY\n// - relUpX = relInX * up + phaseInX\n// - relUpY = relInY * up + phaseInY\n// - relUpX = relOutX * down\n// - relUpY = relOutY * down\n// - outX = tileOutX + relOutX\n// - outY = tileOutY + relOutY\n\nextern __shared__ char\n    s_buf_raw[];  // When sharedKB <= 48, allocate shared memory statically\n                  // inside the kernel, otherwise use the externally allocated\n                  // shared memory buffer.\n\ntemplate 
<class T, class index_t, int sharedKB, bool signWrite, bool signRead,\n          int filterMode, int up, int fuSize, int down, int fdSize,\n          int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep,\n          bool enableWriteSkip>\nstatic __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {\n  // Check that we don't try to support non-existing filter modes.\n  static_assert(up == 1 || up == 2 || up == 4,\n                \"only up=1, up=2, up=4 scales supported\");\n  static_assert(down == 1 || down == 2 || down == 4,\n                \"only down=1, down=2, down=4 scales supported\");\n  static_assert(fuSize >= up,\n                \"upsampling filter size must be at least upsampling factor\");\n  static_assert(\n      fdSize >= down,\n      \"downsampling filter size must be at least downsampling factor\");\n  static_assert(\n      fuSize % up == 0,\n      \"upsampling filter size must be divisible with upsampling factor\");\n  static_assert(\n      fdSize % down == 0,\n      \"downsampling filter size must be divisible with downsampling factor\");\n  static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE,\n                \"filter size greater than MAX_FILTER_SIZE\");\n  static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD ||\n                                            filterMode == MODE_FUSD)),\n                \"up=1 supported only for 1x1 full filters\");\n  static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD ||\n                                              filterMode == MODE_SUFD)),\n                \"down=1 supported only for 1x1 full filters\");\n  static_assert(\n      !(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)),\n      \"full filters not supported for up=4\");\n  static_assert(\n      !(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)),\n      \"full filters not supported for down=4\");\n\n  // Static definitions.\n  typedef typename 
InternalType<T>::scalar_t scalar_t;\n  typedef typename InternalType<T>::vec2_t vec2_t;\n  typedef typename InternalType<T>::vec4_t vec4_t;\n  const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) &\n                      ~3;  // Upsampled tile width, rounded up to multiple of 4.\n  const int tileUpH =\n      tileOutH * down + (fdSize - 1) - (down - 1);  // Upsampled tile height.\n  const int tileInW =\n      CEIL_DIV(tileUpW + (fuSize - 1), up);  // Input tile width.\n  const int tileInH =\n      CEIL_DIV(tileUpH + (fuSize - 1), up);  // Input tile height.\n  const int tileUpH_up =\n      CEIL_DIV(tileUpH, up) *\n      up;  // Upsampled tile height rounded up to a multiple of up.\n  const int tileInH_up =\n      CEIL_DIV(tileUpH_up + (fuSize - 1),\n               up);  // For allocations only, to avoid shared memory read\n                     // overruns with up=2 and up=4.\n\n  // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.\n  const bool downInline =\n      (down == 1) && ((up == 1 && filterMode == MODE_FUFD) ||\n                      (up == 2 && filterMode == MODE_SUFD));\n\n  // Sizes of logical buffers.\n  const int szIn = tileInH_up * tileInW;\n  const int szUpX = tileInH_up * tileUpW;\n  const int szUpXY = downInline ? 0 : (tileUpH * tileUpW);\n  const int szDownX = tileUpH * tileOutW;\n\n  // Sizes for shared memory arrays.\n  const int s_buf0_size_base = (filterMode == MODE_SUSD)   ? MAX(szIn, szUpXY)\n                               : (filterMode == MODE_FUSD) ? MAX(szIn, szDownX)\n                               : (filterMode == MODE_SUFD) ? MAX(szIn, szUpXY)\n                               : (filterMode == MODE_FUFD) ? szIn\n                                                           : -1;\n  const int s_buf1_size_base = (filterMode == MODE_SUSD)   ? MAX(szUpX, szDownX)\n                               : (filterMode == MODE_FUSD) ? szUpXY\n                               : (filterMode == MODE_SUFD) ? 
szUpX\n                               : (filterMode == MODE_FUFD) ? szUpXY\n                                                           : -1;\n\n  // Ensure U128 alignment.\n  const int s_buf0_size = (s_buf0_size_base + 3) & ~3;\n  const int s_buf1_size = (s_buf1_size_base + 3) & ~3;\n\n  // Check at compile time that we don't use too much shared memory.\n  static_assert(\n      (s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10),\n      \"shared memory overflow\");\n\n  // Declare shared memory arrays.\n  scalar_t *s_buf0;\n  scalar_t *s_buf1;\n  if (sharedKB <= 48) {\n    // Allocate shared memory arrays here.\n    __shared__ scalar_t\n        s_buf0_st[(sharedKB > 48)\n                      ? (1 << 24)\n                      : (s_buf0_size +\n                         s_buf1_size)];  // Prevent launching if this isn't\n                                         // optimized away when unused.\n    s_buf0 = s_buf0_st;\n    s_buf1 = s_buf0 + s_buf0_size;\n  } else {\n    // Use the dynamically allocated shared memory array.\n    s_buf0 = (scalar_t *)s_buf_raw;\n    s_buf1 = s_buf0 + s_buf0_size;\n  }\n\n  // Pointers to the buffers.\n  scalar_t *\n      s_tileIn;  // Input tile:                      [relInX * tileInH + relInY]\n  scalar_t *s_tileUpX;   // After horizontal upsampling:     [relInY * tileUpW +\n                         // relUpX]\n  scalar_t *s_tileUpXY;  // After upsampling:                [relUpY * tileUpW +\n                         // relUpX]\n  scalar_t *s_tileDownX;  // After horizontal downsampling:   [relUpY * tileOutW\n                          // + relOutX]\n  if (filterMode == MODE_SUSD) {\n    s_tileIn = s_buf0;\n    s_tileUpX = s_buf1;\n    s_tileUpXY = s_buf0;\n    s_tileDownX = s_buf1;\n  } else if (filterMode == MODE_FUSD) {\n    s_tileIn = s_buf0;\n    s_tileUpXY = s_buf1;\n    s_tileDownX = s_buf0;\n  } else if (filterMode == MODE_SUFD) {\n    s_tileIn = s_buf0;\n    s_tileUpX = s_buf1;\n    s_tileUpXY = s_buf0;\n  } else 
if (filterMode == MODE_FUFD) {\n    s_tileIn = s_buf0;\n    s_tileUpXY = s_buf1;\n  }\n\n  // Allow large grids in z direction via per-launch offset.\n  int channelIdx = blockIdx.z + p.blockZofs;\n  int batchIdx = channelIdx / p.yShape.z;\n  channelIdx -= batchIdx * p.yShape.z;\n\n  // Offset to output feature map. In bytes.\n  index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) +\n                      batchIdx * get_stride<index_t>(p.yStride.w);\n\n  // Sign shift amount.\n  uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;\n\n// Inner tile loop.\n#pragma unroll 1\n  for (int tileIdx = 0;\n       !enableXrep ||\n       (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y));\n       tileIdx++) {\n    // Locate output tile.\n    int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;\n    int tileOutX = tileX * tileOutW;\n    int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;\n\n    // Locate input tile.\n    int tmpX = tileOutX * down - p.pad0.x;\n    int tmpY = tileOutY * down - p.pad0.y;\n    int tileInX = CEIL_DIV(tmpX, up);\n    int tileInY = CEIL_DIV(tmpY, up);\n    const int phaseInX = tileInX * up - tmpX;\n    const int phaseInY = tileInY * up - tmpY;\n\n    // Extra sync if input and output buffers are the same and we are not on\n    // first tile.\n    if (enableXrep && tileIdx > 0 &&\n        (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) ||\n         (filterMode == MODE_FUFD && downInline)))\n      __syncthreads();\n\n    // Load input tile & apply bias. 
Unrolled.\n    scalar_t b =\n        (scalar_t) * (const T *)((const char *)p.b +\n                                 (channelIdx * get_stride<index_t>(p.bStride)));\n    index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) +\n                       batchIdx * get_stride<index_t>(p.xStride.w);\n    int idx = threadIdx.x;\n    const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);\n#pragma unroll\n    for (int loop = 0; loop < loopCountIN; loop++) {\n      int relInX, relInY;\n      fast_div_mod<tileInW>(relInX, relInY, idx);\n      int inX = tileInX + relInX;\n      int inY = tileInY + relInY;\n      scalar_t v = 0;\n\n      if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)\n        v = (scalar_t) * ((const T *)((const char *)p.x +\n                                      (inX * get_stride<index_t>(p.xStride.x) +\n                                       inY * get_stride<index_t>(p.xStride.y) +\n                                       mapOfsIn))) +\n            b;\n\n      bool skip = (loop == loopCountIN - 1) && (idx >= tileInW * tileInH);\n      if (!skip) s_tileIn[idx] = v;\n\n      idx += threadsPerBlock;\n    }\n\n    if (filterMode == MODE_SUSD ||\n        filterMode == MODE_SUFD)  // Separable upsampling filter.\n    {\n      // Horizontal upsampling.\n      __syncthreads();\n      if (up == 4) {\n        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;\n             idx += blockDim.x * up) {\n          int relUpX0, relInY;\n          fast_div_mod<tileUpW>(relUpX0, relInY, idx);\n          int relInX0 = relUpX0 / up;\n          int src0 = relInX0 + tileInW * relInY;\n          int dst = relInY * tileUpW + relUpX0;\n          vec4_t v = InternalType<T>::zero_vec4();\n          scalar_t a = s_tileIn[src0];\n          if (phaseInX == 0) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n    
          v.y += a * (scalar_t)c_fu[step * up + 3];\n              v.z += a * (scalar_t)c_fu[step * up + 2];\n              v.w += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else if (phaseInX == 1) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n              v.z += a * (scalar_t)c_fu[step * up + 3];\n              v.w += a * (scalar_t)c_fu[step * up + 2];\n            }\n          } else if (phaseInX == 2) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 2];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n              v.z += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n              v.w += a * (scalar_t)c_fu[step * up + 3];\n            }\n          } else  // (phaseInX == 3)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 3];\n              v.y += a * (scalar_t)c_fu[step * up + 2];\n              v.z += a * (scalar_t)c_fu[step * up + 1];\n              v.w += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n            }\n          }\n          s_tileUpX[dst + 0] = v.x;\n          s_tileUpX[dst + 1] = v.y;\n          s_tileUpX[dst + 2] = v.z;\n          s_tileUpX[dst + 3] = v.w;\n        }\n      } else if (up == 2) {\n        bool p0 = (phaseInX == 0);\n        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;\n             idx += blockDim.x * up) {\n          int relUpX0, relInY;\n          fast_div_mod<tileUpW>(relUpX0, relInY, idx);\n          int relInX0 = relUpX0 / up;\n          int src0 = relInX0 + tileInW * relInY;\n          int dst = relInY * tileUpW + relUpX0;\n          vec2_t 
v = InternalType<T>::zero_vec2();\n          scalar_t a = s_tileIn[src0];\n          if (p0)  // (phaseInX == 0)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else  // (phaseInX == 1)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n            }\n          }\n          s_tileUpX[dst + 0] = v.x;\n          s_tileUpX[dst + 1] = v.y;\n        }\n      }\n\n      // Vertical upsampling & nonlinearity.\n\n      __syncthreads();\n      int groupMask = 15 << ((threadIdx.x & 31) & ~3);\n      int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH\n                          : 0;  // Skip already written signs.\n      int sShapeMaxY =\n          MIN(p.sShape.y,\n              tileOutY * down + tileUpH);  // Avoid out-of-tile sign writes.\n      if (up == 4) {\n        minY -= 3;  // Adjust according to block height.\n        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;\n             idx += blockDim.x) {\n          int relUpX, relInY0;\n          fast_div_mod<tileUpW>(relUpX, relInY0, idx);\n          int relUpY0 = relInY0 * up;\n          int src0 = relInY0 * tileUpW + relUpX;\n          int dst = relUpY0 * tileUpW + relUpX;\n          vec4_t v = InternalType<T>::zero_vec4();\n\n          scalar_t a = s_tileUpX[src0];\n          if (phaseInY == 0) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.y += a * (scalar_t)c_fu[step * up + 3];\n              v.z += a * 
(scalar_t)c_fu[step * up + 2];\n              v.w += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else if (phaseInY == 1) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.z += a * (scalar_t)c_fu[step * up + 3];\n              v.w += a * (scalar_t)c_fu[step * up + 2];\n            }\n          } else if (phaseInY == 2) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 2];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n              v.z += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.w += a * (scalar_t)c_fu[step * up + 3];\n            }\n          } else  // (phaseInY == 3)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 3];\n              v.y += a * (scalar_t)c_fu[step * up + 2];\n              v.z += a * (scalar_t)c_fu[step * up + 1];\n              v.w += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n            }\n          }\n\n          int x = tileOutX * down + relUpX;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n          int signXb = signX >> 2;\n          index_t si0 =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n          index_t si1 = si0 + p.sShape.x;\n          index_t si2 = si0 + p.sShape.x * 2;\n          index_t si3 = si0 + p.sShape.x * 3;\n\n          v.x *= (scalar_t)((float)up * (float)up * p.gain);\n          v.y *= (scalar_t)((float)up * (float)up * 
p.gain);\n          v.z *= (scalar_t)((float)up * (float)up * p.gain);\n          v.w *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write signs.\n              int sx = __float_as_uint(v.x) >> 31 << 0;\n              int sy = __float_as_uint(v.y) >> 31 << 8;\n              int sz = __float_as_uint(v.z) >> 31 << 16;\n              int sw = __float_as_uint(v.w) >> 31 << 24;\n              if (sx) v.x *= p.slope;\n              if (sy) v.y *= p.slope;\n              if (sz) v.z *= p.slope;\n              if (sw) v.w *= p.slope;\n              if (fabsf(v.x) > p.clamp) {\n                sx = 2 << 0;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n              }\n              if (fabsf(v.y) > p.clamp) {\n                sy = 2 << 8;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n              if (fabsf(v.z) > p.clamp) {\n                sz = 2 << 16;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n              }\n              if (fabsf(v.w) > p.clamp) {\n                sw = 2 << 24;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                // Combine signs.\n                uint32_t s = sx + sy + sw + sz;\n                s <<= (signX & 3) << 1;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n                if ((uint32_t)(signY 
+ 2) < sShapeMaxY) {\n                  p.s[si2] = (unsigned char)(s >> 16);\n                }\n                if ((uint32_t)(signY + 3) < sShapeMaxY) {\n                  p.s[si3] = (unsigned char)(s >> 24);\n                }\n              }\n            } else {\n              // Determine and write signs.\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                int sx = __float_as_uint(v.x) >> 31 << 0;\n                int sy = __float_as_uint(v.y) >> 31 << 8;\n                int sz = __float_as_uint(v.z) >> 31 << 16;\n                int sw = __float_as_uint(v.w) >> 31 << 24;\n                if (sx) v.x *= p.slope;\n                if (sy) v.y *= p.slope;\n                if (sz) v.z *= p.slope;\n                if (sw) v.w *= p.slope;\n                if (fabsf(v.x) > p.clamp) {\n                  sx = 2 << 0;\n                  v.x = InternalType<T>::clamp(v.x, p.clamp);\n                }\n                if (fabsf(v.y) > p.clamp) {\n                  sy = 2 << 8;\n                  v.y = InternalType<T>::clamp(v.y, p.clamp);\n                }\n                if (fabsf(v.z) > p.clamp) {\n                  sz = 2 << 16;\n                  v.z = InternalType<T>::clamp(v.z, p.clamp);\n                }\n                if (fabsf(v.w) > p.clamp) {\n                  sw = 2 << 24;\n                  v.w = InternalType<T>::clamp(v.w, p.clamp);\n                }\n\n                // Combine signs.\n                uint32_t s = sx + sy + sw + sz;\n                s <<= (signX & 3) << 1;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < 
sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n                if ((uint32_t)(signY + 2) < sShapeMaxY) {\n                  p.s[si2] = (unsigned char)(s >> 16);\n                }\n                if ((uint32_t)(signY + 3) < sShapeMaxY) {\n                  p.s[si3] = (unsigned char)(s >> 24);\n                }\n              } else {\n                // Just compute the values.\n                if (v.x < 0.f) v.x *= p.slope;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n                if (v.y < 0.f) v.y *= p.slope;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n                if (v.z < 0.f) v.z *= p.slope;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n                if (v.w < 0.f) v.w *= p.slope;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n            }\n          } else if (signRead)  // Read signs and apply.\n          {\n            if ((uint32_t)signXb < p.swLimit) {\n              int ss = (signX & 3) << 1;\n              if ((uint32_t)(signY + 0) < p.sShape.y) {\n                int s = p.s[si0] >> ss;\n                if (s & 1) v.x *= p.slope;\n                if (s & 2) v.x = 0.f;\n              }\n              if ((uint32_t)(signY + 1) < p.sShape.y) {\n                int s = p.s[si1] >> ss;\n                if (s & 1) v.y *= p.slope;\n                if (s & 2) v.y = 0.f;\n              }\n              if ((uint32_t)(signY + 2) < p.sShape.y) {\n                int s = p.s[si2] >> ss;\n                if (s & 1) v.z *= p.slope;\n                if (s & 2) v.z = 0.f;\n              }\n              if ((uint32_t)(signY + 3) < p.sShape.y) {\n                int s = p.s[si3] >> ss;\n                if (s & 1) v.w *= p.slope;\n                if (s & 2) v.w = 0.f;\n              }\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v.x < 0.f) v.x *= p.slope;\n            v.x = 
InternalType<T>::clamp(v.x, p.clamp);\n            if (v.y < 0.f) v.y *= p.slope;\n            v.y = InternalType<T>::clamp(v.y, p.clamp);\n            if (v.z < 0.f) v.z *= p.slope;\n            v.z = InternalType<T>::clamp(v.z, p.clamp);\n            if (v.w < 0.f) v.w *= p.slope;\n            v.w = InternalType<T>::clamp(v.w, p.clamp);\n          }\n\n          s_tileUpXY[dst + 0 * tileUpW] = v.x;\n          if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;\n          if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;\n          if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;\n        }\n      } else if (up == 2) {\n        minY -= 1;  // Adjust according to block height.\n        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;\n             idx += blockDim.x) {\n          int relUpX, relInY0;\n          fast_div_mod<tileUpW>(relUpX, relInY0, idx);\n          int relUpY0 = relInY0 * up;\n          int src0 = relInY0 * tileUpW + relUpX;\n          int dst = relUpY0 * tileUpW + relUpX;\n          vec2_t v = InternalType<T>::zero_vec2();\n\n          scalar_t a = s_tileUpX[src0];\n          if (phaseInY == 0) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else  // (phaseInY == 1)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n            }\n          }\n\n          int x = tileOutX * down + relUpX;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n  
        int signXb = signX >> 2;\n          index_t si0 =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n          index_t si1 = si0 + p.sShape.x;\n\n          v.x *= (scalar_t)((float)up * (float)up * p.gain);\n          v.y *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write signs.\n              int sx = __float_as_uint(v.x) >> 31 << 0;\n              int sy = __float_as_uint(v.y) >> 31 << 8;\n              if (sx) v.x *= p.slope;\n              if (sy) v.y *= p.slope;\n              if (fabsf(v.x) > p.clamp) {\n                sx = 2 << 0;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n              }\n              if (fabsf(v.y) > p.clamp) {\n                sy = 2 << 8;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                // Combine signs.\n                int s = sx + sy;\n                s <<= signXo;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n              }\n            } else {\n              // Determine and write signs.\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                int sx = __float_as_uint(v.x) >> 31 << 0;\n                int sy = __float_as_uint(v.y) >> 31 << 8;\n                if (sx) v.x *= p.slope;\n                if (sy) v.y *= p.slope;\n                if (fabsf(v.x) > 
p.clamp) {\n                  sx = 2 << 0;\n                  v.x = InternalType<T>::clamp(v.x, p.clamp);\n                }\n                if (fabsf(v.y) > p.clamp) {\n                  sy = 2 << 8;\n                  v.y = InternalType<T>::clamp(v.y, p.clamp);\n                }\n\n                // Combine signs.\n                int s = sx + sy;\n                s <<= signXo;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n              } else {\n                // Just compute the values.\n                if (v.x < 0.f) v.x *= p.slope;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n                if (v.y < 0.f) v.y *= p.slope;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n            }\n          } else if (signRead)  // Read signs and apply.\n          {\n            if ((uint32_t)signXb < p.swLimit) {\n              if ((uint32_t)(signY + 0) < p.sShape.y) {\n                int s = p.s[si0] >> signXo;\n                if (s & 1) v.x *= p.slope;\n                if (s & 2) v.x = 0.f;\n              }\n              if ((uint32_t)(signY + 1) < p.sShape.y) {\n                int s = p.s[si1] >> signXo;\n                if (s & 1) v.y *= p.slope;\n                if (s & 2) v.y = 0.f;\n              }\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v.x < 0.f) v.x *= p.slope;\n            v.x = InternalType<T>::clamp(v.x, p.clamp);\n            if (v.y < 0.f) v.y *= p.slope;\n            
v.y = InternalType<T>::clamp(v.y, p.clamp);\n          }\n\n          if (!downInline) {\n            // Write into temporary buffer.\n            s_tileUpXY[dst] = v.x;\n            if (relUpY0 < tileUpH - 1) s_tileUpXY[dst + tileUpW] = v.y;\n          } else {\n            // Write directly into output buffer.\n            if ((uint32_t)x < p.yShape.x) {\n              int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);\n              index_t ofs = x * get_stride<index_t>(p.yStride.x) +\n                            y * get_stride<index_t>(p.yStride.y) + mapOfsOut;\n              if ((uint32_t)y + 0 < p.yShape.y)\n                *((T *)((char *)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);\n              if ((uint32_t)y + 1 < ymax)\n                *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.y))) =\n                    (T)(v.y * (scalar_t)c_fd[0]);\n            }\n          }\n        }\n      }\n    } else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD) {\n      // Full upsampling filter.\n\n      if (up == 2) {\n        // 2 x 2-wide.\n        __syncthreads();\n        int minY = tileOutY ? 
(tileOutY - tileOutH) * down + tileUpH + p.sOfs.y\n                            : 0;  // Skip already written signs.\n        for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH;\n             idx += blockDim.x * 4) {\n          int relUpX0, relUpY0;\n          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);\n          int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);\n          int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);\n          int src0 = relInX0 + tileInW * relInY0;\n          int tap0y = (relInY0 * up + phaseInY - relUpY0);\n\n#define X_LOOP(TAPY, PX)                                             \\\n  for (int sx = 0; sx < fuSize / up; sx++) {                         \\\n    v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) +  \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) +  \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    if ((PX) == 0) {                                                 \\\n      a = b;                                                         \\\n      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \\\n    }                                                                \\\n    v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) +  \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) +  \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    if ((PX) == 1) {                                                 \\\n      a = b;                                                         \\\n      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \\\n    }                                                                \\\n  }\n\n          vec4_t v = InternalType<T>::zero_vec4();\n          if (tap0y == 0 && phaseInX == 0)\n#pragma unroll\n            for (int sy 
= 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(0, 0)\n            }\n          if (tap0y == 0 && phaseInX == 1)\n#pragma unroll\n            for (int sy = 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(0, 1)\n            }\n          if (tap0y == 1 && phaseInX == 0)\n#pragma unroll\n            for (int sy = 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(1, 0)\n            }\n          if (tap0y == 1 && phaseInX == 1)\n#pragma unroll\n            for (int sy = 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(1, 1)\n            }\n\n#undef X_LOOP\n\n          int x = tileOutX * down + relUpX0;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n          int signXb = signX >> 2;\n          index_t si =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n\n          v.x *= (scalar_t)((float)up * (float)up * p.gain);\n          v.y *= (scalar_t)((float)up * (float)up * p.gain);\n          v.z *= (scalar_t)((float)up * (float)up * p.gain);\n          v.w *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write signs.\n              int sx = __float_as_uint(v.x) >> 31;\n              int sy = __float_as_uint(v.y) >> 31;\n              int sz = __float_as_uint(v.z) >> 31;\n        
      int sw = __float_as_uint(v.w) >> 31;\n              if (sx) v.x *= p.slope;\n              if (fabsf(v.x) > p.clamp) {\n                sx = 2;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n              }\n              if (sy) v.y *= p.slope;\n              if (fabsf(v.y) > p.clamp) {\n                sy = 2;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n              if (sz) v.z *= p.slope;\n              if (fabsf(v.z) > p.clamp) {\n                sz = 2;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n              }\n              if (sw) v.w *= p.slope;\n              if (fabsf(v.w) > p.clamp) {\n                sw = 2;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);\n              }\n            } else {\n              // Determine and write signs.\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n                int sx = __float_as_uint(v.x) >> 31;\n                int sy = __float_as_uint(v.y) >> 31;\n                int sz = __float_as_uint(v.z) >> 31;\n                int sw = __float_as_uint(v.w) >> 31;\n                if (sx) v.x *= p.slope;\n                if (fabsf(v.x) > p.clamp) {\n                  sx = 2;\n                  v.x = InternalType<T>::clamp(v.x, p.clamp);\n                }\n                if (sy) v.y *= p.slope;\n                if (fabsf(v.y) > p.clamp) {\n                  sy = 2;\n                  v.y = InternalType<T>::clamp(v.y, p.clamp);\n                }\n                if (sz) v.z *= p.slope;\n                if (fabsf(v.z) > p.clamp) {\n                  sz = 2;\n                  v.z = InternalType<T>::clamp(v.z, p.clamp);\n                }\n                if (sw) 
v.w *= p.slope;\n                if (fabsf(v.w) > p.clamp) {\n                  sw = 2;\n                  v.w = InternalType<T>::clamp(v.w, p.clamp);\n                }\n\n                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);\n              } else {\n                // Just compute the values.\n                if (v.x < 0.f) v.x *= p.slope;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n                if (v.y < 0.f) v.y *= p.slope;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n                if (v.z < 0.f) v.z *= p.slope;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n                if (v.w < 0.f) v.w *= p.slope;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n            }\n          } else if (signRead)  // Read sign and apply.\n          {\n            if ((uint32_t)signY < p.sShape.y) {\n              int s = 0;\n              if ((uint32_t)signXb < p.swLimit) s = p.s[si];\n              if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;\n              s >>= (signX & 3) << 1;\n              if (s & 0x01) v.x *= p.slope;\n              if (s & 0x02) v.x = 0.f;\n              if (s & 0x04) v.y *= p.slope;\n              if (s & 0x08) v.y = 0.f;\n              if (s & 0x10) v.z *= p.slope;\n              if (s & 0x20) v.z = 0.f;\n              if (s & 0x40) v.w *= p.slope;\n              if (s & 0x80) v.w = 0.f;\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v.x < 0.f) v.x *= p.slope;\n            v.x = InternalType<T>::clamp(v.x, p.clamp);\n            if (v.y < 0.f) v.y *= p.slope;\n            v.y = InternalType<T>::clamp(v.y, p.clamp);\n            if (v.z < 0.f) v.z *= p.slope;\n            v.z = InternalType<T>::clamp(v.z, p.clamp);\n            if (v.w < 0.f) v.w *= p.slope;\n            v.w = InternalType<T>::clamp(v.w, p.clamp);\n          }\n\n          s_tileUpXY[idx + 0] = v.x;\n          
s_tileUpXY[idx + 1] = v.y;\n          s_tileUpXY[idx + 2] = v.z;\n          s_tileUpXY[idx + 3] = v.w;\n        }\n      } else if (up == 1) {\n        __syncthreads();\n        uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);\n        int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH\n                            : 0;  // Skip already written signs.\n        for (int idx = threadIdx.x; idx < tileUpW * tileUpH;\n             idx += blockDim.x) {\n          int relUpX0, relUpY0;\n          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);\n          scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0];  // 1x1 filter.\n\n          int x = tileOutX * down + relUpX0;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n          int signXb = signX >> 2;\n          index_t si =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n          v *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write sign.\n              uint32_t s = 0;\n              uint32_t signXbit = (1u << signXo);\n              if (v < 0.f) {\n                s = signXbit;\n                v *= p.slope;\n              }\n              if (fabsf(v) > p.clamp) {\n                s = signXbit * 2;\n                v = InternalType<T>::clamp(v, p.clamp);\n              }\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n#ifdef MMCV_WITH_HIP\n                s += __shfl_xor(s, 1);  // Coalesce.\n                s += __shfl_xor(s, 2);  // Coalesce.\n#else\n                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.\n                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.\n#endif\n                p.s[si] = s;  // Write.\n              }\n            } else {\n              // 
Determine and write sign.\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n                uint32_t s = 0;\n                uint32_t signXbit = (1u << signXo);\n                if (v < 0.f) {\n                  s = signXbit;\n                  v *= p.slope;\n                }\n                if (fabsf(v) > p.clamp) {\n                  s = signXbit * 2;\n                  v = InternalType<T>::clamp(v, p.clamp);\n                }\n#ifdef MMCV_WITH_HIP\n                s += __shfl_xor(s, 1);  // Coalesce.\n                s += __shfl_xor(s, 2);  // Coalesce.\n#else\n                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.\n                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.\n#endif\n                p.s[si] = s;  // Write.\n              } else {\n                // Just compute the value.\n                if (v < 0.f) v *= p.slope;\n                v = InternalType<T>::clamp(v, p.clamp);\n              }\n            }\n          } else if (signRead) {\n            // Read sign and apply if within sign tensor bounds.\n            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y) {\n              int s = p.s[si];\n              s >>= signXo;\n              if (s & 1) v *= p.slope;\n              if (s & 2) v = 0.f;\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v < 0.f) v *= p.slope;\n            v = InternalType<T>::clamp(v, p.clamp);\n          }\n\n          if (!downInline)  // Write into temporary buffer.\n            s_tileUpXY[idx] = v;\n          else if ((uint32_t)x < p.yShape.x &&\n                   (uint32_t)y <\n                       p.yShape.y)  // Write directly into output buffer\n            *((T *)((char *)p.y + (x * get_stride<index_t>(p.yStride.x) +\n                                   y * get_stride<index_t>(p.yStride.y) +\n                                   mapOfsOut))) = (T)(v 
* (scalar_t)c_fd[0]);\n        }\n      }\n    }\n\n    // Downsampling.\n    if (filterMode == MODE_SUSD || filterMode == MODE_FUSD) {\n      // Horizontal downsampling.\n      __syncthreads();\n      if (down == 4 && tileOutW % 4 == 0) {\n        // Calculate 4 pixels at a time.\n        for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH;\n             idx += blockDim.x * 4) {\n          int relOutX0, relUpY;\n          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);\n          int relUpX0 = relOutX0 * down;\n          int src0 = relUpY * tileUpW + relUpX0;\n          vec4_t v = InternalType<T>::zero_vec4();\n#pragma unroll\n          for (int step = 0; step < fdSize; step++) {\n            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];\n            v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step];\n            v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step];\n            v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];\n          }\n          s_tileDownX[idx + 0] = v.x;\n          s_tileDownX[idx + 1] = v.y;\n          s_tileDownX[idx + 2] = v.z;\n          s_tileDownX[idx + 3] = v.w;\n        }\n      } else if ((down == 2 || down == 4) && (tileOutW % 2 == 0)) {\n        // Calculate 2 pixels at a time.\n        for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH;\n             idx += blockDim.x * 2) {\n          int relOutX0, relUpY;\n          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);\n          int relUpX0 = relOutX0 * down;\n          int src0 = relUpY * tileUpW + relUpX0;\n          vec2_t v = InternalType<T>::zero_vec2();\n#pragma unroll\n          for (int step = 0; step < fdSize; step++) {\n            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];\n            v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];\n          }\n          s_tileDownX[idx + 0] = v.x;\n          s_tileDownX[idx + 1] = v.y;\n        }\n      } else {\n        // Calculate 1 pixel at a time.\n       
 for (int idx = threadIdx.x; idx < tileOutW * tileUpH;\n             idx += blockDim.x) {\n          int relOutX0, relUpY;\n          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);\n          int relUpX0 = relOutX0 * down;\n          int src = relUpY * tileUpW + relUpX0;\n          scalar_t v = 0.f;\n#pragma unroll\n          for (int step = 0; step < fdSize; step++)\n            v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];\n          s_tileDownX[idx] = v;\n        }\n      }\n\n      // Vertical downsampling & store output tile.\n      __syncthreads();\n      for (int idx = threadIdx.x; idx < tileOutW * tileOutH;\n           idx += blockDim.x) {\n        int relOutX, relOutY0;\n        fast_div_mod<tileOutW>(relOutX, relOutY0, idx);\n        int relUpY0 = relOutY0 * down;\n        int src0 = relUpY0 * tileOutW + relOutX;\n        scalar_t v = 0;\n#pragma unroll\n        for (int step = 0; step < fdSize; step++)\n          v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];\n\n        int outX = tileOutX + relOutX;\n        int outY = tileOutY + relOutY0;\n\n        if (outX < p.yShape.x & outY < p.yShape.y)\n          *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +\n                                 outY * get_stride<index_t>(p.yStride.y) +\n                                 mapOfsOut))) = (T)v;\n      }\n    } else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD) {\n      // Full downsampling filter.\n      if (down == 2) {\n        // 2-wide.\n        __syncthreads();\n        for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH;\n             idx += blockDim.x * 2) {\n          int relOutX0, relOutY0;\n          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);\n          int relUpX0 = relOutX0 * down;\n          int relUpY0 = relOutY0 * down;\n          int src0 = relUpY0 * tileUpW + relUpX0;\n          vec2_t v = InternalType<T>::zero_vec2();\n#pragma unroll\n          for (int sy = 0; sy < fdSize; sy++)\n#pragma 
unroll\n            for (int sx = 0; sx < fdSize; sx++) {\n              v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] *\n                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];\n              v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] *\n                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];\n            }\n\n          int outX = tileOutX + relOutX0;\n          int outY = tileOutY + relOutY0;\n          if ((uint32_t)outY < p.yShape.y) {\n            index_t ofs = outX * get_stride<index_t>(p.yStride.x) +\n                          outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;\n            if (outX + 0 < p.yShape.x) *((T *)((char *)p.y + ofs)) = (T)v.x;\n            if (outX + 1 < p.yShape.x)\n              *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.x))) =\n                  (T)v.y;\n          }\n        }\n      } else if (down == 1 && !downInline) {\n        // Thread per pixel.\n        __syncthreads();\n        for (int idx = threadIdx.x; idx < tileOutW * tileOutH;\n             idx += blockDim.x) {\n          int relOutX0, relOutY0;\n          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);\n          scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0];  // 1x1 filter.\n\n          int outX = tileOutX + relOutX0;\n          int outY = tileOutY + relOutY0;\n          if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)\n            *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +\n                                   outY * get_stride<index_t>(p.yStride.y) +\n                                   mapOfsOut))) = (T)v;\n        }\n      }\n    }\n\n    if (!enableXrep) break;\n  }\n}\n\n//------------------------------------------------------------------------\n// Compute activation function and signs for upsampled data tensor, modifying\n// data tensor in-place. Used for accelerating the generic variant. 
Sign tensor\n// is known to be contiguous, and p.x and p.s have the same z, w dimensions.\n// 64-bit indexing is always used.\n\ntemplate <class T, bool signWrite, bool signRead>\nstatic __global__ void filtered_lrelu_act_kernel(\n    filtered_lrelu_act_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n\n  // Indexing.\n  int32_t x = threadIdx.x + blockIdx.x * blockDim.x;\n  int32_t ymax = signWrite ? p.sShape.y : p.xShape.y;\n  int32_t qmax =\n      p.xShape.z * p.xShape.w;  // Combined minibatch*channel maximum index.\n\n  // Loop to accommodate oversized tensors.\n  for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z)\n    for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y) {\n      // Extract z and w (channel, minibatch index).\n      int32_t w = q / p.xShape.z;\n      int32_t z = q - w * p.xShape.z;\n\n      // Choose behavior based on sign read/write mode.\n      if (signWrite) {\n        // Process value if in p.x.\n        uint32_t s = 0;\n        if (x < p.xShape.x && y < p.xShape.y) {\n          int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z +\n                       w * p.xStride.w;\n          T *pv = ((T *)p.x) + ix;\n          scalar_t v = (scalar_t)(*pv);\n\n          // Gain, LReLU, clamp.\n          v *= p.gain;\n          if (v < 0.f) {\n            v *= p.slope;\n            s = 1;  // Sign.\n          }\n          if (fabsf(v) > p.clamp) {\n            v = InternalType<T>::clamp(v, p.clamp);\n            s = 2;  // Clamp.\n          }\n\n          *pv = (T)v;  // Write value.\n        }\n\n        // Coalesce into threads 0 and 16 of warp.\n        uint32_t m = (threadIdx.x & 16) ? 
0xffff0000u : 0x0000ffffu;\n        s <<= ((threadIdx.x & 15) << 1);  // Shift into place.\n#ifdef MMCV_WITH_HIP\n        s |= __shfl_xor(s, 1);  // Distribute.\n        s |= __shfl_xor(s, 2);\n        s |= __shfl_xor(s, 4);\n        s |= __shfl_xor(s, 8);\n#else\n        s |= __shfl_xor_sync(m, s, 1);  // Distribute.\n        s |= __shfl_xor_sync(m, s, 2);\n        s |= __shfl_xor_sync(m, s, 4);\n        s |= __shfl_xor_sync(m, s, 8);\n#endif\n\n        // Write signs if leader and in p.s.\n        if (!(threadIdx.x & 15) && x < p.sShape.x)  // y is always in.\n        {\n          uint64_t is =\n              x + p.sShape.x * (y + (int64_t)p.sShape.y * q);  // Contiguous.\n          ((uint32_t *)p.s)[is >> 4] = s;\n        }\n      } else if (signRead) {\n        // Process value if in p.x.\n        if (x < p.xShape.x)  // y is always in.\n        {\n          int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z +\n                       w * p.xStride.w;\n          T *pv = ((T *)p.x) + ix;\n          scalar_t v = (scalar_t)(*pv);\n          v *= p.gain;\n\n          // Apply sign buffer offset.\n          uint32_t sx = x + p.sOfs.x;\n          uint32_t sy = y + p.sOfs.y;\n\n          // Read and apply signs if we land inside valid region of sign buffer.\n          if (sx < p.sShape.x && sy < p.sShape.y) {\n            uint64_t is =\n                (sx >> 2) + (p.sShape.x >> 2) *\n                                (sy + (uint64_t)p.sShape.y * q);  // Contiguous.\n            unsigned char s = p.s[is];\n            s >>= (sx & 3) << 1;  // Shift into place.\n            if (s & 1)            // Sign?\n              v *= p.slope;\n            if (s & 2)  // Clamp?\n              v = 0.f;\n          }\n\n          *pv = (T)v;  // Write value.\n        }\n      } else {\n        // Forward pass with no sign write. 
Process value if in p.x.\n        if (x < p.xShape.x)  // y is always in.\n        {\n          int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z +\n                       w * p.xStride.w;\n          T *pv = ((T *)p.x) + ix;\n          scalar_t v = (scalar_t)(*pv);\n          v *= p.gain;\n          if (v < 0.f) v *= p.slope;\n          if (fabsf(v) > p.clamp) v = InternalType<T>::clamp(v, p.clamp);\n          *pv = (T)v;  // Write value.\n        }\n      }\n    }\n}\n\ntemplate <class T, bool signWrite, bool signRead>\nvoid *choose_filtered_lrelu_act_kernel(void) {\n  return (void *)filtered_lrelu_act_kernel<T, signWrite, signRead>;\n}\n\n//------------------------------------------------------------------------\n// CUDA kernel selection.\n\ntemplate <class T, class index_t, bool signWrite, bool signRead>\nfiltered_lrelu_kernel_spec choose_filtered_lrelu_kernel(\n    const filtered_lrelu_kernel_params &p, int sharedKB) {\n  filtered_lrelu_kernel_spec s = {0};\n\n  // Return the first matching kernel.\n#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS)                        \\\n  if (sharedKB >= SH)                                                          \\\n    if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) ||      \\\n        (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD)))         \\\n      if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) ||    \\\n          (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD)))       \\\n        if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU &&             \\\n            p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) {           \\\n          static_assert((D * TW % 4) == 0,                                     \\\n                        \"down * tileWidth must be divisible by 4\");            \\\n          static_assert(                                                       \\\n              FU % U == 0,                         
                            \\\n              \"upscaling filter size must be multiple of upscaling factor\");   \\\n          static_assert(FD % D == 0,                                           \\\n                        \"downscaling filter size must be multiple of \"         \\\n                        \"downscaling factor\");                                 \\\n          s.setup = (void *)setup_filters_kernel;                              \\\n          s.exec = (void *)                                                    \\\n              filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, \\\n                                    U, FU, D, FD, TW, TH, W * 32, !!XR, !!WS>; \\\n          s.tileOut = make_int2(TW, TH);                                       \\\n          s.numWarps = W;                                                      \\\n          s.xrep = XR;                                                         \\\n          s.dynamicSharedKB = (SH == 48) ? 0 : SH;                             \\\n          return s;                                                            \\\n        }\n\n  // Launch parameters for various kernel specializations.\n  // Small filters must be listed before large filters, otherwise the kernel for\n  // larger filter will always match first. 
Kernels that use more shared memory\n  // must be listed before those that use less, for the same reason.\n\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 1, 1, /*mode*/ MODE_FUFD,\n       /*tw,th,warps,xrep,wskip*/ 64, 178, 32, 0, 0)  // 1t-upf1-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 152, 95, 16, 0, 0)  // 4t-ups2-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 22, 16, 0, 0)  // 4t-upf1-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 29, 16, 11, 0)  // 4t-ups2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 60, 28, 16, 0, 0)  // 4t-upf2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 56, 28, 16, 0, 0)  // 4t-ups2-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 31, 16, 11, 0)  // 4t-ups4-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 56, 36, 16, 0, 0)  // 4t-ups4-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 16, 22, 16, 12, 0)  // 4t-ups2-downs4\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 29, 15, 16, 0, 0)  // 4t-upf2-downs4\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 96, 150, 28, 0, 0)  // 6t-ups2-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 35, 24, 0, 0)  // 6t-upf1-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, 
/*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 46, 16, 10, 0)  // 6t-ups2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 58, 28, 24, 8, 0)  // 6t-upf2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 52, 28, 16, 0, 0)  // 6t-ups2-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 51, 16, 5, 0)  // 6t-ups4-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 32, 56, 16, 6, 0)  // 6t-ups4-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 16, 18, 16, 12, 0)  // 6t-ups2-downs4\n  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 27, 31, 32, 6, 0)  // 6t-upf2-downs4 96kB\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 27, 13, 24, 0, 0)  // 6t-upf2-downs4\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 148, 89, 24, 0, 0)  // 8t-ups2-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 31, 16, 5, 0)  // 8t-upf1-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 41, 16, 9, 0)  // 8t-ups2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 26, 24, 0, 0)  // 8t-upf2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 32, 40, 16, 0, 0)  // 8t-ups2-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, 
/*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 46, 24, 5, 0)  // 8t-ups4-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 32, 50, 16, 0, 0)  // 8t-ups4-downf2\n  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 24, 24, 32, 12, 1)  // 8t-ups2-downs4 96kB\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 16, 13, 16, 10, 1)  // 8t-ups2-downs4\n  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 25, 28, 28, 4, 0)  // 8t-upf2-downs4 96kB\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 25, 10, 24, 0, 0)  // 8t-upf2-downs4\n\n#undef CASE\n  return s;  // No kernel found.\n}\n\n//------------------------------------------------------------------------\n\n#define BUILD_FILTERED_LRELU_OP 1\n\n#ifndef MMCV_WITH_HIP\n#ifdef __GNUC__\n#if __GNUC__ < 6\n#undef BUILD_FILTERED_LRELU_OP\n#define BUILD_FILTERED_LRELU_OP 0\n#endif\n#endif\n\n#if CUDA_VERSION < 10020\n#undef BUILD_FILTERED_LRELU_OP\n#define BUILD_FILTERED_LRELU_OP 0\n#endif\n#endif\n\n#if BUILD_FILTERED_LRELU_OP == 1\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns) {\n  // Set CUDA device.\n  TORCH_CHECK(x.is_cuda(), \"x must reside on CUDA device\");\n  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));\n\n  // Validate arguments.\n  TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() &&\n                  b.device() == x.device(),\n              \"all input tensors must reside on the 
same device\");\n  TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat,\n              \"fu and fd must be float32\");\n  TORCH_CHECK(b.dtype() == x.dtype(), \"x and b must have the same dtype\");\n  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat,\n              \"x and b must be float16 or float32\");\n  TORCH_CHECK(x.dim() == 4, \"x must be rank 4\");\n  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&\n                  x.size(3) <= INT_MAX,\n              \"x is too large\");\n  TORCH_CHECK(x.numel() > 0, \"x is empty\");\n  TORCH_CHECK(\n      (fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2),\n      \"fu and fd must be rank 1 or 2\");\n  TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX,\n              \"fu is too large\");\n  TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX,\n              \"fd is too large\");\n  TORCH_CHECK(fu.numel() > 0, \"fu is empty\");\n  TORCH_CHECK(fd.numel() > 0, \"fd is empty\");\n  TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1),\n              \"b must be a vector with the same number of channels as x\");\n  TORCH_CHECK(up >= 1 && down >= 1, \"up and down must be at least 1\");\n\n  // Figure out how much shared memory is available on the device.\n  int maxSharedBytes = 0;\n#ifdef MMCV_WITH_HIP\n  cudaDeviceGetAttribute(&maxSharedBytes,\n                         hipDeviceAttributeMaxSharedMemoryPerBlock,\n                         x.device().index());\n#else\n  AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes,\n                                       cudaDevAttrMaxSharedMemoryPerBlockOptin,\n                                       x.device().index()));\n#endif\n  int sharedKB = maxSharedBytes >> 10;\n\n  // Populate enough launch parameters to check if a CUDA kernel exists.\n  filtered_lrelu_kernel_params p;\n  p.up = up;\n  p.down = down;\n  p.fuShape =\n      make_int2((int)fu.size(-1),\n                fu.dim() == 2 ? 
(int)fu.size(0)\n                              : 0);  // shape [n, 0] indicates separable filter.\n  p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);\n  filtered_lrelu_kernel_spec test_spec =\n      choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);\n  if (!test_spec.exec) {\n    // No kernel found - return empty tensors and indicate missing kernel with\n    // return code of -1.\n    return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);\n  }\n\n  // Input/output element size.\n  int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;\n\n  // Input sizes.\n  int64_t xw = (int)x.size(3);\n  int64_t xh = (int)x.size(2);\n  int64_t fut_w = (int)fu.size(-1) - 1;\n  int64_t fut_h = (int)fu.size(0) - 1;\n  int64_t fdt_w = (int)fd.size(-1) - 1;\n  int64_t fdt_h = (int)fd.size(0) - 1;\n\n  // Logical size of upsampled buffer.\n  int64_t cw = xw * up + (px0 + px1) - fut_w;\n  int64_t ch = xh * up + (py0 + py1) - fut_h;\n  TORCH_CHECK(\n      cw > fdt_w && ch > fdt_h,\n      \"upsampled buffer must be at least the size of downsampling filter\");\n  TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, \"upsampled buffer is too large\");\n\n  // Compute output size and allocate.\n  int64_t yw = (cw - fdt_w + (down - 1)) / down;\n  int64_t yh = (ch - fdt_h + (down - 1)) / down;\n  TORCH_CHECK(yw > 0 && yh > 0, \"output must be at least 1x1\");\n  TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, \"output is too large\");\n  torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(),\n                                 x.suggest_memory_format());\n\n  // Allocate sign tensor.\n  torch::Tensor so;\n  torch::Tensor s = si;\n  bool readSigns = !!s.numel();\n  int64_t sw_active = 0;  // Active width of sign tensor.\n  if (writeSigns) {\n    sw_active = yw * down - (down - 1) + fdt_w;   // Active width in elements.\n    int64_t sh = yh * down - (down - 1) + fdt_h;  // Height = active height.\n    int64_t sw = (sw_active + 15) & ~15; 
 // Width  = active width in elements,\n                                          // rounded up to multiple of 16.\n    TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, \"signs is too large\");\n    s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2},\n                          x.options().dtype(torch::kUInt8),\n                          at::MemoryFormat::Contiguous);\n  } else if (readSigns)\n    sw_active = s.size(3) << 2;\n\n  // Validate sign tensor if in use.\n  if (readSigns || writeSigns) {\n    TORCH_CHECK(s.is_contiguous(), \"signs must be contiguous\");\n    TORCH_CHECK(s.dtype() == torch::kUInt8, \"signs must be uint8\");\n    TORCH_CHECK(s.device() == x.device(),\n                \"signs must reside on the same device as x\");\n    TORCH_CHECK(s.dim() == 4, \"signs must be rank 4\");\n    TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1),\n                \"signs must have same batch & channels as x\");\n    TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX,\n                \"signs is too large\");\n  }\n\n  // Populate rest of CUDA kernel parameters.\n  p.x = x.data_ptr();\n  p.y = y.data_ptr();\n  p.b = b.data_ptr();\n  p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;\n  p.fu = fu.data_ptr<float>();\n  p.fd = fd.data_ptr<float>();\n  p.pad0 = make_int2(px0, py0);\n  p.gain = gain;\n  p.slope = slope;\n  p.clamp = clamp;\n  p.flip = (flip_filters) ? 1 : 0;\n  p.xShape =\n      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));\n  p.yShape =\n      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));\n  p.sShape = (readSigns || writeSigns)\n                 ? make_int2((int)s.size(3), (int)s.size(2))\n                 : make_int2(0, 0);  // Width is in bytes. 
Contiguous.\n  p.sOfs = make_int2(sx, sy);\n  p.swLimit = (sw_active + 3) >> 2;  // Rounded up to bytes.\n\n  // x, y, b strides are in bytes.\n  p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2),\n                             sz * x.stride(1), sz * x.stride(0));\n  p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2),\n                             sz * y.stride(1), sz * y.stride(0));\n  p.bStride = sz * b.stride(0);\n\n  // fu, fd strides are in elements.\n  p.fuStride =\n      make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);\n  p.fdStride =\n      make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);\n\n  // Determine if indices don't fit in int32. Support negative strides although\n  // Torch currently never produces those.\n  bool index64b = false;\n  if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;\n  if (std::min(x.size(0) * p.xStride.w, 0ll) +\n          std::min(x.size(1) * p.xStride.z, 0ll) +\n          std::min(x.size(2) * p.xStride.y, 0ll) +\n          std::min(x.size(3) * p.xStride.x, 0ll) <\n      -INT_MAX)\n    index64b = true;\n  if (std::max(x.size(0) * p.xStride.w, 0ll) +\n          std::max(x.size(1) * p.xStride.z, 0ll) +\n          std::max(x.size(2) * p.xStride.y, 0ll) +\n          std::max(x.size(3) * p.xStride.x, 0ll) >\n      INT_MAX)\n    index64b = true;\n  if (std::min(y.size(0) * p.yStride.w, 0ll) +\n          std::min(y.size(1) * p.yStride.z, 0ll) +\n          std::min(y.size(2) * p.yStride.y, 0ll) +\n          std::min(y.size(3) * p.yStride.x, 0ll) <\n      -INT_MAX)\n    index64b = true;\n  if (std::max(y.size(0) * p.yStride.w, 0ll) +\n          std::max(y.size(1) * p.yStride.z, 0ll) +\n          std::max(y.size(2) * p.yStride.y, 0ll) +\n          std::max(y.size(3) * p.yStride.x, 0ll) >\n      INT_MAX)\n    index64b = true;\n  if (s.numel() > INT_MAX) index64b = true;\n\n  // Choose CUDA kernel.\n  filtered_lrelu_kernel_spec spec = {0};\n  
AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      x.scalar_type(), \"filtered_lrelu_cuda\", [&] {\n        if constexpr (sizeof(scalar_t) <=\n                      4)  // Exclude doubles. constexpr\n                          // prevents template instantiation.\n        {\n          // Choose kernel based on index type, datatype and sign read/write\n          // modes.\n          if (!index64b && writeSigns && !readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true, false>(\n                p, sharedKB);\n          else if (!index64b && !writeSigns && readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true>(\n                p, sharedKB);\n          else if (!index64b && !writeSigns && !readSigns)\n            spec =\n                choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(\n                    p, sharedKB);\n          else if (index64b && writeSigns && !readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true, false>(\n                p, sharedKB);\n          else if (index64b && !writeSigns && readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true>(\n                p, sharedKB);\n          else if (index64b && !writeSigns && !readSigns)\n            spec =\n                choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(\n                    p, sharedKB);\n        }\n      });\n  TORCH_CHECK(\n      spec.exec,\n      \"internal error - CUDA kernel not found\")  // This should not happen\n                                                 // because we tested earlier\n                                                 // that kernel exists.\n\n  // Launch CUDA kernel.\n  void *args[] = {&p};\n  int bx = spec.numWarps * 32;\n  int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;\n  int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;\n  int gz = p.yShape.z * p.yShape.w;\n\n  // Repeat multiple horizontal tiles in a CTA?\n  
if (spec.xrep) {\n    p.tilesXrep = spec.xrep;\n    p.tilesXdim = gx;\n\n    gx = (gx + p.tilesXrep - 1) / p.tilesXrep;\n    std::swap(gx, gy);\n  } else {\n    p.tilesXrep = 0;\n    p.tilesXdim = 0;\n  }\n#ifdef MMCV_WITH_HIP\n  AT_CUDA_CHECK(hipLaunchKernel(spec.setup, 1, 1024, args, 0,\n                                at::cuda::getCurrentCUDAStream()));\n#else\n  // Launch filter setup kernel.\n  AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0,\n                                 at::cuda::getCurrentCUDAStream()));\n#endif\n\n  // Copy kernels to constant memory.\n  if (writeSigns && !readSigns)\n    AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream())));\n  else if (!writeSigns && readSigns)\n    AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream())));\n  else if (!writeSigns && !readSigns)\n    AT_CUDA_CHECK((copy_filters(at::cuda::getCurrentCUDAStream())));\n\n  // Set cache and shared memory configurations for main kernel.\n  AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));\n  if (spec.dynamicSharedKB)  // Need dynamically allocated shared memory?\n#ifdef MMCV_WITH_HIP\n    AT_CUDA_CHECK(hipFuncSetAttribute(\n        spec.exec, hipFuncAttributeMaxDynamicSharedMemorySize,\n        spec.dynamicSharedKB << 10));\n#else\n    AT_CUDA_CHECK(cudaFuncSetAttribute(\n        spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize,\n        spec.dynamicSharedKB << 10));\n#endif\n  AT_CUDA_CHECK(\n      cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));\n\n  // Launch main kernel.\n  const int maxSubGz = 65535;  // CUDA maximum for block z dimension.\n  for (int zofs = 0; zofs < gz;\n       zofs += maxSubGz)  // Do multiple launches if gz is too big.\n  {\n    p.blockZofs = zofs;\n    int subGz = std::min(maxSubGz, gz - zofs);\n#ifdef MMCV_WITH_HIP\n    AT_CUDA_CHECK(hipLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,\n                                  spec.dynamicSharedKB << 10,\n           
                       at::cuda::getCurrentCUDAStream()));\n#else\n    AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,\n                                   spec.dynamicSharedKB << 10,\n                                   at::cuda::getCurrentCUDAStream()));\n#endif\n  }\n\n  // Done.\n  return std::make_tuple(y, so, 0);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op_impl(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns);\n\nREGISTER_DEVICE_IMPL(filtered_lrelu_op_impl, CUDA, filtered_lrelu_op);\n\n#else\n\n#pragma message(                           \\\n    \"filtered_lrelu_op is not available. \" \\\n    \"Please update your compiler and cuda version.\")\n\n#endif\n#undef BUILD_FILTERED_LRELU_OP\n\n//------------------------------------------------------------------------\n\ntorch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,\n                                    int sy, float gain, float slope,\n                                    float clamp, bool writeSigns) {\n  // Set CUDA device.\n  TORCH_CHECK(x.is_cuda(), \"x must reside on CUDA device\");\n  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));\n\n  // Validate arguments.\n  TORCH_CHECK(x.dim() == 4, \"x must be rank 4\");\n  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&\n                  x.size(3) <= INT_MAX,\n              \"x is too large\");\n  TORCH_CHECK(x.numel() > 0, \"x is empty\");\n  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat ||\n                  x.dtype() == torch::kDouble,\n              \"x must be float16, float32 or float64\");\n\n  // Output signs if we don't have sign input.\n  torch::Tensor so;\n  torch::Tensor s = si;\n  bool readSigns = !!s.numel();\n  if 
(writeSigns) {\n    int64_t sw = x.size(3);\n    sw = (sw + 15) & ~15;  // Round to a multiple of 16 for coalescing.\n    s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2},\n                          x.options().dtype(torch::kUInt8),\n                          at::MemoryFormat::Contiguous);\n  }\n\n  // Validate sign tensor if in use.\n  if (readSigns || writeSigns) {\n    TORCH_CHECK(s.is_contiguous(), \"signs must be contiguous\");\n    TORCH_CHECK(s.dtype() == torch::kUInt8, \"signs must be uint8\");\n    TORCH_CHECK(s.device() == x.device(),\n                \"signs must reside on the same device as x\");\n    TORCH_CHECK(s.dim() == 4, \"signs must be rank 4\");\n    TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1),\n                \"signs must have same batch & channels as x\");\n    TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX,\n                \"signs tensor is too large\");\n  }\n\n  // Initialize CUDA kernel parameters.\n  filtered_lrelu_act_kernel_params p;\n  p.x = x.data_ptr();\n  p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;\n  p.gain = gain;\n  p.slope = slope;\n  p.clamp = clamp;\n  p.xShape =\n      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));\n  p.xStride =\n      make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));\n  p.sShape = (readSigns || writeSigns)\n                 ? make_int2((int)s.size(3) << 2, (int)s.size(2))\n                 : make_int2(0, 0);  // Width is in elements. 
Contiguous.\n  p.sOfs = make_int2(sx, sy);\n\n  // Choose CUDA kernel.\n  void *func = 0;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      x.scalar_type(), \"filtered_lrelu_act_cuda\", [&] {\n        if (writeSigns)\n          func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();\n        else if (readSigns)\n          func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();\n        else\n          func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();\n      });\n  TORCH_CHECK(func, \"internal error - CUDA kernel not found\");\n\n  // Launch CUDA kernel.\n  void *args[] = {&p};\n  int bx = 128;  // 4 warps per block.\n\n  // Logical size of launch = writeSigns ? p.s : p.x\n  uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;\n  uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;\n  uint32_t gz =\n      p.xShape.z * p.xShape.w;  // Same as in p.sShape if signs are in use.\n  gx = (gx - 1) / bx + 1;\n\n  // Make sure grid y and z dimensions are within CUDA launch limits. Kernel\n  // loops internally to do the rest.\n  const uint32_t gmax = 65535;\n  gy = std::min(gy, gmax);\n  gz = std::min(gz, gmax);\n\n  // Launch.\n#ifdef MMCV_WITH_HIP\n  AT_CUDA_CHECK(hipLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,\n                                at::cuda::getCurrentCUDAStream()));\n#else\n  AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,\n                                 at::cuda::getCurrentCUDAStream()));\n#endif\n\n  return so;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"sigmoid_focal_loss_cuda_kernel.cuh\"\n#include \"softmax_focal_loss_cuda_kernel.cuh\"\n\nvoid SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha) {\n  int output_size = output.numel();\n  int num_classes = input.size(1);\n  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,\n             \"target label should smaller or equal than num classes\");\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sigmoid_focal_loss_forward_cuda_kernel\", [&] {\n        sigmoid_focal_loss_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                output.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,\n                                                Tensor weight,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha) {\n  int output_size = grad_input.numel();\n  int num_classes = input.size(1);\n\n  at::cuda::CUDAGuard device_guard(grad_input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sigmoid_focal_loss_backward_cuda_kernel\", [&] {\n    
    sigmoid_focal_loss_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha) {\n  int output_size = output.numel();\n  int num_classes = softmax.size(1);\n\n  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,\n             \"target label should smaller or equal than num classes\");\n  at::cuda::CUDAGuard device_guard(softmax.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      softmax.scalar_type(), \"softmax_focal_loss_forward_cuda_kernel\", [&] {\n        softmax_focal_loss_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, softmax.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                output.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target,\n                                                Tensor weight, Tensor buff,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha) {\n  int num_classes = softmax.size(1);\n\n  int output_size = buff.numel();\n  at::cuda::CUDAGuard 
device_guard(grad_input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_input.scalar_type(),\n      \"softmax_focal_loss_backward_cuda1_\"\n      \"kernel\",\n      [&] {\n        softmax_focal_loss_backward_cuda1_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, softmax.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  output_size = grad_input.numel();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_input.scalar_type(),\n      \"softmax_focal_loss_backward_cuda2_\"\n      \"kernel\",\n      [&] {\n        softmax_focal_loss_backward_cuda2_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, softmax.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),\n                grad_input.data_ptr<scalar_t>(), num_classes);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/furthest_point_sample_cuda.cu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling_gpu.cu\n\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"furthest_point_sample_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\ninline int opt_n_threads(int work_size) {\n  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);\n\n  return max(min(1 << pow_2, 1024), 1);\n}\n\nvoid FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,\n                                                    const float* dataset,\n                                                    float* temp, int* idxs) {\n  // dataset: (B, N, 3)\n  // tmp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  unsigned int n_threads = opt_n_threads(n);\n\n  switch (n_threads) {\n    case 1024:\n      furthest_point_sampling_forward_cuda_kernel<1024>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 512:\n      furthest_point_sampling_forward_cuda_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 256:\n      furthest_point_sampling_forward_cuda_kernel<256>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 128:\n      furthest_point_sampling_forward_cuda_kernel<128>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 64:\n      furthest_point_sampling_forward_cuda_kernel<64>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 32:\n      furthest_point_sampling_forward_cuda_kernel<32>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 16:\n      furthest_point_sampling_forward_cuda_kernel<16>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 8:\n      
furthest_point_sampling_forward_cuda_kernel<8>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 4:\n      furthest_point_sampling_forward_cuda_kernel<4>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 2:\n      furthest_point_sampling_forward_cuda_kernel<2>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 1:\n      furthest_point_sampling_forward_cuda_kernel<1>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    default:\n      furthest_point_sampling_forward_cuda_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n  }\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid FurthestPointSamplingWithDistForwardCUDAKernelLauncher(\n    int b, int n, int m, const float* dataset, float* temp, int* idxs) {\n  // dataset: (B, N, N)\n  // temp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  unsigned int n_threads = opt_n_threads(n);\n\n  switch (n_threads) {\n    case 1024:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<1024>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 512:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 256:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<256>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 128:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<128>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 64:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<64>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 32:\n      
furthest_point_sampling_with_dist_forward_cuda_kernel<32>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 16:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<16>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 8:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<8>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 4:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<4>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 2:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<2>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 1:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<1>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    default:\n      furthest_point_sampling_with_dist_forward_cuda_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n  }\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu",
    "content": "// Modified from\n// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act_kernel.cu\n// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.\n//\n// This work is made available under the Nvidia Source Code License-NC.\n// To view a copy of this license, visit\n// https://nvlabs.github.io/stylegan2/license.html\n\n#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <torch/types.h>\n\n#include <ATen/cuda/CUDAApplyUtils.cuh>\n\ntemplate <typename scalar_t>\nstatic __global__ void fused_bias_act_kernel(\n    scalar_t* out, const scalar_t* p_x, const scalar_t* p_b,\n    const scalar_t* p_ref, int act, int grad, scalar_t alpha, scalar_t scale,\n    int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {\n  int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;\n\n  scalar_t zero = 0.0;\n\n  for (int loop_idx = 0; loop_idx < loop_x && xi < size_x;\n       loop_idx++, xi += blockDim.x) {\n    scalar_t x = p_x[xi];\n\n    if (use_bias) {\n      x += p_b[(xi / step_b) % size_b];\n    }\n\n    scalar_t ref = use_ref ? p_ref[xi] : zero;\n\n    scalar_t y;\n\n    // act = 1: linear layer\n    // act = 3: leaky relu layer\n    // grad = 0: direct forward path\n    // grad = 1: first order deviation\n    // grad = 2: second order deviation\n    switch (act * 10 + grad) {\n      default:\n      case 10:\n        y = x;\n        break;\n      case 11:\n        y = x;\n        break;\n      case 12:\n        y = 0.0;\n        break;\n\n      case 30:\n        y = (x > 0.0) ? x : x * alpha;\n        break;\n      case 31:\n        y = (ref > 0.0) ? 
x : x * alpha;\n        break;\n      case 32:\n        y = 0.0;\n        break;\n    }\n\n    out[xi] = y * scale;\n  }\n}\n\ntorch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,\n                                      const torch::Tensor& bias,\n                                      const torch::Tensor& refer, int act,\n                                      int grad, float alpha, float scale) {\n  int curDevice = -1;\n  cudaGetDevice(&curDevice);\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);\n\n  auto x = input.contiguous();\n  auto b = bias.contiguous();\n  auto ref = refer.contiguous();\n\n  int use_bias = b.numel() ? 1 : 0;\n  int use_ref = ref.numel() ? 1 : 0;\n\n  int size_x = x.numel();\n  int size_b = b.numel();\n  int step_b = 1;\n\n  for (int i = 1 + 1; i < x.dim(); i++) {\n    step_b *= x.size(i);\n  }\n\n  int loop_x = 4;\n  int block_size = 4 * 32;\n  int grid_size = (size_x - 1) / (loop_x * block_size) + 1;\n\n  auto y = torch::empty_like(x);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      x.scalar_type(), \"fused_bias_act_kernel\", [&] {\n        fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(\n            y.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(),\n            b.data_ptr<scalar_t>(), ref.data_ptr<scalar_t>(), act, grad, alpha,\n            scale, loop_x, size_x, step_b, size_b, use_bias, use_ref);\n      });\n\n  return y;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu",
    "content": "#include <cuda_runtime_api.h>\n#include <torch/script.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/indice.h>\n#include <utils/spconv/spconv/reordering.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n\ntorch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM) {\n  at::cuda::CUDAGuard device_guard(features.device());\n  bool subM = _subM != 0;\n  bool inverse = _inverse != 0;\n  auto device = features.device().type();\n  auto ndim = filters.dim() - 2;\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto numOutPlanes = filters.size(ndim + 1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto indicePairMaxSizeIter =\n      std::max_element(indicePairNumCpu.data_ptr<int>(),\n                       indicePairNumCpu.data_ptr<int>() + kernelVolume);\n  int indicePairMaxOffset =\n      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();\n  int indicePairMaxSize = *indicePairMaxSizeIter;\n\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n\n  torch::Tensor output =\n      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);\n  torch::Tensor inputBuffer =\n      torch::zeros({indicePairMaxSize, numInPlanes}, options);\n  torch::Tensor outputBuffer =\n      torch::zeros({indicePairMaxSize, numOutPlanes}, options);\n  filters = filters.view({-1, numInPlanes, numOutPlanes});\n  if (subM) {  // the center index of subm conv don't need gather and scatter\n               // add.\n    torch::mm_out(output, features, filters[indicePairMaxOffset]);\n  }\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0 
|| (subM && i == indicePairMaxOffset)) {\n      continue;\n    }\n\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        features.scalar_type(), \"FusedIndiceConvBatchnormKernel\", [&] {\n          auto outputBufferBlob = torch::from_blob(\n              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);\n          auto inputBufferBlob = torch::from_blob(\n              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);\n\n          if (device == torch::kCPU) {\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;\n            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n          } else {\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n                gatherFtor;\n            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            TV_CHECK_CUDA_ERR();\n            /* slower than SparseGatherFunctor, may due to int->long conversion\n            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);\n            auto indicePairBlob =\n            torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},\n            indicePairOptions); torch::index_select_out(inputBufferBlob,\n            features, 0, indicePairBlob);*/\n          }\n          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);\n\n          if (device == torch::kCPU) {\n            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::CPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n            
    tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n          } else {\n            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n            TV_CHECK_CUDA_ERR();\n          }\n        });\n  }\n\n  return output;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n\n#include \"gather_points_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                           const Tensor points,\n                                           const Tensor idx, Tensor out) {\n  // points: (B, C, N)\n  // idx: (B, npoints)\n  // output:\n  //      out: (B, C, npoints)\n\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"gather_points_forward_cuda_kernel\", [&] {\n        gather_points_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, npoints, points.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), out.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                            const Tensor grad_out,\n                                            const Tensor idx,\n                                            Tensor grad_points) {\n  // grad_out: (B, C, npoints)\n  // idx: (B, npoints)\n  // output:\n  //      grad_points: (B, C, N)\n\n  at::cuda::CUDAGuard device_guard(grad_out.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"gather_points_backward_cuda_kernel\", [&] {\n        gather_points_backward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, 
npoints, grad_out.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"group_points_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                          int nsample, const Tensor points,\n                                          const Tensor idx, Tensor out) {\n  // points: (B, C, N)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      out: (B, C, npoints, nsample)\n\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"group_points_forward_cuda_kernel\", [&] {\n        group_points_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, npoints, nsample, points.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), out.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,\n                                           int nsample, const Tensor grad_out,\n                                           const Tensor idx,\n                                           Tensor grad_points) {\n  // grad_out: (B, C, npoints, nsample)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      grad_points: (B, C, N)\n\n  at::cuda::CUDAGuard device_guard(grad_out.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  
AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"group_points_backward_cuda_kernel\", [&] {\n        group_points_backward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, npoints, nsample, grad_out.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu",
    "content": "// Modified from\n// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu\n\n/*\n3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)\nWritten by Shaoshuai Shi\nAll Rights Reserved 2019-2020.\n*/\n\n#include <stdio.h>\n\n#include \"iou3d_cuda_kernel.cuh\"\n#include \"nms_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,\n                                                   const Tensor boxes_a,\n                                                   const int num_b,\n                                                   const Tensor boxes_b,\n                                                   Tensor ans_overlap) {\n  at::cuda::CUDAGuard device_guard(boxes_a.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),\n              GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));\n  dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);\n\n  iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(\n      num_a, boxes_a.data_ptr<float>(), num_b, boxes_b.data_ptr<float>(),\n      ans_overlap.data_ptr<float>());\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,\n                                         Tensor& keep_num,\n                                         float nms_overlap_thresh) {\n  using namespace at::indexing;\n  at::cuda::CUDAGuard device_guard(boxes.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  int boxes_num = boxes.size(0);\n\n  const int col_blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  Tensor mask =\n      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));\n\n  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),\n  
            GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));\n  dim3 threads(THREADS_PER_BLOCK_NMS);\n\n  iou3d_nms3d_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(\n      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),\n      (unsigned long long*)mask.data_ptr<int64_t>());\n\n  at::Tensor keep_t = at::zeros(\n      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));\n  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),\n                          col_blocks * sizeof(unsigned long long), stream>>>(\n      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),\n      boxes_num);\n\n  auto keep_data = keep_t.nonzero().index({Slice(), 0});\n  keep_num.fill_(at::Scalar(keep_data.size(0)));\n  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes, Tensor& keep,\n                                               Tensor& keep_num,\n                                               float nms_overlap_thresh) {\n  using namespace at::indexing;\n  at::cuda::CUDAGuard device_guard(boxes.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  int boxes_num = boxes.size(0);\n\n  const int col_blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  Tensor mask =\n      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));\n\n  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),\n              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));\n  dim3 threads(THREADS_PER_BLOCK_NMS);\n\n  iou3d_nms3d_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(\n      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),\n      (unsigned long long*)mask.data_ptr<int64_t>());\n\n  at::Tensor keep_t = at::zeros(\n      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));\n  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),\n           
               col_blocks * sizeof(unsigned long long), stream>>>(\n      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),\n      boxes_num);\n\n  auto keep_data = keep_t.nonzero().index({Slice(), 0});\n  keep_num.fill_(at::Scalar(keep_data.size(0)));\n  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap\n\n#include <cmath>\n#include <cstdio>\n\n#include \"knn_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,\n                                  const Tensor xyz, const Tensor new_xyz,\n                                  Tensor idx, Tensor dist2) {\n  // param new_xyz: (B, m, 3)\n  // param xyz: (B, n, 3)\n  // param idx: (B, m, nsample)\n\n  at::cuda::CUDAGuard device_guard(new_xyz.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      new_xyz.scalar_type(), \"knn_forward_cuda_kernel\", [&] {\n        knn_forward_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            b, n, m, nsample, xyz.data_ptr<scalar_t>(),\n            new_xyz.data_ptr<scalar_t>(), idx.data_ptr<int>(),\n            dist2.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"masked_conv2d_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int kernel_h,\n                                           const int kernel_w, const int pad_h,\n                                           const int pad_w) {\n  int channels = bottom_data.size(1);\n  int height = bottom_data.size(2);\n  int width = bottom_data.size(3);\n  int mask_cnt = mask_h_idx.size(0);\n  int output_size = mask_cnt * channels;\n\n  at::cuda::CUDAGuard device_guard(bottom_data.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      bottom_data.scalar_type(), \"MaskedIm2colLaucherForward\", ([&] {\n        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();\n        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();\n        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();\n        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();\n        MaskedIm2colForward<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, bottom_data_, height, width, kernel_h, kernel_w,\n                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid MaskedCol2imForwardCUDAKernelLauncher(\n    const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,\n    Tensor top_data, const int height, const int width, const int channels) {\n  int mask_cnt = mask_h_idx.size(0);\n  int output_size = mask_cnt * channels;\n\n  at::cuda::CUDAGuard device_guard(bottom_data.device());\n  cudaStream_t stream = 
at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      bottom_data.scalar_type(), \"MaskedCol2imLaucherForward\", ([&] {\n        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();\n        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();\n        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();\n        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();\n\n        MaskedCol2imForward<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, bottom_data_, height, width, channels, mask_h_idx_,\n                mask_w_idx_, mask_cnt, top_data_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/min_area_polygons.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/minareabbox/src/minareabbox_kernel.cu\n#include \"min_area_polygons_cuda.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets,\n                                       Tensor polygons) {\n  int num_pointsets = pointsets.size(0);\n  const int output_size = polygons.numel();\n  at::cuda::CUDAGuard device_guard(pointsets.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      pointsets.scalar_type(), \"min_area_polygons_cuda_kernel\", ([&] {\n        min_area_polygons_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                num_pointsets, pointsets.data_ptr<scalar_t>(),\n                polygons.data_ptr<scalar_t>());\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"modulated_deform_conv_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid modulated_deformable_im2col_cuda(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col) {\n  // num_axes should be smaller than block size\n  const int channel_per_deformable_group = channels / deformable_group;\n  const int num_kernels = channels * batch_size * height_col * width_col;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_im.scalar_type(), \"modulated_deformable_im2col_gpu\", ([&] {\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n\n        modulated_deformable_im2col_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            at::cuda::getCurrentCUDAStream()>>>(\n            num_kernels, data_im_, data_offset_, data_mask_, height_im,\n            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n            dilation_h, dilation_w, channel_per_deformable_group, batch_size,\n            channels, deformable_group, height_col, width_col, data_col_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid modulated_deformable_col2im_cuda(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im) {\n  const int channel_per_deformable_group = channels / deformable_group;\n  const int num_kernels =\n      channels * kernel_h * kernel_w * batch_size * height_col * width_col;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"modulated_deformable_col2im_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();\n\n        modulated_deformable_col2im_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            at::cuda::getCurrentCUDAStream()>>>(\n            num_kernels, data_col_, data_offset_, data_mask_, channels,\n            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,\n            stride_w, dilation_h, dilation_w, channel_per_deformable_group,\n            batch_size, deformable_group, height_col, width_col, grad_im_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid modulated_deformable_col2im_coord_cuda(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask) {\n  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *\n                          kernel_w * deformable_group;\n  const int 
channel_per_deformable_group =\n      channels * kernel_h * kernel_w / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"modulated_deformable_col2im_coord_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();\n        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();\n\n        modulated_deformable_col2im_coord_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            at::cuda::getCurrentCUDAStream()>>>(\n            num_kernels, data_col_, data_im_, data_offset_, data_mask_,\n            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,\n            stride_h, stride_w, dilation_h, dilation_w,\n            channel_per_deformable_group, batch_size,\n            2 * kernel_h * kernel_w * deformable_group, deformable_group,\n            height_col, width_col, grad_offset_, grad_mask_);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from\n*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n\n#include <THC/THCAtomics.cuh>\n#include <vector>\n\n#include \"ms_deform_attn_cuda_kernel.cuh\"\n\ntemplate <typename scalar_t>\nvoid ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value,\n                               const int64_t *data_spatial_shapes,\n                               const int64_t *data_level_start_index,\n                               const scalar_t *data_sampling_loc,\n                               const scalar_t *data_attn_weight,\n                               const int batch_size, const int spatial_size,\n                               const int num_heads, const int channels,\n                               const int num_levels, const int num_query,\n                               const int num_point, scalar_t *data_col) {\n  const int num_kernels = batch_size * num_query * num_heads * channels;\n  const int num_actual_kernels = batch_size * num_query * num_heads * channels;\n  const int num_threads = THREADS_PER_BLOCK;\n  ms_deformable_im2col_gpu_kernel<scalar_t>\n      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(\n          num_kernels, data_value, data_spatial_shapes, data_level_start_index,\n          data_sampling_loc, data_attn_weight, batch_size, spatial_size,\n          num_heads, channels, num_levels, 
num_query, num_point, data_col);\n\n  cudaError_t err = cudaGetLastError();\n  if (err != cudaSuccess) {\n    printf(\"error in ms_deformable_im2col_cuda: %s\\n\", cudaGetErrorString(err));\n  }\n}\n\ntemplate <typename scalar_t>\nvoid ms_deformable_col2im_cuda(\n    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  const int num_threads =\n      (channels > THREADS_PER_BLOCK) ? THREADS_PER_BLOCK : channels;\n  const int num_kernels = batch_size * num_query * num_heads * channels;\n  const int num_actual_kernels = batch_size * num_query * num_heads * channels;\n  if (channels > THREADS_PER_BLOCK) {\n    if ((channels & THREADS_PER_BLOCK - 1) == 0) {\n      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n             num_threads * 3 * sizeof(scalar_t), stream>>>(\n              num_kernels, grad_col, data_value, data_spatial_shapes,\n              data_level_start_index, data_sampling_loc, data_attn_weight,\n              batch_size, spatial_size, num_heads, channels, num_levels,\n              num_query, num_point, grad_value, grad_sampling_loc,\n              grad_attn_weight);\n    } else {\n      ms_deformable_col2im_gpu_kernel_gm<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, spatial_size, num_heads,\n        
               channels, num_levels, num_query, num_point, grad_value,\n                       grad_sampling_loc, grad_attn_weight);\n    }\n  } else {\n    switch (channels) {\n      case 1:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      1>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 2:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      2>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 4:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      4>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, 
num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 8:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      8>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 16:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      16>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 32:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      32>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, 
grad_attn_weight);\n        break;\n      case 64:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      64>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 128:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      128>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 256:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      256>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 512:\n        
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      512>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      default:\n        if (channels < 64) {\n          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>\n              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n                 num_threads * 3 * sizeof(scalar_t), stream>>>(\n                  num_kernels, grad_col, data_value, data_spatial_shapes,\n                  data_level_start_index, data_sampling_loc, data_attn_weight,\n                  batch_size, spatial_size, num_heads, channels, num_levels,\n                  num_query, num_point, grad_value, grad_sampling_loc,\n                  grad_attn_weight);\n        } else {\n          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>\n              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n                 num_threads * 3 * sizeof(scalar_t), stream>>>(\n                  num_kernels, grad_col, data_value, data_spatial_shapes,\n                  data_level_start_index, data_sampling_loc, data_attn_weight,\n                  batch_size, spatial_size, num_heads, channels, num_levels,\n                  num_query, num_point, grad_value, grad_sampling_loc,\n                  grad_attn_weight);\n        }\n    }\n  }\n  cudaError_t err = cudaGetLastError();\n  if (err != cudaSuccess) {\n    printf(\"error in ms_deformable_col2im_cuda: %s\\n\", cudaGetErrorString(err));\n  }\n}\n\nat::Tensor 
ms_deform_attn_cuda_forward(const at::Tensor &value,\n                                       const at::Tensor &spatial_shapes,\n                                       const at::Tensor &level_start_index,\n                                       const at::Tensor &sampling_loc,\n                                       const at::Tensor &attn_weight,\n                                       const int im2col_step) {\n  AT_ASSERTM(value.is_contiguous(), \"value tensor has to be contiguous\");\n  AT_ASSERTM(spatial_shapes.is_contiguous(),\n             \"spatial_shapes tensor has to be contiguous\");\n  AT_ASSERTM(level_start_index.is_contiguous(),\n             \"level_start_index tensor has to be contiguous\");\n  AT_ASSERTM(sampling_loc.is_contiguous(),\n             \"sampling_loc tensor has to be contiguous\");\n  AT_ASSERTM(attn_weight.is_contiguous(),\n             \"attn_weight tensor has to be contiguous\");\n\n  AT_ASSERTM(value.is_cuda(), \"value must be a CUDA tensor\");\n  AT_ASSERTM(spatial_shapes.is_cuda(), \"spatial_shapes must be a CUDA tensor\");\n  AT_ASSERTM(level_start_index.is_cuda(),\n             \"level_start_index must be a CUDA tensor\");\n  AT_ASSERTM(sampling_loc.is_cuda(), \"sampling_loc must be a CUDA tensor\");\n  AT_ASSERTM(attn_weight.is_cuda(), \"attn_weight must be a CUDA tensor\");\n\n  const int batch = value.size(0);\n  const int spatial_size = value.size(1);\n  const int num_heads = value.size(2);\n  const int channels = value.size(3);\n\n  const int num_levels = spatial_shapes.size(0);\n\n  const int num_query = sampling_loc.size(1);\n  const int num_point = sampling_loc.size(4);\n\n  const int im2col_step_ = std::min(batch, im2col_step);\n\n  AT_ASSERTM(batch % im2col_step_ == 0, \"batch(%d) must divide im2col_step(%d)\",\n             batch, im2col_step_);\n\n  auto output =\n      at::zeros({batch, num_query, num_heads, channels}, value.options());\n\n  const int batch_n = im2col_step_;\n  auto output_n = output.view(\n      {batch 
/ im2col_step_, batch_n, num_query, num_heads, channels});\n  auto per_value_size = spatial_size * num_heads * channels;\n  auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;\n  auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;\n  for (int n = 0; n < batch / im2col_step_; ++n) {\n    auto columns = output_n.select(0, n);\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        value.scalar_type(), \"ms_deform_attn_forward_cuda\", ([&] {\n          ms_deformable_im2col_cuda(\n              at::cuda::getCurrentCUDAStream(),\n              value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,\n              spatial_shapes.data_ptr<int64_t>(),\n              level_start_index.data_ptr<int64_t>(),\n              sampling_loc.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_sample_loc_size,\n              attn_weight.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_attn_weight_size,\n              batch_n, spatial_size, num_heads, channels, num_levels, num_query,\n              num_point, columns.data_ptr<scalar_t>());\n        }));\n  }\n\n  output = output.view({batch, num_query, num_heads * channels});\n\n  return output;\n}\n\nvoid ms_deform_attn_cuda_backward(\n    const at::Tensor &value, const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight, const at::Tensor &grad_output,\n    at::Tensor &grad_value, at::Tensor &grad_sampling_loc,\n    at::Tensor &grad_attn_weight, const int im2col_step) {\n  AT_ASSERTM(value.is_contiguous(), \"value tensor has to be contiguous\");\n  AT_ASSERTM(spatial_shapes.is_contiguous(),\n             \"spatial_shapes tensor has to be contiguous\");\n  AT_ASSERTM(level_start_index.is_contiguous(),\n             \"level_start_index tensor has to be contiguous\");\n  AT_ASSERTM(sampling_loc.is_contiguous(),\n             \"sampling_loc tensor has to be 
contiguous\");\n  AT_ASSERTM(attn_weight.is_contiguous(),\n             \"attn_weight tensor has to be contiguous\");\n  AT_ASSERTM(grad_output.is_contiguous(),\n             \"grad_output tensor has to be contiguous\");\n\n  AT_ASSERTM(value.is_cuda(), \"value must be a CUDA tensor\");\n  AT_ASSERTM(spatial_shapes.is_cuda(), \"spatial_shapes must be a CUDA tensor\");\n  AT_ASSERTM(level_start_index.is_cuda(),\n             \"level_start_index must be a CUDA tensor\");\n  AT_ASSERTM(sampling_loc.is_cuda(), \"sampling_loc must be a CUDA tensor\");\n  AT_ASSERTM(attn_weight.is_cuda(), \"attn_weight must be a CUDA tensor\");\n  AT_ASSERTM(grad_output.is_cuda(), \"grad_output must be a CUDA tensor\");\n\n  const int batch = value.size(0);\n  const int spatial_size = value.size(1);\n  const int num_heads = value.size(2);\n  const int channels = value.size(3);\n\n  const int num_levels = spatial_shapes.size(0);\n\n  const int num_query = sampling_loc.size(1);\n  const int num_point = sampling_loc.size(4);\n\n  const int im2col_step_ = std::min(batch, im2col_step);\n\n  AT_ASSERTM(batch % im2col_step_ == 0, \"batch(%d) must divide im2col_step(%d)\",\n             batch, im2col_step_);\n\n  const int batch_n = im2col_step_;\n  auto per_value_size = spatial_size * num_heads * channels;\n  auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;\n  auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;\n  auto grad_output_n = grad_output.view(\n      {batch / im2col_step_, batch_n, num_query, num_heads, channels});\n\n  for (int n = 0; n < batch / im2col_step_; ++n) {\n    auto grad_output_g = grad_output_n.select(0, n);\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        value.scalar_type(), \"ms_deform_attn_backward_cuda\", ([&] {\n          ms_deformable_col2im_cuda(\n              at::cuda::getCurrentCUDAStream(),\n              grad_output_g.data_ptr<scalar_t>(),\n              value.data_ptr<scalar_t>() + n * im2col_step_ * 
per_value_size,\n              spatial_shapes.data_ptr<int64_t>(),\n              level_start_index.data_ptr<int64_t>(),\n              sampling_loc.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_sample_loc_size,\n              attn_weight.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_attn_weight_size,\n              batch_n, spatial_size, num_heads, channels, num_levels, num_query,\n              num_point,\n              grad_value.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_value_size,\n              grad_sampling_loc.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_sample_loc_size,\n              grad_attn_weight.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_attn_weight_size);\n        }));\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"nms_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nTensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,\n                             int offset) {\n  at::cuda::CUDAGuard device_guard(boxes.device());\n\n  if (boxes.numel() == 0) {\n    return at::empty({0}, boxes.options().dtype(at::kLong));\n  }\n  auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));\n  auto boxes_sorted = boxes.index_select(0, order_t);\n\n  int boxes_num = boxes.size(0);\n  const int col_blocks = (boxes_num + threadsPerBlock - 1) / threadsPerBlock;\n  const int col_blocks_alloc = GET_BLOCKS(boxes_num, threadsPerBlock);\n  Tensor mask =\n      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));\n  dim3 blocks(col_blocks_alloc, col_blocks_alloc);\n  dim3 threads(threadsPerBlock);\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  nms_cuda<<<blocks, threads, 0, stream>>>(\n      boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),\n      (unsigned long long*)mask.data_ptr<int64_t>());\n\n  // Filter the boxes which should be kept.\n  at::Tensor keep_t = at::zeros(\n      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));\n  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),\n                          col_blocks * sizeof(unsigned long long), stream>>>(\n      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),\n      boxes_num);\n  AT_CUDA_CHECK(cudaGetLastError());\n  return order_t.masked_select(keep_t);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/nms_quadri_cuda.cu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"nms_quadri_cuda.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nTensor nms_quadri_cuda(const Tensor dets, const Tensor scores,\n                       const Tensor order_t, const Tensor dets_sorted,\n                       float iou_threshold, const int multi_label) {\n  // using scalar_t = float;\n  AT_ASSERTM(dets.is_cuda(), \"dets must be a CUDA tensor\");\n  AT_ASSERTM(scores.is_cuda(), \"scores must be a CUDA tensor\");\n  at::cuda::CUDAGuard device_guard(dets.device());\n\n  int dets_num = dets.size(0);\n\n  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);\n\n  Tensor mask =\n      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));\n\n  dim3 blocks(col_blocks, col_blocks);\n  dim3 threads(threadsPerBlock);\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      dets_sorted.scalar_type(), \"nms_quadri_kernel_cuda\", [&] {\n        nms_quadri_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),\n            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);\n      });\n\n  Tensor mask_cpu = mask.to(at::kCPU);\n  unsigned long long* mask_host =\n      (unsigned long long*)mask_cpu.data_ptr<int64_t>();\n\n  std::vector<unsigned long long> remv(col_blocks);\n  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);\n\n  Tensor keep =\n      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));\n  int64_t* keep_out = keep.data_ptr<int64_t>();\n\n  int num_to_keep = 0;\n  for (int i = 0; i < dets_num; i++) {\n    int nblock = i / threadsPerBlock;\n    int inblock = i % threadsPerBlock;\n\n    if (!(remv[nblock] & (1ULL << inblock))) {\n      keep_out[num_to_keep++] = i;\n      unsigned long long* p = mask_host + i * col_blocks;\n      for (int j = nblock; j < 
col_blocks; j++) {\n        remv[j] |= p[j];\n      }\n    }\n  }\n\n  AT_CUDA_CHECK(cudaGetLastError());\n  return order_t.index(\n      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)\n           .to(order_t.device(), keep.scalar_type())});\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu\n#include \"nms_rotated_cuda.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nTensor nms_rotated_cuda(const Tensor dets, const Tensor scores,\n                        const Tensor order_t, const Tensor dets_sorted,\n                        float iou_threshold, const int multi_label) {\n  // using scalar_t = float;\n  AT_ASSERTM(dets.is_cuda(), \"dets must be a CUDA tensor\");\n  AT_ASSERTM(scores.is_cuda(), \"scores must be a CUDA tensor\");\n  at::cuda::CUDAGuard device_guard(dets.device());\n\n  int dets_num = dets.size(0);\n\n  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);\n\n  Tensor mask =\n      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));\n\n  dim3 blocks(col_blocks, col_blocks);\n  dim3 threads(threadsPerBlock);\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      dets_sorted.scalar_type(), \"nms_rotated_kernel_cuda\", [&] {\n        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),\n            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);\n      });\n\n  Tensor mask_cpu = mask.to(at::kCPU);\n  unsigned long long* mask_host =\n      (unsigned long long*)mask_cpu.data_ptr<int64_t>();\n\n  std::vector<unsigned long long> remv(col_blocks);\n  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);\n\n  Tensor keep =\n      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));\n  int64_t* keep_out = keep.data_ptr<int64_t>();\n\n  int num_to_keep = 0;\n  for (int i = 0; i < dets_num; i++) {\n    int nblock = i / threadsPerBlock;\n    int inblock = i % threadsPerBlock;\n\n    if (!(remv[nblock] & (1ULL << 
inblock))) {\n      keep_out[num_to_keep++] = i;\n      unsigned long long* p = mask_host + i * col_blocks;\n      for (int j = nblock; j < col_blocks; j++) {\n        remv[j] |= p[j];\n      }\n    }\n  }\n\n  AT_CUDA_CHECK(cudaGetLastError());\n  return order_t.index(\n      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)\n           .to(order_t.device(), keep.scalar_type())});\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu\n// Written by Shaoshuai Shi\n// All Rights Reserved 2019.\n\n#include <stdio.h>\n\n#include \"points_in_boxes_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,\n                                                int pts_num, const Tensor boxes,\n                                                const Tensor pts,\n                                                Tensor box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is\n  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,\n  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default\n  // -1\n\n  at::cuda::CUDAGuard device_guard(boxes.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      boxes.scalar_type(), \"points_in_boxes_part_forward_cuda_kernel\", [&] {\n        points_in_boxes_part_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                batch_size, boxes_num, pts_num, boxes.data_ptr<scalar_t>(),\n                pts.data_ptr<scalar_t>(), box_idx_of_points.data_ptr<int>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,\n                                               int pts_num, const Tensor boxes,\n                                               const Tensor pts,\n                                               Tensor box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box params pts: (B, 
npoints, 3)\n  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),\n  // default -1\n\n  at::cuda::CUDAGuard device_guard(boxes.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      boxes.scalar_type(), \"points_in_boxes_all_forward_cuda_kernel\", [&] {\n        points_in_boxes_all_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                batch_size, boxes_num, pts_num, boxes.data_ptr<scalar_t>(),\n                pts.data_ptr<scalar_t>(), box_idx_of_points.data_ptr<int>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/points_in_polygons_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/ming71/CUDA/blob/master/point_justify/points_justify_kernel.cu\n\n#include <stdio.h>\n\n#include \"points_in_polygons_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,\n                                               const at::Tensor polygons,\n                                               const int rows, const int cols,\n                                               at::Tensor output) {\n  const int output_size = rows * cols;\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"points_in_polygons_forward_cuda_kernel\", ([&] {\n        const scalar_t *vertex1 = points.data_ptr<scalar_t>();\n        const scalar_t *vertex2 = polygons.data_ptr<scalar_t>();\n        scalar_t *inside_flag = output.data_ptr<scalar_t>();\n\n        points_in_polygons_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, vertex1, vertex2, rows, cols, inside_flag);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"prroi_pool_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,\n                                        Tensor output, int pooled_height,\n                                        int pooled_width, float spatial_scale) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  prroi_pool_forward_cuda_kernel<float>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          output_size, input.data_ptr<float>(), rois.data_ptr<float>(),\n          output.data_ptr<float>(), pooled_height, pooled_width,\n          static_cast<float>(spatial_scale), channels, height, width);\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                         Tensor grad_input, int pooled_height,\n                                         int pooled_width,\n                                         float spatial_scale) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  prroi_pool_backward_cuda_kernel<float>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          output_size, grad_output.data_ptr<float>(), rois.data_ptr<float>(),\n          grad_input.data_ptr<float>(), pooled_height, pooled_width,\n          static_cast<float>(spatial_scale), channels, height, width);\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid PrROIPoolCoorBackwardCUDAKernelLauncher(Tensor output, Tensor grad_output,\n           
                                  Tensor input, Tensor rois,\n                                             Tensor grad_rois,\n                                             int pooled_height,\n                                             int pooled_width,\n                                             float spatial_scale) {\n  int output_size = grad_output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  prroi_pool_coor_backward_cuda_kernel<float>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          output_size, output.data_ptr<float>(), grad_output.data_ptr<float>(),\n          input.data_ptr<float>(), rois.data_ptr<float>(),\n          grad_rois.data_ptr<float>(), pooled_height, pooled_width,\n          static_cast<float>(spatial_scale), channels, height, width);\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/hszhao/semseg/blob/master/lib/psa/src\n\n#include <torch/serialize/tensor.h>\n\n#include \"psamask_cuda_kernel.cuh\"\n#include \"pytorch_cuda_helper.hpp\"\n\nvoid PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,\n                                      Tensor output, const int num_,\n                                      const int h_feature, const int w_feature,\n                                      const int h_mask, const int w_mask,\n                                      const int half_h_mask,\n                                      const int half_w_mask) {\n  int nthreads = num_ * h_feature * w_feature;\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  if (psa_type == 0)\n    AT_DISPATCH_FLOATING_TYPES(\n        input.scalar_type(), \"psamask_collect_forward_cuda\", [&] {\n          psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(\n              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n              half_w_mask, input.data_ptr<scalar_t>(),\n              output.data_ptr<scalar_t>());\n        });\n  else\n    AT_DISPATCH_FLOATING_TYPES(\n        input.scalar_type(), \"psamask_distribute_forward_cuda\", [&] {\n          psamask_distribute_forward_cuda<scalar_t>\n              <<<nthreads, 512, 0, stream>>>(\n                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                  half_w_mask, input.data_ptr<scalar_t>(),\n                  output.data_ptr<scalar_t>());\n        });\n}\n\nvoid PSAMaskBackwardCUDAKernelLauncher(\n    const int psa_type, const Tensor grad_output, Tensor grad_input,\n    const int num_, const int h_feature, const int w_feature, const int h_mask,\n    const int w_mask, const int half_h_mask, const int half_w_mask) {\n  int nthreads = num_ * h_feature * w_feature;\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  if 
(psa_type == 0)\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_input.scalar_type(), \"psamask_collect_backward_cuda\", [&] {\n          psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(\n              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n              half_w_mask, grad_output.data_ptr<scalar_t>(),\n              grad_input.data_ptr<scalar_t>());\n        });\n  else\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_input.scalar_type(), \"psamask_distribute_backward_cuda\", [&] {\n          psamask_distribute_backward_cuda<scalar_t>\n              <<<nthreads, 512, 0, stream>>>(\n                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                  half_w_mask, grad_output.data_ptr<scalar_t>(),\n                  grad_input.data_ptr<scalar_t>());\n        });\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/riroi_align_rotated_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"riroi_align_rotated_cuda_kernel.cuh\"\n\nvoid RiROIAlignRotatedForwardCUDAKernelLauncher(\n    const at::Tensor features, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor output) {\n  const int output_size =\n      num_rois * pooled_height * pooled_width * channels * num_orientations;\n  at::cuda::CUDAGuard device_guard(features.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"riroi_align_rotated_forward_cuda_kernel\", ([&] {\n        const scalar_t *bottom_data = features.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *top_data = output.data_ptr<scalar_t>();\n\n        riroi_align_rotated_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, bottom_data, rois_data, scalar_t(spatial_scale),\n                num_samples, clockwise, channels, height, width, pooled_height,\n                pooled_width, num_orientations, top_data);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid RiROIAlignRotatedBackwardCUDAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor bottom_grad) {\n  const int output_size =\n      num_rois * pooled_height * pooled_width * channels * num_orientations;\n  at::cuda::CUDAGuard 
device_guard(top_grad.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"riroi_align_rotated_backward_cuda_kernel\", ([&] {\n        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();\n        riroi_align_rotated_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, top_diff, rois_data, spatial_scale, num_samples,\n                clockwise, channels, height, width, pooled_height, pooled_width,\n                num_orientations, bottom_diff);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"roi_align_cuda_kernel.cuh\"\n\nvoid ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"roi_align_forward_cuda_kernel\", [&] {\n        roi_align_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                argmax_y.data_ptr<scalar_t>(), argmax_x.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,\n                aligned, channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                        Tensor argmax_y, Tensor argmax_x,\n                                        Tensor grad_input, int aligned_height,\n                                        int aligned_width, float spatial_scale,\n                                        int sampling_ratio, int pool_mode,\n                                        bool aligned) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = 
grad_input.size(2);\n  int width = grad_input.size(3);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"roi_align_backward_cuda_kernel\", [&] {\n        roi_align_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),\n                argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,\n                aligned, channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"roi_align_rotated_cuda_kernel.cuh\"\n\nvoid ROIAlignRotatedForwardCUDAKernelLauncher(\n    const at::Tensor input, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor output) {\n  const int output_size = num_rois * pooled_height * pooled_width * channels;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"ROIAlignRotatedLaucherForward\", ([&] {\n        const scalar_t *bottom_data = input.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *top_data = output.data_ptr<scalar_t>();\n\n        roi_align_rotated_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(\n                output_size, bottom_data, rois_data, scalar_t(spatial_scale),\n                sampling_ratio, aligned, clockwise, channels, height, width,\n                pooled_height, pooled_width, top_data);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ROIAlignRotatedBackwardCUDAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {\n  const int output_size = num_rois * pooled_height * pooled_width * channels;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"ROIAlignLaucherBackward\", ([&] {\n        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *bottom_diff = 
bottom_grad.data_ptr<scalar_t>();\n        roi_align_rotated_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(\n                output_size, top_diff, rois_data, spatial_scale, sampling_ratio,\n                aligned, clockwise, channels, height, width, pooled_height,\n                pooled_width, bottom_diff);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"roi_pool_cuda_kernel.cuh\"\n\nvoid ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                      Tensor argmax, int pooled_height,\n                                      int pooled_width, float spatial_scale) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"roi_pool_forward_cuda_kernel\", [&] {\n        roi_pool_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                argmax.data_ptr<int>(), pooled_height, pooled_width,\n                static_cast<scalar_t>(spatial_scale), channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,\n                                       Tensor argmax, Tensor grad_input,\n                                       int pooled_height, int pooled_width,\n                                       float spatial_scale) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"roi_pool_backward_cuda_kernel\", [&] {\n        roi_pool_backward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n             
   output_size, grad_output.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), argmax.data_ptr<int>(),\n                grad_input.data_ptr<scalar_t>(), pooled_height, pooled_width,\n                channels, height, width);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu\n// Written by Shaoshuai Shi\n// All Rights Reserved 2019.\n\n#include <stdio.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"roiaware_pool3d_cuda_kernel.cuh\"\n\nvoid RoiawarePool3dForwardCUDAKernelLauncher(\n    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,\n    int out_y, int out_z, const Tensor rois, const Tensor pts,\n    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,\n    Tensor pooled_features, int pool_method) {\n  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate params pts: (npoints, 3) [x, y, z] in LiDAR coordinate params\n  // pts_feature: (npoints, C) params argmax: (N, out_x, out_y, out_z, C) params\n  // pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) params\n  // pooled_features: (N, out_x, out_y, out_z, C) params pool_method: 0:\n  // max_pool 1: avg_pool\n\n  at::cuda::CUDAGuard device_guard(pts_feature.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  Tensor pts_mask =\n      -at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));\n\n  dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      rois.scalar_type(), \"generate_pts_mask_for_box3d\", [&] {\n        generate_pts_mask_for_box3d<scalar_t>\n            <<<blocks_mask, threads, 0, stream>>>(\n                boxes_num, pts_num, out_x, out_y, out_z,\n                rois.data_ptr<scalar_t>(), pts.data_ptr<scalar_t>(),\n                pts_mask.data_ptr<int>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  // TODO: Merge the collect and pool functions, SS\n\n  dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));\n\n  AT_DISPATCH_INTEGRAL_TYPES(\n      pts_idx_of_voxels.scalar_type(), 
\"collect_inside_pts_for_box3d\", [&] {\n        collect_inside_pts_for_box3d<scalar_t>\n            <<<blocks_collect, threads, 0, stream>>>(\n                boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z,\n                pts_mask.data_ptr<int>(),\n                pts_idx_of_voxels.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),\n                   channels, boxes_num);\n  if (pool_method == 0) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        pts_feature.scalar_type(), \"roiaware_maxpool3d\", [&] {\n          roiaware_maxpool3d<scalar_t><<<blocks_pool, threads, 0, stream>>>(\n              boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,\n              out_z, pts_feature.data_ptr<scalar_t>(),\n              pts_idx_of_voxels.data_ptr<int>(),\n              pooled_features.data_ptr<scalar_t>(), argmax.data_ptr<int>());\n        });\n  } else if (pool_method == 1) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        pts_feature.scalar_type(), \"roiaware_avgpool3d\", [&] {\n          roiaware_avgpool3d<scalar_t><<<blocks_pool, threads, 0, stream>>>(\n              boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,\n              out_z, pts_feature.data_ptr<scalar_t>(),\n              pts_idx_of_voxels.data_ptr<int>(),\n              pooled_features.data_ptr<scalar_t>());\n        });\n  }\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid RoiawarePool3dBackwardCUDAKernelLauncher(\n    int boxes_num, int out_x, int out_y, int out_z, int channels,\n    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,\n    const Tensor grad_out, Tensor grad_in, int pool_method) {\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n  // 
params pool_method: 0: max_pool, 1: avg_pool\n\n  at::cuda::CUDAGuard device_guard(grad_out.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,\n              boxes_num);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  if (pool_method == 0) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        grad_in.scalar_type(), \"roiaware_maxpool3d_backward\", [&] {\n          roiaware_maxpool3d_backward<scalar_t><<<blocks, threads, 0, stream>>>(\n              boxes_num, channels, out_x, out_y, out_z, argmax.data_ptr<int>(),\n              grad_out.data_ptr<scalar_t>(), grad_in.data_ptr<scalar_t>());\n        });\n  } else if (pool_method == 1) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        grad_in.scalar_type(), \"roiaware_avgpool3d_backward\", [&] {\n          roiaware_avgpool3d_backward<scalar_t><<<blocks, threads, 0, stream>>>(\n              boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,\n              pts_idx_of_voxels.data_ptr<int>(), grad_out.data_ptr<scalar_t>(),\n              grad_in.data_ptr<scalar_t>());\n        });\n  }\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu",
    "content": "/*\nModified from\nhttps://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu\nPoint cloud feature pooling\nWritten by Shaoshuai Shi\nAll Rights Reserved 2018.\n*/\n\n#include <math.h>\n#include <stdio.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"roipoint_pool3d_cuda_kernel.cuh\"\n\nvoid RoIPointPool3dForwardCUDAKernelLauncher(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,\n    const Tensor pts_feature, Tensor pooled_features,\n    Tensor pooled_empty_flag) {\n  Tensor pts_assign = at::empty({batch_size, pts_num, boxes_num},\n                                boxes3d.options().dtype(at::kInt));\n\n  at::cuda::CUDAGuard device_guard(xyz.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz.scalar_type(), \"assign_pts_to_box3d\", [&] {\n        assign_pts_to_box3d<scalar_t><<<blocks, threads, 0, stream>>>(\n            batch_size, pts_num, boxes_num, xyz.data_ptr<scalar_t>(),\n            boxes3d.data_ptr<scalar_t>(), pts_assign.data_ptr<int>());\n      });\n\n  Tensor pts_idx = at::empty({batch_size, boxes_num, sampled_pts_num},\n                             boxes3d.options().dtype(at::kInt));\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);\n\n  get_pooled_idx<<<blocks2, threads, 0, stream>>>(\n      batch_size, pts_num, boxes_num, sampled_pts_num,\n      pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),\n      pooled_empty_flag.data_ptr<int>());\n\n  dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,\n                   batch_size);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      
xyz.scalar_type(), \"roipoint_pool3d_forward\", [&] {\n        roipoint_pool3d_forward<scalar_t><<<blocks_pool, threads, 0, stream>>>(\n            batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,\n            xyz.data_ptr<scalar_t>(), pts_idx.data_ptr<int>(),\n            pts_feature.data_ptr<scalar_t>(),\n            pooled_features.data_ptr<scalar_t>(),\n            pooled_empty_flag.data_ptr<int>());\n      });\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/rotated_feature_align_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu\n#include \"pytorch_cuda_helper.hpp\"\n#include \"rotated_feature_align_cuda_kernel.cuh\"\n\nvoid RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,\n                                                  const Tensor best_bboxes,\n                                                  const float spatial_scale,\n                                                  const int points,\n                                                  Tensor output) {\n  at::cuda::CUDAGuard device_guard(features.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  const int output_size = features.numel();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"rotated_feature_align_forward_cuda_kernel\",\n      ([&] {\n        const scalar_t* bottom_data = features.data_ptr<scalar_t>();\n        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();\n        scalar_t* top_data = output.data_ptr<scalar_t>();\n\n        rotated_feature_align_forward_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, points, bottom_data, bboxes_data,\n                scalar_t(spatial_scale), features.size(1), features.size(2),\n                features.size(3), top_data);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,\n                                                   const Tensor best_bboxes,\n                                                   const float spatial_scale,\n                                                   const int points,\n                                                   Tensor bottom_grad) {\n  at::cuda::CUDAGuard device_guard(top_grad.device());\n  cudaStream_t stream = 
at::cuda::getCurrentCUDAStream();\n  const int output_size = top_grad.numel();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"rotated_feature_align_backward_cuda_kernel\",\n      ([&] {\n        const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();\n        scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();\n\n        rotated_feature_align_backward_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, points, top_diff, bboxes_data,\n                scalar_t(spatial_scale), top_grad.size(1), top_grad.size(2),\n                top_grad.size(3), bottom_diff);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include <stdio.h>\n#include <stdlib.h>\n#include <torch/types.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"scatter_points_cuda_kernel.cuh\"\n\nstd::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(\n    const at::Tensor &feats, const at::Tensor &coors,\n    const reduce_t reduce_type) {\n  const int num_input = feats.size(0);\n  const int num_feats = feats.size(1);\n\n  if (num_input == 0)\n    return {feats.clone().detach(), coors.clone().detach(),\n            coors.new_empty({0}, torch::kInt32),\n            coors.new_empty({0}, torch::kInt32)};\n\n  at::Tensor out_coors;\n  at::Tensor coors_map;\n  at::Tensor reduce_count;\n\n  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);\n\n  std::tie(out_coors, coors_map, reduce_count) =\n      at::unique_dim(coors_clean, 0, true, true, true);\n\n  if (out_coors[0][0].lt(0).item<bool>()) {\n    // the first element of out_coors (-1,-1,-1) and should be removed\n    out_coors = out_coors.slice(0, 1);\n    reduce_count = reduce_count.slice(0, 1);\n    coors_map = coors_map - 1;\n  }\n\n  coors_map = coors_map.to(torch::kInt32);\n  reduce_count = reduce_count.to(torch::kInt32);\n\n  auto reduced_feats =\n      at::empty({out_coors.size(0), num_feats}, feats.options());\n\n  at::cuda::CUDAGuard device_guard(feats.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  AT_DISPATCH_FLOATING_TYPES(\n      feats.scalar_type(), \"feats_reduce_kernel\", ([&] {\n        if (reduce_type == reduce_t::MAX)\n          reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());\n        else\n          reduced_feats.fill_(static_cast<scalar_t>(0));\n\n        dim3 blocks(std::min(\n            at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));\n        dim3 threads(THREADS_PER_BLOCK);\n        feats_reduce_kernel<<<blocks, threads, 0, stream>>>(\n            feats.data_ptr<scalar_t>(), 
coors_map.data_ptr<int32_t>(),\n            reduced_feats.data_ptr<scalar_t>(), num_input, num_feats,\n            reduce_type);\n        if (reduce_type == reduce_t::MEAN)\n          reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  return {reduced_feats, out_coors, coors_map, reduce_count};\n}\n\nvoid DynamicPointToVoxelBackwardCUDAKernelLauncher(\n    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,\n    const at::Tensor &feats, const at::Tensor &reduced_feats,\n    const at::Tensor &coors_map, const at::Tensor &reduce_count,\n    const reduce_t reduce_type) {\n  const int num_input = feats.size(0);\n  const int num_reduced = reduced_feats.size(0);\n  const int num_feats = feats.size(1);\n\n  grad_feats.fill_(0);\n  // copy voxel grad to points\n\n  if (num_input == 0 || num_reduced == 0) return;\n  at::cuda::CUDAGuard device_guard(feats.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_reduced_feats.scalar_type(), \"add_reduce_traceback_grad_kernel\",\n        ([&] {\n          dim3 blocks(std::min(\n              at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));\n          dim3 threads(THREADS_PER_BLOCK);\n          add_reduce_traceback_grad_kernel<<<blocks, threads, 0, stream>>>(\n              grad_feats.data_ptr<scalar_t>(),\n              grad_reduced_feats.data_ptr<scalar_t>(),\n              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),\n              num_input, num_feats, reduce_type);\n        }));\n\n    AT_CUDA_CHECK(cudaGetLastError());\n  } else {\n    auto reduce_from = at::full({num_reduced, num_feats}, num_input,\n                                coors_map.options().dtype(torch::kInt32));\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_reduced_feats.scalar_type(),\n        
\"max_reduce_traceback_scatter_idx_kernel\", ([&] {\n          dim3 blocks(std::min(\n              at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));\n          dim3 threads(THREADS_PER_BLOCK);\n          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads, 0,\n                                                    stream>>>(\n              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),\n              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),\n              num_input, num_feats);\n        }));\n\n    AT_CUDA_CHECK(cudaGetLastError());\n\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_reduced_feats.scalar_type(),\n        \"max_reduce_traceback_scatter_idx_kernel\", ([&] {\n          dim3 blocks(\n              std::min(at::cuda::ATenCeilDiv(num_reduced, THREADS_PER_BLOCK),\n                       maxGridDim));\n          dim3 threads(THREADS_PER_BLOCK);\n          max_reduce_scatter_grad_kernel<<<blocks, threads, 0, stream>>>(\n              grad_feats.data_ptr<scalar_t>(),\n              grad_reduced_feats.data_ptr<scalar_t>(),\n              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);\n        }));\n\n    AT_CUDA_CHECK(cudaGetLastError());\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/sparse_indice.cu",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <ATen/ATen.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/indice.h>\n#include <utils/spconv/spconv/mp_helper.h>\n#include <utils/spconv/tensorview/helper_launch.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <chrono>\n#include <limits>\n#include <spconv/indice.cuh>\n#include <type_traits>\n\n#include \"pytorch_cuda_helper.hpp\"\n\nnamespace functor {\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   tv::TensorView<Index> indicePairUnique,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose) {\n    Index batchSize = 
gridsOut.dim(0);\n    auto numActIn = indicesIn.dim(0);\n    if (numActIn == 0) return 0;\n    if (transpose)\n      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>\n          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,\n             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,\n                              indiceNum, indicePairUnique, kernelSize, stride,\n                              padding, dilation, outSpatialShape);\n    else\n      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>\n          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,\n             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,\n                              indiceNum, indicePairUnique, kernelSize, stride,\n                              padding, dilation, outSpatialShape);\n    TV_CHECK_CUDA_ERR();\n    return 1;\n  }\n};\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   tv::TensorView<Index> indicePairUnique,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid) {\n    Index batchSize = gridsOut.dim(0);\n    auto kernelVolume = indicePairs.dim(0);\n    auto numActIn = indicesIn.dim(0);\n    if (numActIn == 0) return 0;\n    Index numAct = indicePairUnique.dim(0) - 1;\n    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>\n        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,\n                            indicePairUnique, 
outSpatialShape, batchSize);\n    TV_CHECK_CUDA_ERR();\n    assignIndicePairsKernel<Index, IndexGrid, NDim>\n        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,\n                            indicePairUnique, outSpatialShape);\n    TV_CHECK_CUDA_ERR();\n\n    if (resetGrid) {\n      resetGridKernel<Index, IndexGrid, NDim>\n          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,\n             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);\n      TV_CHECK_CUDA_ERR();\n    }\n    return numAct;\n  }\n};\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid) {\n    auto numActIn = indicesIn.dim(0);\n    if (numActIn == 0) return 0;\n    prepareSubMGridKernel<Index, IndexGrid, NDim>\n        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);\n    TV_CHECK_CUDA_ERR();\n    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>\n        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,\n                            kernelSize, stride, padding, dilation,\n        
                    outSpatialShape);\n    TV_CHECK_CUDA_ERR();\n\n    if (resetGrid) {\n      resetGridSubMKernel<Index, IndexGrid, NDim>\n          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,\n             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,\n                              numActIn);\n      TV_CHECK_CUDA_ERR();\n    }\n    return numActIn;\n  }\n};\n}  // namespace functor\n\n#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                             \\\n  template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index,   \\\n                                                       int, NDIM>;            \\\n  template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \\\n                                                         int, NDIM>;          \\\n  template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \\\n                                                         int, NDIM>;          \\\n  template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index,   \\\n                                                       int, NDIM>;\n\n#define DECLARE_GPU_INDEX(Index)          \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);\n\nDECLARE_GPU_INDEX(int);\n\n#undef DECLARE_GPU_INDEX\n#undef DECLARE_GPU_SPECS_INDEX_NDIM\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.cu",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <ATen/ATen.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/maxpool.h>\n#include <utils/spconv/spconv/mp_helper.h>\n#include <utils/spconv/tensorview/helper_launch.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <chrono>\n#include <limits>\n#include <type_traits>\n#include <utils/spconv/tensorview/helper_kernel.cuh>\n\n#include \"pytorch_cuda_helper.hpp\"\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,\n                                      const scalar_t *inFeatures,\n                                      const Index *indicesIn,\n                                      const Index *indicesOut, int numHot,\n                                      int numPlanes) {\n  scalar_t in, out;\n  int ILPStrideY[NumILP];\n  Index idxo, idxi;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x; ix < numHot;\n       ix += blockDim.x * gridDim.x) {\n    {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + 
threadIdx.x;\n        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n        in = inFeatures[idxi];\n        out = outFeatures[idxo];\n        if (in > out) {\n          outFeatures[idxo] = in;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,\n                                             const scalar_t *inFeatures,\n                                             const Index *indicesIn,\n                                             const Index *indicesOut,\n                                             int numHot, int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        in = inFeatures[RI[ilp] + iy];\n        out = outFeatures[RO[ilp] + iy];\n        if (in > out) {\n          outFeatures[RO[ilp] + iy] = in;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType>\n__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,\n                                         const scalar_t *inFeatures,\n                                         const Index *indicesIn,\n                                         const Index *indicesOut, int numHot,\n                                         int numPlanes) {\n  int ILPStrideY[NumILP];\n  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);\n  scalar_t 
bufi[vecloadFactor];\n  scalar_t bufo[vecloadFactor];\n  Index idxi, idxo;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;\n       ix += blockDim.x * gridDim.x * vecloadFactor) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      reinterpret_cast<VecType *>(bufo)[0] =\n          reinterpret_cast<VecType *>(outFeatures)[idxo];\n      reinterpret_cast<VecType *>(bufi)[0] =\n          reinterpret_cast<const VecType *>(inFeatures)[idxi];\n#pragma unroll\n      for (int i = 0; i < vecloadFactor; i++) {\n        if (bufi[i] > bufo[i]) {\n          bufo[i] = bufi[i];\n        }\n      }\n      reinterpret_cast<VecType *>(outFeatures)[idxo] =\n          reinterpret_cast<VecType *>(bufo)[0];\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,\n                                        const scalar_t *inFeatures,\n                                        const Index *indicesIn,\n                                        const Index *indicesOut, int numHot,\n                                        int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < numHot) {\n        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n      }\n    
}\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < numHot) {\n          in = inFeatures[RI[ilp] + iy];\n          out = outFeatures[RO[ilp] + iy];\n          if (in > out) {\n            outFeatures[RO[ilp] + iy] = in;\n          }\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,\n                                      const scalar_t *inFeatures,\n                                      const scalar_t *fout, scalar_t *fin,\n                                      const Index *indicesIn,\n                                      const Index *indicesOut, int numHot,\n                                      int numPlanes) {\n  scalar_t in, out;\n  Index idxo, idxi;\n  int ILPStrideY[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  fout += blockIdx.y * NumTLP;\n  fin += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x; ix < numHot;\n       ix += blockDim.x * gridDim.x) {\n    {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n        in = inFeatures[idxi];\n        out = outFeatures[idxo];\n        if (in == out) {\n          fin[idxi] += fout[idxo];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolBwdGenericBlockKernel(\n    const scalar_t *outFeatures, const scalar_t *inFeatures,\n    const scalar_t *fout, scalar_t *fin, const Index *indicesIn,\n    const Index *indicesOut, int numHot, int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index 
RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        in = inFeatures[RI[ilp] + iy];\n        out = outFeatures[RO[ilp] + iy];\n        if (in == out) {\n          fin[RI[ilp] + iy] += fout[RO[ilp] + iy];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType>\n__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,\n                                         const scalar_t *inFeatures,\n                                         const scalar_t *fout, scalar_t *fin,\n                                         const Index *indicesIn,\n                                         const Index *indicesOut, int numHot,\n                                         int numPlanes) {\n  int ILPStrideY[NumILP];\n  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);\n  scalar_t bufi[vecloadFactor];\n  scalar_t bufo[vecloadFactor];\n  scalar_t bufdi[vecloadFactor];\n  scalar_t bufdo[vecloadFactor];\n  Index idxi, idxo;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;\n       ix += blockDim.x * gridDim.x * vecloadFactor) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      idxo = indicesOut[ix + 
ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      reinterpret_cast<VecType *>(bufo)[0] =\n          reinterpret_cast<const VecType *>(outFeatures)[idxo];\n      reinterpret_cast<VecType *>(bufi)[0] =\n          reinterpret_cast<const VecType *>(inFeatures)[idxi];\n      reinterpret_cast<VecType *>(bufdo)[0] =\n          reinterpret_cast<const VecType *>(fout)[idxo];\n      reinterpret_cast<VecType *>(bufdi)[0] =\n          reinterpret_cast<VecType *>(fin)[idxi];\n\n#pragma unroll\n      for (int i = 0; i < vecloadFactor; i++) {\n        if (bufi[i] == bufo[i]) {\n          bufdi[i] += bufdo[i];\n        }\n      }\n      reinterpret_cast<VecType *>(fin)[idxi] =\n          reinterpret_cast<VecType *>(bufdi)[0];\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,\n                                        const scalar_t *inFeatures,\n                                        const scalar_t *fout, scalar_t *fin,\n                                        const Index *indicesIn,\n                                        const Index *indicesOut, int numHot,\n                                        int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < numHot) {\n        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n      }\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < numHot) {\n          in = inFeatures[RI[ilp] + iy];\n          out = outFeatures[RO[ilp] + iy];\n          if 
(in == out) {\n            fin[RI[ilp] + iy] += fout[RO[ilp] + iy];\n          }\n        }\n      }\n    }\n  }\n}\n\nnamespace functor {\ntemplate <typename scalar_t, typename Index>\nstruct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 64, 32, 16>;\n  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const Index> indices, int size) {\n    if (size <= 0) return;\n    int numPlanes = inFeatures.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);\n    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,\n                                 &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (numHotBlock >= NumTLP) {\n            maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                     vecload_type_t>\n                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),\n                                    indices.subview(0).data(),\n                                    indices.subview(1).data(), numHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_CUDA_ERR();\n          }\n\n          if (size > numHotBlock) {\n            maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>\n                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),\n                   0, d.getStream()>>>(outFeatures.data(), 
inFeatures.data(),\n                                       indices.subview(0).data() + numHotBlock,\n                                       indices.subview(1).data() + numHotBlock,\n                                       size - numHotBlock, numPlanes);\n            TV_CHECK_CUDA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n\n    if (notFound) {\n      constexpr int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (numHotBlock >= NumTLP) {\n        maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(),\n                indices.subview(0).data(), indices.subview(1).data(),\n                numHotBlock, numPlanes);\n        TV_CHECK_CUDA_ERR();\n      }\n\n      if (size > numHotBlock) {\n        maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(),\n                indices.subview(0).data() + numHotBlock,\n                indices.subview(1).data() + numHotBlock, size - numHotBlock,\n                numPlanes);\n        TV_CHECK_CUDA_ERR();\n      }\n    }\n  }\n};\n\ntemplate <typename scalar_t, typename Index>\nstruct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 64, 32, 16>;\n  void operator()(const tv::TorchGPU &d,\n                  tv::TensorView<const scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const scalar_t> fout,\n                  
tv::TensorView<scalar_t> fin,\n                  tv::TensorView<const Index> indices, int size) {\n    if (size <= 0) return;\n    int numPlanes = inFeatures.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);\n    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,\n                                 &indices, &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (numHotBlock >= NumTLP) {\n            maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                     vecload_type_t>\n                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),\n                                    fout.data(), fin.data(),\n                                    indices.subview(0).data(),\n                                    indices.subview(1).data(), numHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_CUDA_ERR();\n          }\n\n          if (size > numHotBlock) {\n            maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>\n                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),\n                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),\n                                       fout.data(), fin.data(),\n                                       indices.subview(0).data() + numHotBlock,\n                                       indices.subview(1).data() + numHotBlock,\n                                       size - numHotBlock, numPlanes);\n            TV_CHECK_CUDA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n\n    if (notFound) {\n      constexpr 
int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (numHotBlock >= NumTLP) {\n        maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),\n                indices.subview(0).data(), indices.subview(1).data(),\n                numHotBlock, numPlanes);\n        TV_CHECK_CUDA_ERR();\n      }\n\n      if (size > numHotBlock) {\n        maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),\n                indices.subview(0).data() + numHotBlock,\n                indices.subview(1).data() + numHotBlock, size - numHotBlock,\n                numPlanes);\n        TV_CHECK_CUDA_ERR();\n      }\n    }\n  }\n};\n\n}  // namespace functor\n\n#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \\\n  template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \\\n                                                       Index>;                 \\\n  template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU,          \\\n                                                        scalar_t, Index>;\n\n#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);\n\nDECLARE_GPU_SPECS(float);\nDECLARE_GPU_SPECS(double);\nDECLARE_GPU_SPECS(at::Half);\n\n#undef DECLARE_GPU_SPECS\n#undef DECLARE_GPU_SPECS_T_INDEX\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/sparse_pool_ops_cuda.cu",
    "content": "#include <cuda_runtime_api.h>\n#include <torch/script.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/maxpool.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n\ntorch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,\n                                                     torch::Tensor indicePairs,\n                                                     torch::Tensor indiceNum,\n                                                     int64_t numAct) {\n  at::cuda::CUDAGuard device_guard(features.device());\n  auto device = features.device().type();\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0) {\n      continue;\n    }\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        features.scalar_type(), \"IndiceMaxpoolForwardKernel\", [&] {\n          if (device == torch::kCPU) {\n            functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>\n                forwardFtor;\n            forwardFtor(tv::CPU(), tv::torch2tv<scalar_t>(output),\n                        tv::torch2tv<const scalar_t>(features),\n                        tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n          } else {\n            functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>\n                forwardFtor;\n            forwardFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(output),\n                        tv::torch2tv<const scalar_t>(features),\n                        tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n            TV_CHECK_CUDA_ERR();\n       
   }\n        });\n  }\n  return output;\n}\n\ntorch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,\n                                                      torch::Tensor outFeatures,\n                                                      torch::Tensor outGrad,\n                                                      torch::Tensor indicePairs,\n                                                      torch::Tensor indiceNum) {\n  at::cuda::CUDAGuard device_guard(features.device());\n  auto device = features.device().type();\n  auto numInPlanes = features.size(1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);\n  auto kernelVolume = indicePairs.size(0);\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0) {\n      continue;\n    }\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        features.scalar_type(), \"IndiceMaxpoolBackwardKernel\", [&] {\n          if (device == torch::kCPU) {\n            functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>\n                backwardFtor;\n            backwardFtor(tv::CPU(), tv::torch2tv<const scalar_t>(outFeatures),\n                         tv::torch2tv<const scalar_t>(features),\n                         tv::torch2tv<const scalar_t>(outGrad),\n                         tv::torch2tv<scalar_t>(inputGrad),\n                         tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n          } else {\n            functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>\n                backwardFtor;\n            backwardFtor(tv::TorchGPU(),\n                         tv::torch2tv<const scalar_t>(outFeatures),\n                         tv::torch2tv<const scalar_t>(features),\n                         tv::torch2tv<const scalar_t>(outGrad),\n           
              tv::torch2tv<scalar_t>(inputGrad),\n                         tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n            TV_CHECK_CUDA_ERR();\n          }\n        });\n  }\n  return inputGrad;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/sparse_reordering.cu",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <ATen/ATen.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/mp_helper.h>\n#include <utils/spconv/spconv/reordering.h>\n#include <utils/spconv/tensorview/helper_launch.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <chrono>\n#include <limits>\n#include <spconv/reordering.cuh>\n#include <type_traits>\n#include <utils/spconv/tensorview/helper_kernel.cuh>\n\n#include \"pytorch_cuda_helper.hpp\"\n\nnamespace functor {\ntemplate <typename scalar_t, typename Index>\nstruct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 64, 32, 16>;\n  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,\n                  tv::TensorView<const scalar_t> features,\n                  tv::TensorView<const Index> indices, int size) {\n    if (size <= 0) return;\n    int numPlanes = features.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);\n    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,\n                                 &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n      int nHotBlock = 
(size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (nHotBlock >= NumTLP) {\n            gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                 vecload_type_t>\n                <<<dim3(numPlanes / NumTLP, size / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(buffer.data(), features.data(),\n                                    indices.data(), nHotBlock,\n                                    numPlanes / vecloadFactor);\n\n            TV_CHECK_CUDA_ERR();\n          }\n          if (size - nHotBlock > 0) {\n            gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,\n                            vecload_type_t>\n                <<<dim3(1, numPlanes / NumTLP),\n                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,\n                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,\n                                    features.data(), indices.data() + nHotBlock,\n                                    size - nHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_CUDA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n\n    if (notFound) {\n      constexpr int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>\n          <<<dim3(tv::launch::DivUp(size, NumTLP),\n                  tv::launch::DivUp(numPlanes, NumTLP)),\n             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n              buffer.data(), features.data(), indices.data(), size, numPlanes);\n      TV_CHECK_CUDA_ERR();\n    }\n  }\n};\ntemplate <typename scalar_t, typename Index>\nstruct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 
64, 32, 16>;\n  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> buffer,\n                  tv::TensorView<const Index> indices, int size, bool stable) {\n    if (size <= 0) return;\n    int numPlanes = outFeatures.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor =\n        sizeof(vecload_type_t) / sizeof(scalar_t);  // important for half.\n    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,\n                                 &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n      int nHotBlock = (size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (nHotBlock >= NumTLP) {\n            scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                     vecload_type_t>\n                <<<dim3(numPlanes / NumTLP, size / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(outFeatures.data(), buffer.data(),\n                                    indices.data(), nHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_CUDA_ERR();\n          }\n          if (size - nHotBlock > 0) {\n            scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>\n                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),\n                   0, d.getStream()>>>(\n                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,\n                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);\n            TV_CHECK_CUDA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n    if (notFound) {\n      constexpr int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>\n          <<<dim3(tv::launch::DivUp(size, NumTLP),\n             
     tv::launch::DivUp(numPlanes, NumTLP)),\n             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n              outFeatures.data(), buffer.data(), indices.data(), size,\n              numPlanes);\n      TV_CHECK_CUDA_ERR();\n    }\n  }\n};\n\n}  // namespace functor\n\n#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \\\n  template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \\\n  template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t,     \\\n                                                   Index>;\n\n#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);\n\nDECLARE_GPU_SPECS(float);\nDECLARE_GPU_SPECS(double);\nDECLARE_GPU_SPECS(at::Half);\n\n#undef DECLARE_GPU_SPECS\n#undef DECLARE_GPU_SPECS_T_INDEX\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu",
    "content": "#include <cuda_runtime_api.h>\n#include <torch/script.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/indice.h>\n#include <utils/spconv/spconv/reordering.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  at::cuda::CUDAGuard device_guard(indices.device());\n  bool subM = _subM != 0;\n  bool transpose = _transpose != 0;\n  auto numAct = indices.size(0);\n  auto coorDim = indices.size(1) - 1;\n  TV_ASSERT_RT_ERR(NDim == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(stride.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(padding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(dilation.size() == coorDim, \"error\");\n  auto kernelVolume = kernelSize[0];\n  for (int i = 1; i < kernelSize.size(); ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  TV_ASSERT_RT_ERR(kernelVolume <= 4096, \"error\");\n  auto outputVolume = outSpatialShape[0];\n  for (int i = 1; i < outSpatialShape.size(); ++i) {\n    outputVolume *= outSpatialShape[i];\n  }\n  torch::Tensor indicePairs =\n      torch::full({kernelVolume, 2, numAct}, -1,\n                  torch::dtype(torch::kInt32).device(indices.device()));\n  torch::Tensor indiceNum = torch::zeros(\n      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));\n  torch::Tensor gridOut =\n      
torch::full({batchSize * outputVolume}, -1,\n                  torch::dtype(torch::kInt32).device(indices.device()));\n  int64_t numActOut = -1;\n  tv::SimpleVector<int, NDim> outSpatialShape32;\n  tv::SimpleVector<int, NDim> kernelSize32;\n  tv::SimpleVector<int, NDim> stride32;\n  tv::SimpleVector<int, NDim> padding32;\n  tv::SimpleVector<int, NDim> dilation32;\n  auto indicePairUnique = torch::full(\n      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),\n      torch::dtype(torch::kInt32).device(indices.device()));\n  for (int i = 0; i < NDim; ++i) {\n    outSpatialShape32.push_back(outSpatialShape[i]);\n    kernelSize32.push_back(kernelSize[i]);\n    if (subM) {\n      stride32.push_back(1);\n      padding32.push_back(kernelSize[i] / 2);\n      dilation32.push_back(dilation[i]);\n    } else {\n      stride32.push_back(stride[i]);\n      padding32.push_back(padding[i]);\n      dilation32.push_back(dilation[i]);\n    }\n  }\n  if (subM) {\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose);\n    } else {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose);\n    }\n    return {indices, indicePairs, indiceNum};\n  } else {\n    torch::Tensor outInds =\n        torch::zeros({numAct * kernelVolume, coorDim 
+ 1},\n                     torch::dtype(torch::kInt32).device(indices.device()));\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,\n          transpose);\n    } else {\n      auto getIndicePairFtorP1 =\n          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,\n                                                 NDim>();\n      auto getIndicePairFtorP2 =\n          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,\n                                                 NDim>();\n      numActOut = getIndicePairFtorP1(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,\n          padding32, dilation32, outSpatialShape32, transpose);\n      if (numActOut > 0) {\n        auto res = torch::_unique(indicePairUnique);\n        indicePairUnique = std::get<0>(res);\n        numActOut = getIndicePairFtorP2(\n            tv::TorchGPU(), tv::torch2tv<const int>(indices),\n            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);\n      }\n    }\n    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};\n  }\n}\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(\n    torch::Tensor indices, torch::Tensor 
gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  at::cuda::CUDAGuard device_guard(indices.device());\n  bool subM = _subM != 0;\n  bool transpose = _transpose != 0;\n  auto numAct = indices.size(0);\n  auto coorDim = indices.size(1) - 1;\n  TV_ASSERT_RT_ERR(NDim == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(stride.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(padding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(dilation.size() == coorDim, \"error\");\n  auto kernelVolume = kernelSize[0];\n  for (int i = 1; i < kernelSize.size(); ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  TV_ASSERT_RT_ERR(kernelVolume <= 4096, \"error\");\n  auto outputVolume = outSpatialShape[0];\n  for (int i = 1; i < outSpatialShape.size(); ++i) {\n    outputVolume *= outSpatialShape[i];\n  }\n  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, \"error\");\n  torch::Tensor indicePairs =\n      torch::full({kernelVolume, 2, numAct}, -1,\n                  torch::dtype(torch::kInt32).device(indices.device()));\n  torch::Tensor indiceNum = torch::zeros(\n      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));\n  int64_t numActOut = -1;\n  tv::SimpleVector<int, NDim> outSpatialShape32;\n  tv::SimpleVector<int, NDim> kernelSize32;\n  tv::SimpleVector<int, NDim> stride32;\n  tv::SimpleVector<int, NDim> padding32;\n  tv::SimpleVector<int, NDim> dilation32;\n  auto indicePairUnique = torch::full(\n      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),\n      
torch::dtype(torch::kInt32).device(indices.device()));\n  for (int i = 0; i < NDim; ++i) {\n    outSpatialShape32.push_back(outSpatialShape[i]);\n    kernelSize32.push_back(kernelSize[i]);\n    if (subM) {\n      stride32.push_back(1);\n      padding32.push_back(kernelSize[i] / 2);\n      dilation32.push_back(dilation[i]);\n    } else {\n      stride32.push_back(stride[i]);\n      padding32.push_back(padding[i]);\n      dilation32.push_back(dilation[i]);\n    }\n  }\n  if (subM) {\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose);\n      gridOut.fill_(-1);\n    } else {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose, true);\n    }\n    return {indices, indicePairs, indiceNum};\n  } else {\n    torch::Tensor outInds =\n        torch::zeros({numAct * kernelVolume, coorDim + 1},\n                     torch::dtype(torch::kInt32).device(indices.device()));\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), 
tv::torch2tv<int>(indiceNum),\n          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,\n          transpose, true);\n      gridOut.fill_(-1);\n    } else {\n      auto getIndicePairFtorP1 =\n          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,\n                                                 NDim>();\n      auto getIndicePairFtorP2 =\n          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,\n                                                 NDim>();\n      numActOut = getIndicePairFtorP1(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,\n          padding32, dilation32, outSpatialShape32, transpose);\n      if (numActOut > 0) {\n        auto res = torch::_unique(indicePairUnique);\n        indicePairUnique = std::get<0>(res);\n        numActOut = getIndicePairFtorP2(\n            tv::TorchGPU(), tv::torch2tv<const int>(indices),\n            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,\n            true);\n      }\n    }\n    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};\n  }\n}\n\ntorch::Tensor IndiceConvForwardCUDAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,\n    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,\n    int64_t _subM) {\n  at::cuda::CUDAGuard device_guard(features.device());\n  bool subM = _subM != 0;\n  bool inverse = _inverse != 0;\n  auto device = features.device().type();\n  auto ndim = filters.dim() - 2;\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto numOutPlanes = filters.size(ndim 
+ 1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto indicePairMaxSizeIter =\n      std::max_element(indicePairNumCpu.data_ptr<int>(),\n                       indicePairNumCpu.data_ptr<int>() + kernelVolume);\n  int indicePairMaxOffset =\n      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();\n  int indicePairMaxSize = *indicePairMaxSizeIter;\n\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n\n  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);\n  torch::Tensor inputBuffer =\n      torch::zeros({indicePairMaxSize, numInPlanes}, options);\n  torch::Tensor outputBuffer =\n      torch::zeros({indicePairMaxSize, numOutPlanes}, options);\n  filters = filters.view({-1, numInPlanes, numOutPlanes});\n  if (subM) {\n    torch::mm_out(output, features, filters[indicePairMaxOffset]);\n  }\n  double totalGatherTime = 0;\n  double totalGEMMTime = 0;\n  double totalSAddTime = 0;\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {\n      continue;\n    }\n\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        features.scalar_type(), \"IndiceConvForwardKernel\", [&] {\n          auto outputBufferBlob = torch::from_blob(\n              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);\n          auto inputBufferBlob = torch::from_blob(\n              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);\n\n          if (device == torch::kCPU) {\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;\n            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n          } else {\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, 
int>\n                gatherFtor;\n            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            TV_CHECK_CUDA_ERR();\n            /* slower than SparseGatherFunctor, may due to int->long conversion\n            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);\n            auto indicePairBlob =\n            torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},\n            indicePairOptions); torch::index_select_out(inputBufferBlob,\n            features, 0, indicePairBlob);*/\n          }\n          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);\n\n          if (device == torch::kCPU) {\n            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::CPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n          } else {\n            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n            TV_CHECK_CUDA_ERR();\n          }\n        });\n  }\n  return output;\n}\n\nstd::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  at::cuda::CUDAGuard device_guard(features.device());\n  bool subM = _subM != 0;\n  bool inverse = 
_inverse != 0;\n\n  auto device = features.device().type();\n  auto ndim = filters.dim() - 2;\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto numOutPlanes = filters.size(ndim + 1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto indicePairMaxSizeIter =\n      std::max_element(indicePairNumCpu.data_ptr<int>(),\n                       indicePairNumCpu.data_ptr<int>() + kernelVolume);\n  int indicePairMaxOffset =\n      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();\n  int indicePairMaxSize = *indicePairMaxSizeIter;\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n  auto filterShape = filters.sizes();\n  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);\n  torch::Tensor filtersGrad = torch::zeros(filterShape, options);\n  torch::Tensor inputBuffer =\n      torch::zeros({indicePairMaxSize, numInPlanes}, options);\n  torch::Tensor outputBuffer =\n      torch::zeros({indicePairMaxSize, numOutPlanes}, options);\n\n  filters = filters.view({-1, numInPlanes, numOutPlanes});\n  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});\n  if (subM) {\n    auto filterGradSub = filtersGrad[indicePairMaxOffset];\n    torch::mm_out(filterGradSub, features.t(), outGrad);\n    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());\n  }\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {\n      continue;\n    }\n\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        features.scalar_type(), \"IndiceConvBackwardKernel\", [&] {\n          if (device == torch::kCPU) {\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;\n            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       
tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            gatherFtorOut(\n                tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),\n                tv::torch2tv<const scalar_t>(outGrad),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),\n                nHot);\n          } else {\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n                gatherFtor;\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n                gatherFtorOut;\n            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            TV_CHECK_CUDA_ERR();\n            gatherFtorOut(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),\n                tv::torch2tv<const scalar_t>(outGrad),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),\n                nHot);\n            TV_CHECK_CUDA_ERR();\n          }\n          auto filterGradSub = filtersGrad[i];\n          auto outputBufferBlob = torch::from_blob(\n              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);\n          auto inputBufferBlob = torch::from_blob(\n              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);\n\n          torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);\n          torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());\n          if (device == torch::kCPU) {\n            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),\n                tv::torch2tv<const scalar_t>(inputBuffer),\n                
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);\n          } else {\n            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),\n                tv::torch2tv<const scalar_t>(inputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);\n            TV_CHECK_CUDA_ERR();\n          }\n        });\n  }\n  return {inputGrad, filtersGrad.view(filterShape)};\n}\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<2>(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, 
std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<3>(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/stack_ball_query_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"stack_ball_query_cuda_kernel.cuh\"\n#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))\n\nvoid StackBallQueryForwardCUDAKernelLauncher(float max_radius, int nsample,\n                                             const Tensor new_xyz,\n                                             const Tensor new_xyz_batch_cnt,\n                                             const Tensor xyz,\n                                             const Tensor xyz_batch_cnt,\n                                             Tensor idx) {\n  at::cuda::CUDAGuard device_guard(new_xyz.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  //   const float *new_xyz_ptr = new_xyz.data_ptr<float>();\n  //   const float *xyz_ptr = xyz.data_ptr<float>();\n  //   const int *new_xyz_batch_cnt_ptr = new_xyz_batch_cnt.data_ptr<int>();\n  //   const int *xyz_batch_cnt_ptr = xyz_batch_cnt.data_ptr<int>();\n  //   int *idx_ptr = idx.data_ptr<int>();\n\n  int B = xyz_batch_cnt.size(0);\n  int M = new_xyz.size(0);\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      new_xyz.scalar_type(), \"stack_ball_query_forward_cuda_kernel\", [&] {\n        stack_ball_query_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                B, M, max_radius, nsample, new_xyz.data_ptr<scalar_t>(),\n                new_xyz_batch_cnt.data_ptr<int>(), xyz.data_ptr<scalar_t>(),\n                xyz_batch_cnt.data_ptr<int>(), idx.data_ptr<int>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/stack_group_points_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"stack_group_points_cuda_kernel.cuh\"\n\nvoid StackGroupPointsForwardCUDAKernelLauncher(\n    int b, int c, int m, int nsample, const Tensor features_tensor,\n    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,\n    const Tensor idx_batch_cnt_tensor, Tensor out_tensor) {\n  // points: (B, C, N)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      out: (B, C, npoints, nsample)\n  at::cuda::CUDAGuard device_guard(features_tensor.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features_tensor.scalar_type(), \"stack_group_points_forward_cuda_kernel\",\n      [&] {\n        stack_group_points_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, m, nsample, features_tensor.data_ptr<scalar_t>(),\n                features_batch_cnt_tensor.data_ptr<int>(),\n                idx_tensor.data_ptr<int>(),\n                idx_batch_cnt_tensor.data_ptr<int>(),\n                out_tensor.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid StackGroupPointsBackwardCUDAKernelLauncher(\n    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,\n    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,\n    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor) {\n  at::cuda::CUDAGuard device_guard(grad_features_tensor.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  
AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_features_tensor.scalar_type(),\n      \"stack_group_points_backward_cuda_kernel\", [&] {\n        stack_group_points_backward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, m, n, nsample, grad_out_tensor.data_ptr<scalar_t>(),\n                idx_tensor.data_ptr<int>(),\n                idx_batch_cnt_tensor.data_ptr<int>(),\n                features_batch_cnt_tensor.data_ptr<int>(),\n                grad_features_tensor.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"sync_bn_cuda_kernel.cuh\"\n\nvoid SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean) {\n  int num = input.size(0);\n  int channels = input.size(1);\n  int spatial = input.size(2);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sync_bn_forward_mean_cuda_kernel\", [&] {\n        sync_bn_forward_mean_cuda_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                input.data_ptr<scalar_t>(), mean.data_ptr<float>(), num,\n                channels, spatial);\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,\n                                        Tensor var) {\n  int num = input.size(0);\n  int channels = input.size(1);\n  int spatial = input.size(2);\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sync_bn_forward_mean_cuda_kernel\", [&] {\n        sync_bn_forward_var_cuda_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),\n                var.data_ptr<float>(), num, channels, spatial);\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SyncBNForwardOutputCUDAKernelLauncher(\n    const Tensor input, const Tensor mean, const Tensor var,\n    Tensor running_mean, Tensor running_var, const Tensor weight,\n    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,\n    float momentum, int group_size) {\n  int num = input.size(0);\n  int channels = input.size(1);\n  int spatial = input.size(2);\n\n  at::cuda::CUDAGuard 
device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sync_bn_forward_mean_cuda_kernel\", [&] {\n        sync_bn_forward_output_cuda_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),\n                var.data_ptr<float>(), running_mean.data_ptr<float>(),\n                running_var.data_ptr<float>(), weight.data_ptr<float>(),\n                bias.data_ptr<float>(), norm.data_ptr<float>(),\n                std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,\n                channels, spatial, eps, momentum, group_size);\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,\n                                           const Tensor norm,\n                                           Tensor grad_weight,\n                                           Tensor grad_bias) {\n  int num = grad_output.size(0);\n  int channels = grad_output.size(1);\n  int spatial = grad_output.size(2);\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"sync_bn_backward_param_cuda_kernel\", [&] {\n        sync_bn_backward_param_cuda_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),\n                grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,\n                channels, spatial);\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,\n                                          const Tensor weight,\n                                          const Tensor grad_weight,\n                                      
    const Tensor grad_bias,\n                                          const Tensor norm, const Tensor std,\n                                          Tensor grad_input) {\n  int output_size = grad_input.numel();\n  int num = grad_input.size(0);\n  int channels = grad_input.size(1);\n  int spatial = grad_input.size(2);\n\n  at::cuda::CUDAGuard device_guard(grad_input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"sync_bn_backward_data_cuda_kernel\", [&] {\n        sync_bn_backward_data_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                weight.data_ptr<float>(), grad_weight.data_ptr<float>(),\n                grad_bias.data_ptr<float>(), norm.data_ptr<float>(),\n                std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,\n                channels, spatial);\n      });\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"three_interpolate_cuda_kernel.cuh\"\n\nvoid ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,\n                                               const Tensor points,\n                                               const Tensor idx,\n                                               const Tensor weight,\n                                               Tensor out) {\n  // points: (B, C, M)\n  // idx: (B, N, 3)\n  // weight: (B, N, 3)\n  // output:\n  //      out: (B, C, N)\n\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"three_interpolate_forward_cuda_kernel\", [&] {\n        three_interpolate_forward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, m, n, points.data_ptr<scalar_t>(), idx.data_ptr<int>(),\n                weight.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,\n                                                const Tensor grad_out,\n                                                const Tensor idx,\n                                                const Tensor weight,\n                                                Tensor grad_points) {\n  // grad_out: (B, C, N)\n  // weight: (B, N, 3)\n  // output:\n  //      grad_points: (B, C, M)\n\n  at::cuda::CUDAGuard device_guard(grad_out.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // 
blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"three_interpolate_backward_cuda_kernel\", [&] {\n        three_interpolate_backward_cuda_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, m, grad_out.data_ptr<scalar_t>(), idx.data_ptr<int>(),\n                weight.data_ptr<scalar_t>(), grad_points.data_ptr<scalar_t>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"three_nn_cuda_kernel.cuh\"\n\nvoid ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,\n                                      const Tensor known, Tensor dist2,\n                                      Tensor idx) {\n  // unknown: (B, N, 3)\n  // known: (B, M, 3)\n  // output:\n  //      dist2: (B, N, 3)\n  //      idx: (B, N, 3)\n\n  at::cuda::CUDAGuard device_guard(unknown.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      unknown.scalar_type(), \"three_nn_forward_cuda_kernel\", [&] {\n        three_nn_forward_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            b, n, m, unknown.data_ptr<scalar_t>(), known.data_ptr<scalar_t>(),\n            dist2.data_ptr<scalar_t>(), idx.data_ptr<int>());\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cuda_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#include \"tin_shift_cuda_kernel.cuh\"\n\nvoid TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift,\n                                       Tensor output) {\n  int output_size = output.numel();\n  int batch_size = input.size(0);\n  int t_size = input.size(1);\n  int channels = input.size(2);\n  int hw_size = input.size(3);\n  int group_size = shift.size(1);\n  int group_channel = channels / group_size;\n  int num_kernels = batch_size * hw_size * channels;\n\n  at::cuda::CUDAGuard device_guard(input.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"tin_shift_forward_cuda_kernel\", [&] {\n        tin_shift_forward_cuda_kernel<scalar_t>\n            <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(), shift.data_ptr<int>(),\n                output.data_ptr<scalar_t>(), batch_size, channels, t_size,\n                hw_size, group_size, group_channel);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n\nvoid TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift,\n                                        Tensor grad_input) {\n  int output_size = grad_output.numel();\n  int batch_size = grad_output.size(0);\n  int t_size = grad_output.size(1);\n  int channels = grad_output.size(2);\n  int hw_size = grad_output.size(3);\n  int group_size = shift.size(1);\n  int group_channel = channels / group_size;\n  int num_kernels = batch_size * hw_size * channels;\n\n  at::cuda::CUDAGuard device_guard(grad_output.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"tin_shift_backward_cuda_kernel\", [&] {\n        tin_shift_backward_cuda_kernel<scalar_t>\n            
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                shift.data_ptr<int>(), grad_input.data_ptr<scalar_t>(),\n                batch_size, channels, t_size, hw_size, group_size,\n                group_channel);\n      });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu",
    "content": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n#include <c10/util/Half.h>\n#include <torch/types.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n\nstruct upfirdn2d_kernel_params {\n  const void *x;\n  const float *f;\n  void *y;\n\n  int2 up;\n  int2 down;\n  int2 pad0;\n  int flip;\n  float gain;\n\n  int4 inSize;  // [width, height, channel, batch]\n  int4 inStride;\n  int2 filterSize;  // [width, height]\n  int2 filterStride;\n  int4 outSize;  // [width, height, channel, batch]\n  int4 outStride;\n  int sizeMinor;\n  int sizeMajor;\n\n  int loopMinor;\n  int loopMajor;\n  int loopX;\n  int launchMinor;\n  int launchMajor;\n};\n\n//------------------------------------------------------------------------\n// CUDA kernel specialization.\n\nstruct upfirdn2d_kernel_spec {\n  void *kernel;\n  int tileOutW;\n  int tileOutH;\n  int loopMinor;\n  int loopX;\n};\n\n//------------------------------------------------------------------------\n// CUDA kernel selection.\n\ntemplate <class T>\nupfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params &p);\n//------------------------------------------------------------------------\n\n// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  
Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n//------------------------------------------------------------------------\n// Helpers.\n\ntemplate <class T>\nstruct InternalType;\ntemplate <>\nstruct InternalType<double> {\n  typedef double scalar_t;\n};\ntemplate <>\nstruct InternalType<float> {\n  typedef float scalar_t;\n};\ntemplate <>\nstruct InternalType<c10::Half> {\n  typedef float scalar_t;\n};\n\nstatic __device__ __forceinline__ int floor_div(int a, int b) {\n  int t = 1 - a / b;\n  return (a + t * b) / b - t;\n}\n\n//------------------------------------------------------------------------\n// Generic CUDA implementation for large filters.\n\ntemplate <class T>\nstatic __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n\n  // Calculate thread index.\n  int minorBase = blockIdx.x * blockDim.x + threadIdx.x;\n  int outY = minorBase / p.launchMinor;\n  minorBase -= outY * p.launchMinor;\n  int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y;\n  int majorBase = blockIdx.z * p.loopMajor;\n  if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor)\n    return;\n\n  // Setup Y receptive field.\n  int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y;\n  int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y);\n  int h =\n      min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY;\n  int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y;\n  if (p.flip) filterY = p.filterSize.y - 1 - filterY;\n\n  // Loop over major, minor, and X.\n  for (int majorIdx = 0, major = majorBase;\n       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++)\n    for (int minorIdx = 0, minor = minorBase;\n         minorIdx < p.loopMinor & minor < p.sizeMinor;\n         minorIdx++, minor += p.launchMinor) {\n 
     int nc = major * p.sizeMinor + minor;\n      int n = nc / p.inSize.z;\n      int c = nc - n * p.inSize.z;\n      for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x;\n           loopX++, outX += blockDim.y) {\n        // Setup X receptive field.\n        int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x;\n        int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x);\n        int w =\n            min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) -\n            inX;\n        int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x;\n        if (p.flip) filterX = p.filterSize.x - 1 - filterX;\n\n        // Initialize pointers.\n        const T *xp =\n            &((const T *)p.x)[inX * p.inStride.x + inY * p.inStride.y +\n                              c * p.inStride.z + n * p.inStride.w];\n        const float *fp =\n            &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y];\n        int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x;\n        int filterStepY = ((p.flip) ? 
p.up.y : -p.up.y) * p.filterStride.y;\n\n        // Inner loop.\n        scalar_t v = 0;\n        for (int y = 0; y < h; y++) {\n          for (int x = 0; x < w; x++) {\n            v += (scalar_t)(*xp) * (scalar_t)(*fp);\n            xp += p.inStride.x;\n            fp += filterStepX;\n          }\n          xp += p.inStride.y - w * p.inStride.x;\n          fp += filterStepY - w * filterStepX;\n        }\n\n        // Store result.\n        v *= p.gain;\n        ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +\n                   c * p.outStride.z + n * p.outStride.w] = (T)v;\n      }\n    }\n}\n\n//------------------------------------------------------------------------\n// Specialized CUDA implementation for small filters.\n\ntemplate <class T, int upx, int upy, int downx, int downy, int filterW,\n          int filterH, int tileOutW, int tileOutH, int loopMinor>\nstatic __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n  const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1;\n  const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1;\n  __shared__ volatile scalar_t sf[filterH][filterW];\n  __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor];\n\n  // Calculate tile index.\n  int minorBase = blockIdx.x;\n  int tileOutY = minorBase / p.launchMinor;\n  minorBase -= tileOutY * p.launchMinor;\n  minorBase *= loopMinor;\n  tileOutY *= tileOutH;\n  int tileOutXBase = blockIdx.y * p.loopX * tileOutW;\n  int majorBase = blockIdx.z * p.loopMajor;\n  if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y |\n      majorBase >= p.sizeMajor)\n    return;\n\n  // Load filter (flipped).\n  for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW;\n       tapIdx += blockDim.x) {\n    int fy = tapIdx / filterW;\n    int fx = tapIdx - fy * filterW;\n    scalar_t v = 0;\n    if (fx < p.filterSize.x & fy < p.filterSize.y) {\n      int ffx = (p.flip) ? 
fx : p.filterSize.x - 1 - fx;\n      int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy;\n      v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y];\n    }\n    sf[fy][fx] = v;\n  }\n\n  // Loop over major and X.\n  for (int majorIdx = 0, major = majorBase;\n       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) {\n    int baseNC = major * p.sizeMinor + minorBase;\n    int n = baseNC / p.inSize.z;\n    int baseC = baseNC - n * p.inSize.z;\n    for (int loopX = 0, tileOutX = tileOutXBase;\n         loopX < p.loopX & tileOutX < p.outSize.x;\n         loopX++, tileOutX += tileOutW) {\n      // Load input pixels.\n      int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x;\n      int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y;\n      int tileInX = floor_div(tileMidX, upx);\n      int tileInY = floor_div(tileMidY, upy);\n      __syncthreads();\n      for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor;\n           inIdx += blockDim.x) {\n        int relC = inIdx;\n        int relInX = relC / loopMinor;\n        int relInY = relInX / tileInW;\n        relC -= relInX * loopMinor;\n        relInX -= relInY * tileInW;\n        int c = baseC + relC;\n        int inX = tileInX + relInX;\n        int inY = tileInY + relInY;\n        scalar_t v = 0;\n        if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y &\n            c < p.inSize.z)\n          v = (scalar_t)((const T *)\n                             p.x)[inX * p.inStride.x + inY * p.inStride.y +\n                                  c * p.inStride.z + n * p.inStride.w];\n        sx[relInY][relInX][relC] = v;\n      }\n\n      // Loop over output pixels.\n      __syncthreads();\n      for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor;\n           outIdx += blockDim.x) {\n        int relC = outIdx;\n        int relOutX = relC / loopMinor;\n        int relOutY = relOutX / tileOutW;\n        relC -= relOutX * loopMinor;\n        relOutX -= 
relOutY * tileOutW;\n        int c = baseC + relC;\n        int outX = tileOutX + relOutX;\n        int outY = tileOutY + relOutY;\n\n        // Setup receptive field.\n        int midX = tileMidX + relOutX * downx;\n        int midY = tileMidY + relOutY * downy;\n        int inX = floor_div(midX, upx);\n        int inY = floor_div(midY, upy);\n        int relInX = inX - tileInX;\n        int relInY = inY - tileInY;\n        int filterX = (inX + 1) * upx - midX - 1;  // flipped\n        int filterY = (inY + 1) * upy - midY - 1;  // flipped\n\n        // Inner loop.\n        if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z) {\n          scalar_t v = 0;\n#pragma unroll\n          for (int y = 0; y < filterH / upy; y++)\n#pragma unroll\n            for (int x = 0; x < filterW / upx; x++)\n              v += sx[relInY + y][relInX + x][relC] *\n                   sf[filterY + y * upy][filterX + x * upx];\n          v *= p.gain;\n          ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +\n                     c * p.outStride.z + n * p.outStride.w] = (T)v;\n        }\n      }\n    }\n  }\n}\n\n//------------------------------------------------------------------------\n// CUDA kernel selection.\n\ntemplate <class T>\nupfirdn2d_kernel_spec choose_upfirdn2d_kernel(\n    const upfirdn2d_kernel_params &p) {\n  int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y;\n  upfirdn2d_kernel_spec spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 1,\n                                4};  // contiguous\n  if (s == 1)\n    spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 4, 1};  // channels_last\n\n  // No up/downsampling.\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 
1, 1, 16, 16, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 7 && fy <= 7)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 5 && fy <= 5)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 3 && fy <= 3)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 32, 32, 1>,\n              32, 32, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 32, 32, 1>,\n           
   32, 32, 1, 1};\n    if (s == 1 && fx <= 7 && fy <= 7)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 5 && fy <= 5)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 3 && fy <= 3)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 1, 128, 16>,\n              1, 128, 16, 1};\n  }\n\n  // 2x upsampling.\n  if (p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 16)\n      spec = {(void 
*)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 64, 16, 1>,\n              64, 16, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 16, 16, 8>,\n              16, 16, 8, 1};\n  }\n  if (p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    
if (s != 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 32, 32, 1>,\n              32, 32, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 1, 128, 16>,\n              1, 128, 16, 1};\n  }\n\n  // 2x downsampling.\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 
1, 1, 2, 2, 16, 16, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 32, 8, 1>, 32,\n              8, 1, 1};\n    if (s != 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 32, 8, 1>, 32,\n              8, 1, 1};\n    if (s != 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 32, 8, 1>, 32,\n              8, 1, 1};\n    if (s != 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 32, 8, 1>, 32,\n              8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 16, 16, 1>,\n              16, 16, 1, 1};\n    if (s == 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 16, 16, 16, 16, 1>,\n              16, 16, 1, 1};\n    if (s == 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 8, 8, 8>, 8,\n              8, 8, 1};\n    if (s == 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 8, 8, 8>, 8,\n              8, 8, 1};\n    if (s == 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 8, 8, 8>, 8,\n              8, 8, 1};\n    if (s == 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 8, 8, 8>, 8,\n              8, 8, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 8, 1>,\n              64, 8, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 8, 1>,\n              64, 8, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 1)\n      spec = {(void 
*)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 8, 1>, 64,\n              8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 1, 8>,\n              64, 1, 8, 1};\n    if (s == 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 1, 8>,\n              64, 1, 8, 1};\n    if (s == 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 1, 8>, 64,\n              1, 8, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 32, 16, 1>,\n              32, 16, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 1, 64, 8>, 1,\n              64, 8, 1};\n    if (s == 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 1, 64, 8>, 1,\n              64, 8, 1};\n    if (s == 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 1, 64, 8>, 1,\n              64, 8, 1};\n  }\n\n  // 4x upsampling.\n  if (p.up.x == 4 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 48 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 32 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 64, 32, 1>,\n              64, 32, 1, 1};\n    // 
channels_last\n    if (s == 1 && fx <= 48 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 32 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 32, 32, 1>,\n              32, 32, 1, 1};\n  }\n  if (p.up.x == 4 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 32, 32, 1>,\n              32, 32, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 1, 128, 16>,\n              1, 128, 16, 1};\n  }\n\n  // 4x downsampling (inefficient).\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 4 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 
1, 32, 8, 1>,\n              32, 8, 1, 1};\n    if (s != 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 8, 1>,\n              32, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 1, 32, 1, 8>,\n              32, 1, 8, 1};\n    if (s == 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 1, 8>,\n              32, 1, 8, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 4) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 32, 8, 1>,\n              32, 8, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 32, 8, 1>,\n              32, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 1, 32, 8>, 1,\n              32, 8, 1};\n    if (s == 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 1, 32, 8>, 1,\n              32, 8, 1};\n  }\n  return spec;\n}\n\n//------------------------------------------------------------------------\n// Template specializations.\n\ntemplate upfirdn2d_kernel_spec choose_upfirdn2d_kernel<double>(\n    const upfirdn2d_kernel_params &p);\ntemplate upfirdn2d_kernel_spec choose_upfirdn2d_kernel<float>(\n    const upfirdn2d_kernel_params &p);\ntemplate upfirdn2d_kernel_spec choose_upfirdn2d_kernel<c10::Half>(\n    const upfirdn2d_kernel_params &p);\n\n//------------------------------------------------------------------------\n\n//------------------------------------------------------------------------\n\ntorch::Tensor upfirdn2d_op(torch::Tensor x, torch::Tensor f, int upx, int upy,\n                           int downx, int downy, int padx0, int padx1,\n                    
       int pady0, int pady1, bool flip, float gain) {\n  // Validate arguments.\n  TORCH_CHECK(x.is_cuda(), \"x must reside on CUDA device\");\n  TORCH_CHECK(f.device() == x.device(),\n              \"f must reside on the same device as x\");\n  TORCH_CHECK(f.dtype() == torch::kFloat, \"f must be float32\");\n  TORCH_CHECK(x.numel() <= INT_MAX, \"x is too large\");\n  TORCH_CHECK(f.numel() <= INT_MAX, \"f is too large\");\n  TORCH_CHECK(x.numel() > 0, \"x has zero size\");\n  TORCH_CHECK(f.numel() > 0, \"f has zero size\");\n  TORCH_CHECK(x.dim() == 4, \"x must be rank 4\");\n  TORCH_CHECK(f.dim() == 2, \"f must be rank 2\");\n  TORCH_CHECK((x.size(0) - 1) * x.stride(0) + (x.size(1) - 1) * x.stride(1) +\n                      (x.size(2) - 1) * x.stride(2) +\n                      (x.size(3) - 1) * x.stride(3) <=\n                  INT_MAX,\n              \"x memory footprint is too large\");\n  TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, \"f must be at least 1x1\");\n  TORCH_CHECK(upx >= 1 && upy >= 1, \"upsampling factor must be at least 1\");\n  TORCH_CHECK(downx >= 1 && downy >= 1,\n              \"downsampling factor must be at least 1\");\n\n  // Create output tensor.\n  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));\n  int outW =\n      ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx;\n  int outH =\n      ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy;\n  TORCH_CHECK(outW >= 1 && outH >= 1, \"output must be at least 1x1\");\n  torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW},\n                                 x.options(), x.suggest_memory_format());\n  TORCH_CHECK(y.numel() <= INT_MAX, \"output is too large\");\n  TORCH_CHECK((y.size(0) - 1) * y.stride(0) + (y.size(1) - 1) * y.stride(1) +\n                      (y.size(2) - 1) * y.stride(2) +\n                      (y.size(3) - 1) * y.stride(3) <=\n                  INT_MAX,\n              \"output memory footprint is too 
large\");\n\n  // Initialize CUDA kernel parameters.\n  upfirdn2d_kernel_params p;\n  p.x = x.data_ptr();\n  p.f = f.data_ptr<float>();\n  p.y = y.data_ptr();\n  p.up = make_int2(upx, upy);\n  p.down = make_int2(downx, downy);\n  p.pad0 = make_int2(padx0, pady0);\n  p.flip = (flip) ? 1 : 0;\n  p.gain = gain;\n  p.inSize =\n      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));\n  p.inStride = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1),\n                         (int)x.stride(0));\n  p.filterSize = make_int2((int)f.size(1), (int)f.size(0));\n  p.filterStride = make_int2((int)f.stride(1), (int)f.stride(0));\n  p.outSize =\n      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));\n  p.outStride = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1),\n                          (int)y.stride(0));\n  p.sizeMajor = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z;\n  p.sizeMinor = (p.inStride.z == 1) ? p.inSize.z : 1;\n\n  // Choose CUDA kernel.\n  upfirdn2d_kernel_spec spec;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), \"upfirdn2d_cuda\", [&] {\n    spec = choose_upfirdn2d_kernel<scalar_t>(p);\n  });\n\n  // Set looping options.\n  p.loopMajor = (p.sizeMajor - 1) / 16384 + 1;\n  p.loopMinor = spec.loopMinor;\n  p.loopX = spec.loopX;\n  p.launchMinor = (p.sizeMinor - 1) / p.loopMinor + 1;\n  p.launchMajor = (p.sizeMajor - 1) / p.loopMajor + 1;\n\n  // Compute grid size.\n  dim3 blockSize, gridSize;\n  if (spec.tileOutW < 0)  // large\n  {\n    blockSize = dim3(4, 32, 1);\n    gridSize =\n        dim3(((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor,\n             (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1, p.launchMajor);\n  } else  // small\n  {\n    blockSize = dim3(256, 1, 1);\n    gridSize =\n        dim3(((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor,\n             (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1, p.launchMajor);\n  }\n\n  // 
Launch CUDA kernel.\n  void *args[] = {&p};\n#ifdef MMCV_WITH_HIP\n  AT_CUDA_CHECK(hipLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,\n                                at::cuda::getCurrentCUDAStream()));\n#else\n  AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,\n                                 at::cuda::getCurrentCUDAStream()));\n#endif\n\n  return y;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#include \"voxelization_cuda_kernel.cuh\"\n\nint HardVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  // current version tooks about 0.04s for one frame on cpu\n  // check device\n\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  const float voxel_x = voxel_size[0];\n  const float voxel_y = voxel_size[1];\n  const float voxel_z = voxel_size[2];\n  const float coors_x_min = coors_range[0];\n  const float coors_y_min = coors_range[1];\n  const float coors_z_min = coors_range[2];\n  const float coors_x_max = coors_range[3];\n  const float coors_y_max = coors_range[4];\n  const float coors_z_max = coors_range[5];\n\n  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);\n  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);\n  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);\n\n  // map points to voxel coors\n  at::Tensor temp_coors =\n      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));\n\n  dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));\n  dim3 block(512);\n\n  // 1. 
link point to corresponding voxel coors\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"hard_voxelize_kernel\", ([&] {\n        dynamic_voxelize_kernel<scalar_t, int><<<grid, block, 0, stream>>>(\n            points.contiguous().data_ptr<scalar_t>(),\n            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,\n            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,\n            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,\n            NDim);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  // 2. map point to the idx of the corresponding voxel, find duplicate coor\n  // create some temporary variables\n  auto point_to_pointidx = -at::ones(\n      {\n          num_points,\n      },\n      points.options().dtype(at::kInt));\n  auto point_to_voxelidx = -at::ones(\n      {\n          num_points,\n      },\n      points.options().dtype(at::kInt));\n\n  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));\n  dim3 map_block(512);\n\n  AT_DISPATCH_ALL_TYPES(\n      temp_coors.scalar_type(), \"determin_duplicate\", ([&] {\n        point_to_voxelidx_kernel<int><<<map_grid, map_block, 0, stream>>>(\n            temp_coors.contiguous().data_ptr<int>(),\n            point_to_voxelidx.contiguous().data_ptr<int>(),\n            point_to_pointidx.contiguous().data_ptr<int>(), max_points,\n            max_voxels, num_points, NDim);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  // 3. 
determine voxel num and voxel's coor index\n  // running this logic on the CUDA device is about 10 times faster\n  auto coor_to_voxelidx = -at::ones(\n      {\n          num_points,\n      },\n      points.options().dtype(at::kInt));\n  auto voxel_num = at::zeros(\n      {\n          1,\n      },\n      points.options().dtype(at::kInt));  // must be zero from the beginning\n\n  AT_DISPATCH_ALL_TYPES(temp_coors.scalar_type(), \"determin_duplicate\", ([&] {\n                          determin_voxel_num<int><<<1, 1, 0, stream>>>(\n                              num_points_per_voxel.contiguous().data_ptr<int>(),\n                              point_to_voxelidx.contiguous().data_ptr<int>(),\n                              point_to_pointidx.contiguous().data_ptr<int>(),\n                              coor_to_voxelidx.contiguous().data_ptr<int>(),\n                              voxel_num.contiguous().data_ptr<int>(),\n                              max_points, max_voxels, num_points);\n                        }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  // 4. copy point features to voxels\n  // Steps 4 and 5 could run in parallel\n  auto pts_output_size = num_points * num_features;\n  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));\n  dim3 cp_block(512);\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"assign_point_to_voxel\", ([&] {\n        assign_point_to_voxel<float, int><<<cp_grid, cp_block, 0, stream>>>(\n            pts_output_size, points.contiguous().data_ptr<float>(),\n            point_to_voxelidx.contiguous().data_ptr<int>(),\n            coor_to_voxelidx.contiguous().data_ptr<int>(),\n            voxels.contiguous().data_ptr<float>(), max_points, num_features,\n            num_points, NDim);\n      }));\n  //   cudaDeviceSynchronize();\n  //   AT_CUDA_CHECK(cudaGetLastError());\n\n  // 5. 
copy coors of each voxels\n  auto coors_output_size = num_points * NDim;\n  dim3 coors_cp_grid(\n      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));\n  dim3 coors_cp_block(512);\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"assign_point_to_voxel\", ([&] {\n        assign_voxel_coors<float, int>\n            <<<coors_cp_grid, coors_cp_block, 0, stream>>>(\n                coors_output_size, temp_coors.contiguous().data_ptr<int>(),\n                point_to_voxelidx.contiguous().data_ptr<int>(),\n                coor_to_voxelidx.contiguous().data_ptr<int>(),\n                coors.contiguous().data_ptr<int>(), num_points, NDim);\n      }));\n\n  AT_CUDA_CHECK(cudaGetLastError());\n\n  auto voxel_num_cpu = voxel_num.to(at::kCPU);\n  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];\n\n  return voxel_num_int;\n}\n\nint NondeterministicHardVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  if (num_points == 0) return 0;\n\n  dim3 blocks(\n      std::min(at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  const float voxel_x = voxel_size[0];\n  const float voxel_y = voxel_size[1];\n  const float voxel_z = voxel_size[2];\n  const float coors_x_min = coors_range[0];\n  const float coors_y_min = coors_range[1];\n  const float coors_z_min = coors_range[2];\n  const float coors_x_max = coors_range[3];\n  const float coors_y_max = coors_range[4];\n  const float coors_z_max = coors_range[5];\n\n  const int grid_x = round((coors_x_max - coors_x_min) / 
voxel_x);\n  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);\n  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);\n\n  // map points to voxel coors\n  at::Tensor temp_coors =\n      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));\n\n  // 1. link point to corresponding voxel coors\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"hard_voxelize_kernel\", ([&] {\n        dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(\n            points.contiguous().data_ptr<scalar_t>(),\n            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,\n            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,\n            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,\n            NDim);\n      }));\n\n  at::Tensor coors_map;\n  at::Tensor reduce_count;\n\n  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);\n\n  std::tie(temp_coors, coors_map, reduce_count) =\n      at::unique_dim(coors_clean, 0, true, true, false);\n\n  if (temp_coors[0][0].lt(0).item<bool>()) {\n    // the first element of temp_coors is (-1,-1,-1) and should be removed\n    temp_coors = temp_coors.slice(0, 1);\n    coors_map = coors_map - 1;\n  }\n\n  int num_coors = temp_coors.size(0);\n  temp_coors = temp_coors.to(at::kInt);\n  coors_map = coors_map.to(at::kInt);\n\n  at::Tensor coors_count = at::zeros({1}, coors_map.options());\n  at::Tensor coors_order = at::empty({num_coors}, coors_map.options());\n  at::Tensor pts_id = at::zeros({num_points}, coors_map.options());\n  reduce_count = at::zeros({num_coors}, coors_map.options());\n\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"get_assign_pos\", ([&] {\n        nondeterministic_get_assign_pos<<<blocks, threads, 0, stream>>>(\n            num_points, coors_map.contiguous().data_ptr<int32_t>(),\n            pts_id.contiguous().data_ptr<int32_t>(),\n            
coors_count.contiguous().data_ptr<int32_t>(),\n            reduce_count.contiguous().data_ptr<int32_t>(),\n            coors_order.contiguous().data_ptr<int32_t>());\n      }));\n\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"assign_point_to_voxel\", ([&] {\n        nondeterministic_assign_point_voxel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                num_points, points.contiguous().data_ptr<scalar_t>(),\n                coors_map.contiguous().data_ptr<int32_t>(),\n                pts_id.contiguous().data_ptr<int32_t>(),\n                temp_coors.contiguous().data_ptr<int32_t>(),\n                reduce_count.contiguous().data_ptr<int32_t>(),\n                coors_order.contiguous().data_ptr<int32_t>(),\n                voxels.contiguous().data_ptr<scalar_t>(),\n                coors.contiguous().data_ptr<int32_t>(),\n                num_points_per_voxel.contiguous().data_ptr<int32_t>(),\n                max_voxels, max_points, num_features, NDim);\n      }));\n  AT_CUDA_CHECK(cudaGetLastError());\n  return max_voxels < num_coors ? 
max_voxels : num_coors;\n}\n\nvoid DynamicVoxelizeForwardCUDAKernelLauncher(\n    const at::Tensor &points, at::Tensor &coors,\n    const std::vector<float> voxel_size, const std::vector<float> coors_range,\n    const int NDim = 3) {\n  // current version takes about 0.04s for one frame on cpu\n  // check device\n\n  at::cuda::CUDAGuard device_guard(points.device());\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  const float voxel_x = voxel_size[0];\n  const float voxel_y = voxel_size[1];\n  const float voxel_z = voxel_size[2];\n  const float coors_x_min = coors_range[0];\n  const float coors_y_min = coors_range[1];\n  const float coors_z_min = coors_range[2];\n  const float coors_x_max = coors_range[3];\n  const float coors_y_max = coors_range[4];\n  const float coors_z_max = coors_range[5];\n\n  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);\n  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);\n  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);\n\n  const int col_blocks = at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK);\n  dim3 blocks(col_blocks);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_ALL_TYPES(points.scalar_type(), \"dynamic_voxelize_kernel\", [&] {\n    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(\n        points.contiguous().data_ptr<scalar_t>(),\n        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,\n        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,\n        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);\n  });\n\n  AT_CUDA_CHECK(cudaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/deform_conv.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid deformable_im2col_impl(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col) {\n  DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,\n                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,\n                       stride_w, dilation_h, dilation_w, parallel_imgs,\n                       deformable_group, data_col);\n}\n\nvoid deformable_col2im_impl(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im) {\n  DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,\n                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,\n                       stride_w, dilation_h, dilation_w, parallel_imgs,\n                       deformable_group, grad_im);\n}\n\nvoid deformable_col2im_coord_impl(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int 
ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset) {\n  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,\n                       data_offset, channels, height, width, ksize_h, ksize_w,\n                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,\n                       parallel_imgs, deformable_group, grad_offset);\n}\n\nvoid deform_conv_shape_check(at::Tensor input, at::Tensor offset,\n                             at::Tensor *gradOutput, at::Tensor weight, int kH,\n                             int kW, int dH, int dW, int padH, int padW,\n                             int dilationH, int dilationW, int group,\n                             int deformable_group) {\n  TORCH_CHECK(\n      weight.ndimension() == 4,\n      \"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s\",\n      weight.ndimension());\n\n  TORCH_CHECK(weight.is_contiguous(), \"weight tensor has to be contiguous\");\n\n  TORCH_CHECK(kW > 0 && kH > 0,\n              \"kernel size should be greater than zero, but got kH: %d kW: %d\",\n              kH, kW);\n\n  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),\n              \"kernel size should be consistent with weight, \",\n              \"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d\",\n              kH, kW, weight.size(2), weight.size(3));\n\n  TORCH_CHECK(dW > 0 && dH > 0,\n              \"stride should be greater than zero, but got dH: %d dW: %d\", dH,\n              dW);\n\n  TORCH_CHECK(\n      dilationW > 0 && dilationH > 0,\n      \"dilation should be greater than 0, but got dilationH: %d dilationW: %d\",\n      dilationH, dilationW);\n\n  int ndim = input.ndimension();\n  int dimf = 0;\n  int dimh = 1;\n  int dimw = 2;\n\n  if (ndim == 4) {\n    dimf++;\n    dimh++;\n    
dimw++;\n  }\n\n  TORCH_CHECK(ndim == 3 || ndim == 4,\n              \"3D or 4D input tensor expected but got: %s\", ndim);\n\n  long nInputPlane = weight.size(1) * group;\n  long inputHeight = input.size(dimh);\n  long inputWidth = input.size(dimw);\n  long nOutputPlane = weight.size(0);\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n\n  TORCH_CHECK(nInputPlane % deformable_group == 0,\n              \"input channels must divide deformable group size\");\n\n  if (outputWidth < 1 || outputHeight < 1)\n    AT_ERROR(\n        \"Given input size: (%ld x %ld x %ld). \"\n        \"Calculated output size: (%ld x %ld x %ld). Output size is too small\",\n        nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,\n        outputWidth);\n\n  TORCH_CHECK(input.size(1) == nInputPlane,\n              \"invalid number of input planes, expected: %d, but got: %d\",\n              nInputPlane, input.size(1));\n\n  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),\n              \"input image is smaller than kernel\");\n\n  TORCH_CHECK(\n      (offset.size(2) == outputHeight && offset.size(3) == outputWidth),\n      \"invalid spatial size of offset, expected height: %d width: %d, but \"\n      \"got height: %d width: %d\",\n      outputHeight, outputWidth, offset.size(2), offset.size(3));\n\n  TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),\n              \"invalid number of channels of offset\");\n\n  if (gradOutput != NULL) {\n    TORCH_CHECK(\n        gradOutput->size(dimf) == nOutputPlane,\n        \"invalid number of gradOutput planes, expected: %d, but got: %d\",\n        nOutputPlane, gradOutput->size(dimf));\n\n    TORCH_CHECK(\n        (gradOutput->size(dimh) == outputHeight &&\n         gradOutput->size(dimw) == outputWidth),\n        \"invalid size of gradOutput, expected height: %d width: %d , but 
\"\n        \"got height: %d width: %d\",\n        outputHeight, outputWidth, gradOutput->size(dimh),\n        gradOutput->size(dimw));\n  }\n}\n\nvoid deform_conv_forward(Tensor input, Tensor weight, Tensor offset,\n                         Tensor output, Tensor columns, Tensor ones, int kW,\n                         int kH, int dW, int dH, int padW, int padH,\n                         int dilationW, int dilationH, int group,\n                         int deformable_group, int im2col_step) {\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(offset);\n    CHECK_CUDA_INPUT(weight);\n    CHECK_CUDA_INPUT(output);\n    CHECK_CUDA_INPUT(columns);\n    CHECK_CUDA_INPUT(ones);\n#else\n    AT_ERROR(\"DeformConv is not compiled with GPU support\");\n#endif\n  }\n#ifndef MMCV_WITH_MUSA\n  else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(offset);\n    CHECK_CPU_INPUT(weight);\n    CHECK_CPU_INPUT(output);\n    CHECK_CPU_INPUT(columns);\n    CHECK_CPU_INPUT(ones);\n  }\n#endif\n  deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH,\n                          padW, dilationH, dilationW, group, deformable_group);\n  at::DeviceGuard guard(input.device());\n\n  int batch = 1;\n  if (input.ndimension() == 3) {\n    // Force batch\n    batch = 0;\n    input.unsqueeze_(0);\n    offset.unsqueeze_(0);\n  }\n\n  // todo: assert batchsize dividable by im2col_step\n\n  long batchSize = input.size(0);\n  long nInputPlane = input.size(1);\n  long inputHeight = input.size(2);\n  long inputWidth = input.size(3);\n\n  long nOutputPlane = weight.size(0);\n\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n\n  TORCH_CHECK((offset.size(0) == batchSize), \"invalid batch size of offset\");\n\n  output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,\n       
                 outputHeight, outputWidth});\n  columns = at::zeros(\n      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},\n      input.options());\n\n  if (ones.ndimension() != 2 ||\n      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {\n    ones = at::ones({outputHeight, outputWidth}, input.options());\n  }\n\n  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                      inputHeight, inputWidth});\n  offset =\n      offset.view({batchSize / im2col_step, im2col_step,\n                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane,\n                                    im2col_step * outputHeight, outputWidth},\n                                   output.options());\n\n  output_buffer = output_buffer.view(\n      {output_buffer.size(0), group, output_buffer.size(1) / group,\n       output_buffer.size(2), output_buffer.size(3)});\n\n  for (int elt = 0; elt < batchSize / im2col_step; elt++) {\n    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,\n                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,\n                           dilationW, im2col_step, deformable_group, columns);\n\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n\n    for (int g = 0; g < group; g++) {\n      output_buffer[elt][g] = output_buffer[elt][g]\n                                  .flatten(1)\n                                  .addmm_(weight[g].flatten(1), columns[g])\n                                  .view_as(output_buffer[elt][g]);\n    }\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                     
     weight.size(3), weight.size(4)});\n  }\n\n  output_buffer = output_buffer.view(\n      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),\n       output_buffer.size(3), output_buffer.size(4)});\n\n  output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,\n                                      im2col_step, outputHeight, outputWidth});\n  output_buffer.transpose_(1, 2);\n  output.copy_(output_buffer);\n  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});\n\n  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  offset = offset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  if (batch == 0) {\n    output = output.view({nOutputPlane, outputHeight, outputWidth});\n    input = input.view({nInputPlane, inputHeight, inputWidth});\n    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});\n  }\n}\n\nvoid deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,\n                                Tensor gradInput, Tensor gradOffset,\n                                Tensor weight, Tensor columns, int kW, int kH,\n                                int dW, int dH, int padW, int padH,\n                                int dilationW, int dilationH, int group,\n                                int deformable_group, int im2col_step) {\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(offset);\n    CHECK_CUDA_INPUT(gradOutput);\n    CHECK_CUDA_INPUT(gradInput);\n    CHECK_CUDA_INPUT(gradOffset);\n    CHECK_CUDA_INPUT(weight);\n    CHECK_CUDA_INPUT(columns);\n#else\n    AT_ERROR(\"DeformConv is not compiled with GPU support\");\n#endif\n  }\n#ifndef MMCV_WITH_MUSA\n  else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(offset);\n    CHECK_CPU_INPUT(gradOutput);\n    CHECK_CPU_INPUT(gradInput);\n    CHECK_CPU_INPUT(gradOffset);\n    CHECK_CPU_INPUT(weight);\n    
CHECK_CPU_INPUT(columns);\n  }\n#endif\n  deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW,\n                          padH, padW, dilationH, dilationW, group,\n                          deformable_group);\n\n  at::DeviceGuard guard(input.device());\n\n  int batch = 1;\n  if (input.ndimension() == 3) {\n    // Force batch\n    batch = 0;\n    input = input.view({1, input.size(0), input.size(1), input.size(2)});\n    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});\n    gradOutput = gradOutput.view(\n        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});\n  }\n\n  long batchSize = input.size(0);\n  long nInputPlane = input.size(1);\n  long inputHeight = input.size(2);\n  long inputWidth = input.size(3);\n\n  long nOutputPlane = weight.size(0);\n\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n\n  TORCH_CHECK((offset.size(0) == batchSize), 3, \"invalid batch size of offset\");\n  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  columns = at::zeros(\n      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},\n      input.options());\n\n  // change order of grad output\n  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,\n                                nOutputPlane, outputHeight, outputWidth});\n  gradOutput.transpose_(1, 2);\n\n  gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                              inputHeight, inputWidth});\n  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                      inputHeight, inputWidth});\n  gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,\n                                deformable_group * 2 * kH * kW, outputHeight,\n                                outputWidth});\n  offset =\n     
 offset.view({batchSize / im2col_step, im2col_step,\n                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  for (int elt = 0; elt < batchSize / im2col_step; elt++) {\n    // divide into groups\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n    gradOutput = gradOutput.view(\n        {gradOutput.size(0), group, gradOutput.size(1) / group,\n         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});\n\n    for (int g = 0; g < group; g++) {\n      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),\n                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);\n    }\n\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    gradOutput = gradOutput.view(\n        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),\n         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});\n\n    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,\n                                 inputHeight, inputWidth, kH, kW, padH, padW,\n                                 dH, dW, dilationH, dilationW, im2col_step,\n                                 deformable_group, gradOffset[elt]);\n\n    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,\n                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,\n                           dilationW, im2col_step, deformable_group,\n                           gradInput[elt]);\n\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                          weight.size(3), weight.size(4)});\n  }\n\n  gradOutput.transpose_(1, 2);\n  gradOutput =\n      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});\n\n  gradInput = gradInput.view({batchSize, nInputPlane, 
inputHeight, inputWidth});\n  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  gradOffset = gradOffset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n  offset = offset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  if (batch == 0) {\n    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});\n    input = input.view({nInputPlane, inputHeight, inputWidth});\n    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});\n    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});\n    gradOffset =\n        gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});\n  }\n}\n\nvoid deform_conv_backward_parameters(Tensor input, Tensor offset,\n                                     Tensor gradOutput, Tensor gradWeight,\n                                     Tensor columns, Tensor ones, int kW,\n                                     int kH, int dW, int dH, int padW, int padH,\n                                     int dilationW, int dilationH, int group,\n                                     int deformable_group, float scale,\n                                     int im2col_step) {\n  if (input.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(input);\n    CHECK_CUDA_INPUT(offset);\n    CHECK_CUDA_INPUT(gradOutput);\n    CHECK_CUDA_INPUT(gradWeight);\n    CHECK_CUDA_INPUT(columns);\n    CHECK_CUDA_INPUT(ones);\n#else\n    AT_ERROR(\"DeformConv is not compiled with GPU support\");\n#endif\n  }\n#ifndef MMCV_WITH_MUSA\n  else {\n    CHECK_CPU_INPUT(input);\n    CHECK_CPU_INPUT(offset);\n    CHECK_CPU_INPUT(gradOutput);\n    CHECK_CPU_INPUT(gradWeight);\n    CHECK_CPU_INPUT(columns);\n    CHECK_CPU_INPUT(ones);\n  }\n#endif\n\n  deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH,\n                          dW, padH, padW, dilationH, dilationW, group,\n                          
deformable_group);\n  at::DeviceGuard guard(input.device());\n\n  int batch = 1;\n\n  if (input.ndimension() == 3) {\n    // Force batch\n    batch = 0;\n    input = input.view(\n        at::IntList({1, input.size(0), input.size(1), input.size(2)}));\n    gradOutput = gradOutput.view(\n        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});\n  }\n\n  long batchSize = input.size(0);\n  long nInputPlane = input.size(1);\n  long inputHeight = input.size(2);\n  long inputWidth = input.size(3);\n\n  long nOutputPlane = gradWeight.size(0);\n\n  long outputWidth =\n      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;\n  long outputHeight =\n      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;\n\n  TORCH_CHECK((offset.size(0) == batchSize), \"invalid batch size of offset\");\n\n  columns = at::zeros(\n      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},\n      input.options());\n\n  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,\n                                nOutputPlane, outputHeight, outputWidth});\n  gradOutput.transpose_(1, 2);\n\n  Tensor gradOutputBuffer = at::zeros_like(gradOutput);\n  gradOutputBuffer =\n      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,\n                             outputHeight, outputWidth});\n  gradOutputBuffer = gradOutputBuffer.contiguous();\n  gradOutputBuffer.copy_(gradOutput);\n  gradOutputBuffer =\n      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,\n                             im2col_step * outputHeight, outputWidth});\n\n  gradOutput.transpose_(1, 2);\n  gradOutput =\n      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});\n\n  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,\n                      inputHeight, inputWidth});\n  offset =\n      offset.view({batchSize / im2col_step, im2col_step,\n                   deformable_group * 2 * kH * kW, 
outputHeight, outputWidth});\n\n  for (int elt = 0; elt < batchSize / im2col_step; elt++) {\n    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,\n                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,\n                           dilationW, im2col_step, deformable_group, columns);\n\n    // divide into group\n    gradOutputBuffer = gradOutputBuffer.view(\n        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,\n         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    gradWeight =\n        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),\n                         gradWeight.size(2), gradWeight.size(3)});\n\n    for (int g = 0; g < group; g++) {\n      gradWeight[g] = gradWeight[g]\n                          .flatten(1)\n                          .addmm_(gradOutputBuffer[elt][g].flatten(1),\n                                  columns[g].transpose(1, 0), 1.0, scale)\n                          .view_as(gradWeight[g]);\n    }\n    gradOutputBuffer = gradOutputBuffer.view(\n        {gradOutputBuffer.size(0),\n         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),\n         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),\n                                  gradWeight.size(2), gradWeight.size(3),\n                                  gradWeight.size(4)});\n  }\n\n  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});\n  offset = offset.view(\n      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});\n\n  if (batch == 0) {\n    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});\n    input = input.view({nInputPlane, inputHeight, inputWidth});\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/deform_roi_pool.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,\n                       output, pooled_height, pooled_width, spatial_scale,\n                       sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma) {\n  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,\n                       offset, grad_input, grad_offset, pooled_height,\n                       pooled_width, spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,\n                             Tensor output, int pooled_height, int pooled_width,\n                             float spatial_scale, int sampling_ratio,\n                             float gamma) {\n  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,\n                               pooled_width, spatial_scale, sampling_ratio,\n                               gamma);\n}\n\nvoid deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,\n                              Tensor offset, Tensor grad_input,\n                              Tensor grad_offset, int pooled_height,\n                              int 
pooled_width, float spatial_scale,\n                              int sampling_ratio, float gamma) {\n  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,\n                                grad_offset, pooled_height, pooled_width,\n                                spatial_scale, sampling_ratio, gamma);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/diff_iou_rotated.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid) {\n  return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,\n                              vertices, mask, num_valid);\n}\n\nTensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,\n                                              Tensor num_valid) {\n  return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/filtered_lrelu.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op_impl(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns) {\n  return DISPATCH_DEVICE_IMPL(filtered_lrelu_op_impl, x, fu, fd, b, si, up,\n                              down, px0, px1, py0, py1, sx, sy, gain, slope,\n                              clamp, flip_filters, writeSigns);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns) {\n  return filtered_lrelu_op_impl(x, fu, fd, b, si, up, down, px0, px1, py0, py1,\n                                sx, sy, gain, slope, clamp, flip_filters,\n                                writeSigns);\n}\n\ntorch::Tensor filtered_lrelu_act_op_impl(torch::Tensor x, torch::Tensor si,\n                                         int sx, int sy, float gain,\n                                         float slope, float clamp,\n                                         bool writeSigns) {\n  return DISPATCH_DEVICE_IMPL(filtered_lrelu_act_op_impl, x, si, sx, sy, gain,\n                              slope, clamp, writeSigns);\n}\n\ntorch::Tensor filtered_lrelu_act_(torch::Tensor x, torch::Tensor si, int sx,\n                                  int sy, float gain, float slope, float clamp,\n                                  bool writeSigns) {\n  return filtered_lrelu_act_op_impl(x, si, sx, sy, gain, slope, clamp,\n                                    writeSigns);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/focal_loss.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#ifdef MMCV_WITH_DIOPI\n#include <diopi/diopirt.h>\n#include <diopi/functions.h>\n#include <diopi/functions_mmcv.h>\n#include <torch/csrc/utils/pybind.h>\n\n#include \"csrc_dipu/diopirt/diopirt_impl.h\"\n#include \"csrc_dipu/runtime/device/deviceapis.h\"\n#include \"csrc_dipu/utils/helpfunc.hpp\"\n\nusing dipu::VENDOR_TYPE;\nusing dipu::diopi_helper::toDiopiScalar;\nusing dipu::diopi_helper::toDiopiTensorHandle;\n#endif\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,\n                       output, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha) {\n  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,\n                       grad_input, gamma, alpha);\n}\n\nvoid softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,\n                       output, gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha) {\n  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,\n                       buff, grad_input, gamma, alpha);\n}\n\n#ifdef MMCV_WITH_DIOPI\nvoid sigmoid_focal_loss_forward_diopi(Tensor input, Tensor 
target,\n                                      Tensor weight, Tensor output, float gamma,\n                                      float alpha) {\n  auto input_p = toDiopiTensorHandle(input);\n  diopiDevice_t device;\n  diopiGetTensorDevice(input_p, &device);\n  if (device == diopi_host) {\n    sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma,\n                                    alpha);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto target_p = toDiopiTensorHandle(target);\n  auto weight_p = toDiopiTensorHandle(weight);\n  auto output_p = toDiopiTensorHandle(output);\n  if (reinterpret_cast<void *>(diopiSigmoidFocalLossMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,\n                                           weight_p, gamma, alpha);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiSigmoidFocalLossMmcv(ch, output_p, input_p, target_p,\n                                           weight_p, gamma, alpha);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING)\n      << \"Fallback to cpu: mmcv ext op sigmoid_focal_loss_forward_impl\";\n  auto input_cpu = input.cpu();\n  auto target_cpu = target.cpu();\n  auto weight_cpu = weight.cpu();\n  auto output_cpu = output.cpu();\n  sigmoid_focal_loss_forward_impl(input_cpu, target_cpu, weight_cpu, output_cpu,\n                                  gamma, alpha);\n  output.copy_(output_cpu);\n  return;\n}\n\nvoid sigmoid_focal_loss_backward_diopi(Tensor input, Tensor target,\n                                       Tensor weight, Tensor grad_input,\n                                       float gamma, float alpha) {\n  auto input_p = toDiopiTensorHandle(input);\n  diopiDevice_t device;\n  diopiGetTensorDevice(input_p, &device);\n  if (device 
== diopi_host) {\n    sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,\n                                     alpha);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto target_p = toDiopiTensorHandle(target);\n  auto weight_p = toDiopiTensorHandle(weight);\n  auto grad_input_p = toDiopiTensorHandle(grad_input);\n  if (reinterpret_cast<void *>(diopiSigmoidFocalLossBackwardMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiSigmoidFocalLossBackwardMmcv(\n          ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiSigmoidFocalLossBackwardMmcv(\n          ch, grad_input_p, input_p, target_p, weight_p, gamma, alpha);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING)\n      << \"Fallback to cpu: mmcv ext op sigmoid_focal_loss_backward_impl\";\n  auto input_cpu = input.cpu();\n  auto target_cpu = target.cpu();\n  auto weight_cpu = weight.cpu();\n  auto grad_input_cpu = grad_input.cpu();\n  sigmoid_focal_loss_backward_impl(input_cpu, target_cpu, weight_cpu,\n                                   grad_input_cpu, gamma, alpha);\n  grad_input.copy_(grad_input_cpu);\n  return;\n}\n#endif\n\nvoid sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,\n                                Tensor output, float gamma, float alpha) {\n#ifdef MMCV_WITH_DIOPI\n  sigmoid_focal_loss_forward_diopi(input, target, weight, output, gamma, alpha);\n#else\n  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);\n#endif\n}\n\nvoid sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,\n                                 Tensor grad_input, float gamma, float alpha) {\n#ifdef MMCV_WITH_DIOPI\n  sigmoid_focal_loss_backward_diopi(input, target, 
weight, grad_input, gamma,\n                                    alpha);\n#else\n  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,\n                                   alpha);\n#endif\n}\n\nvoid softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,\n                                Tensor output, float gamma, float alpha) {\n  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,\n                                 Tensor buff, Tensor grad_input, float gamma,\n                                 float alpha) {\n  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,\n                                   gamma, alpha);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/furthest_point_sample.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid furthest_point_sampling_forward_impl(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m) {\n  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,\n                       temp_tensor, idx_tensor, b, n, m);\n}\n\nvoid furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m) {\n  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,\n                       points_tensor, temp_tensor, idx_tensor, b, n, m);\n}\n\nvoid furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,\n                                     Tensor idx_tensor, int b, int n, int m) {\n  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,\n                                       b, n, m);\n}\n\nvoid furthest_point_sampling_with_dist_forward(Tensor points_tensor,\n                                               Tensor temp_tensor,\n                                               Tensor idx_tensor, int b, int n,\n                                               int m) {\n  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,\n                                                 idx_tensor, b, n, m);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/fused_bias_leakyrelu.cpp",
    "content": "// Modified from\n// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp\n\n/*\nCopyright (c) 2021, NVIDIA Corporation. All rights reserved.\n\nNVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator\nAugmentation (ADA)\n=======================================================================\n\n1. Definitions\n\n\"Licensor\" means any person or entity that distributes its Work.\n\n\"Software\" means the original work of authorship made available under\nthis License.\n\n\"Work\" means the Software and any additions to or derivative works of\nthe Software that are made available under this License.\n\nThe terms \"reproduce,\" \"reproduction,\" \"derivative works,\" and\n\"distribution\" have the meaning as provided under U.S. copyright law;\nprovided, however, that for the purposes of this License, derivative\nworks shall not include works that remain separable from, or merely\nlink (or bind by name) to the interfaces of, the Work.\n\nWorks, including the Software, are \"made available\" under this License\nby including in or with the Work either (a) a copyright notice\nreferencing the applicability of this License to the Work, or (b) a\ncopy of this License.\n\n2. License Grants\n\n    2.1 Copyright Grant. Subject to the terms and conditions of this\n    License, each Licensor grants to you a perpetual, worldwide,\n    non-exclusive, royalty-free, copyright license to reproduce,\n    prepare derivative works of, publicly display, publicly perform,\n    sublicense and distribute its Work and any resulting derivative\n    works in any form.\n\n3. Limitations\n\n    3.1 Redistribution. You may reproduce or distribute the Work only\n    if (a) you do so under this License, (b) you include a complete\n    copy of this License with your distribution, and (c) you retain\n    without modification any copyright, patent, trademark, or\n    attribution notices that are present in the Work.\n\n    3.2 Derivative Works. 
You may specify that additional or different\n    terms apply to the use, reproduction, and distribution of your\n    derivative works of the Work (\"Your Terms\") only if (a) Your Terms\n    provide that the use limitation in Section 3.3 applies to your\n    derivative works, and (b) you identify the specific derivative\n    works that are subject to Your Terms. Notwithstanding Your Terms,\n    this License (including the redistribution requirements in Section\n    3.1) will continue to apply to the Work itself.\n\n    3.3 Use Limitation. The Work and any derivative works thereof only\n    may be used or intended for use non-commercially. Notwithstanding\n    the foregoing, NVIDIA and its affiliates may use the Work and any\n    derivative works commercially. As used herein, \"non-commercially\"\n    means for research or evaluation purposes only.\n\n    3.4 Patent Claims. If you bring or threaten to bring a patent claim\n    against any Licensor (including any claim, cross-claim or\n    counterclaim in a lawsuit) to enforce any patents that you allege\n    are infringed by any Work, then your rights under this License from\n    such Licensor (including the grant in Section 2.1) will terminate\n    immediately.\n\n    3.5 Trademarks. This License does not grant any rights to use any\n    Licensor’s or its affiliates’ names, logos, or trademarks, except\n    as necessary to reproduce the notices described in this License.\n\n    3.6 Termination. If you violate any term of this License, then your\n    rights under this License (including the grant in Section 2.1) will\n    terminate immediately.\n\n4. Disclaimer of Warranty.\n\nTHE WORK IS PROVIDED \"AS IS\" WITHOUT WARRANTIES OR CONDITIONS OF ANY\nKIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR\nNON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER\nTHIS LICENSE.\n\n5. 
Limitation of Liability.\n\nEXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL\nTHEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE\nSHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,\nINDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF\nOR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK\n(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,\nLOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER\nCOMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF\nTHE POSSIBILITY OF SUCH DAMAGES.\n\n=======================================================================\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,\n                                           const torch::Tensor& bias,\n                                           const torch::Tensor& refer, int act,\n                                           int grad, float alpha, float scale) {\n  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,\n                              act, grad, alpha, scale);\n}\n\ntorch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,\n                                   const torch::Tensor& bias,\n                                   const torch::Tensor& refer, int act,\n                                   int grad, float alpha, float scale) {\n  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,\n                                      scale);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/fused_spconv_ops.cpp",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor fused_indice_conv_batchnorm_forward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM) {\n  return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,\n                              features, filters, bias, indicePairs, indiceNum,\n                              numActOut, _inverse, _subM);\n}\n\ntorch::Tensor fused_indice_conv_batchnorm_forward(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM) {\n  return fused_indice_conv_batchnorm_forward_impl(features, filters, bias,\n                                                  indicePairs, indiceNum,\n                                                  numActOut, _inverse, _subM);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/gather_points.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid gather_points_forward_impl(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out) {\n  DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,\n                       idx, out);\n}\n\nvoid gather_points_backward_impl(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points) {\n  DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,\n                       idx, grad_points);\n}\n\nvoid gather_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                           Tensor out_tensor, int b, int c, int n,\n                           int npoints) {\n  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,\n                             out_tensor);\n}\n\nvoid gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                            Tensor grad_points_tensor, int b, int c, int n,\n                            int npoints) {\n  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,\n                              grad_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/group_points.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid group_points_forward_impl(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out) {\n  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,\n                       points, idx, out);\n}\n\nvoid group_points_backward_impl(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points) {\n  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,\n                       grad_out, idx, grad_points);\n}\n\nvoid group_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                          Tensor out_tensor, int b, int c, int n, int npoints,\n                          int nsample) {\n  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,\n                       points_tensor, idx_tensor, out_tensor);\n}\n\nvoid group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                           Tensor grad_points_tensor, int b, int c, int n,\n                           int npoints, int nsample) {\n  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,\n                             idx_tensor, grad_points_tensor);\n}\n\nvoid stack_group_points_backward_impl(int b, int c, int m, int n, int nsample,\n                                      const Tensor grad_out_tensor,\n                                      const Tensor idx_tensor,\n                                      const Tensor idx_batch_cnt_tensor,\n                                      const Tensor features_batch_cnt_tensor,\n                      
                Tensor grad_features_tensor) {\n  DISPATCH_DEVICE_IMPL(stack_group_points_backward_impl, b, c, m, n, nsample,\n                       grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,\n                       features_batch_cnt_tensor, grad_features_tensor);\n}\n\nvoid stack_group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                                 Tensor idx_batch_cnt_tensor,\n                                 Tensor features_batch_cnt_tensor,\n                                 Tensor grad_features_tensor, int b, int c,\n                                 int m, int n, int nsample) {\n  stack_group_points_backward_impl(\n      b, c, m, n, nsample, grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,\n      features_batch_cnt_tensor, grad_features_tensor);\n}\n\nvoid stack_group_points_forward_impl(int b, int c, int m, int nsample,\n                                     const Tensor features_tensor,\n                                     const Tensor features_batch_cnt_tensor,\n                                     const Tensor idx_tensor,\n                                     const Tensor idx_batch_cnt_tensor,\n                                     Tensor out_tensor) {\n  DISPATCH_DEVICE_IMPL(stack_group_points_forward_impl, b, c, m, nsample,\n                       features_tensor, features_batch_cnt_tensor, idx_tensor,\n                       idx_batch_cnt_tensor, out_tensor);\n}\n\nvoid stack_group_points_forward(Tensor features_tensor,\n                                Tensor features_batch_cnt_tensor,\n                                Tensor idx_tensor, Tensor idx_batch_cnt_tensor,\n                                Tensor out_tensor, int b, int c, int m,\n                                int nsample) {\n  DISPATCH_DEVICE_IMPL(stack_group_points_forward_impl, b, c, m, nsample,\n                       features_tensor, features_batch_cnt_tensor, idx_tensor,\n                       idx_batch_cnt_tensor, out_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/info.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp\n#include \"pytorch_cpp_helper.hpp\"\n\n#ifdef MMCV_WITH_CUDA\n#ifdef MMCV_WITH_HIP\n#include <hip/hip_runtime_api.h>\nint get_hiprt_version() {\n  int runtimeVersion;\n  hipRuntimeGetVersion(&runtimeVersion);\n  return runtimeVersion;\n}\n#else\n#include <cuda_runtime_api.h>\nint get_cudart_version() { return CUDART_VERSION; }\n#endif\n#endif\n\nstd::string get_compiling_cuda_version() {\n#ifdef MMCV_WITH_CUDA\n#ifndef MMCV_WITH_HIP\n  std::ostringstream oss;\n  // copied from\n  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231\n  auto printCudaStyleVersion = [&](int v) {\n    oss << (v / 1000) << \".\" << (v / 10 % 100);\n    if (v % 10 != 0) {\n      oss << \".\" << (v % 10);\n    }\n  };\n  printCudaStyleVersion(get_cudart_version());\n  return oss.str();\n#else\n  std::ostringstream oss;\n  oss << get_hiprt_version();\n  return oss.str();\n#endif\n#else\n  return std::string(\"not available\");\n#endif\n}\n\n// similar to\n// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp\nstd::string get_compiler_version() {\n  std::ostringstream ss;\n#if defined(__GNUC__)\n#ifndef __clang__\n  { ss << \"GCC \" << __GNUC__ << \".\" << __GNUC_MINOR__; }\n#endif\n#endif\n\n#if defined(__clang_major__)\n  {\n    ss << \"clang \" << __clang_major__ << \".\" << __clang_minor__ << \".\"\n       << __clang_patchlevel__;\n  }\n#endif\n\n#if defined(_MSC_VER)\n  { ss << \"MSVC \" << _MSC_FULL_VER; }\n#endif\n  return ss.str();\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/iou3d.cpp",
    "content": "// Modified from\n// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp\n\n/*\n3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)\nWritten by Shaoshuai Shi\nAll Rights Reserved 2019-2020.\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nconst int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;\n\nvoid iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap) {\n  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,\n                       num_b, boxes_b, ans_overlap);\n}\n\nvoid iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh) {\n  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, keep, keep_num,\n                       nms_overlap_thresh);\n}\n\nvoid iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,\n                                     Tensor &keep_num,\n                                     float nms_overlap_thresh) {\n  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, keep, keep_num,\n                       nms_overlap_thresh);\n}\n\nvoid iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,\n                                     Tensor ans_overlap) {\n  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params boxes_b: (M, 5)\n  // params ans_overlap: (N, M)\n  int num_a = boxes_a.size(0);\n  int num_b = boxes_b.size(0);\n\n  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,\n                                       ans_overlap);\n}\n\nvoid iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                         float nms_overlap_thresh) {\n  // params boxes: (N, 7) [x, y, z, dx, dy, dz, 
heading]\n  // params keep: (N)\n  CHECK_CONTIGUOUS(boxes);\n  CHECK_CONTIGUOUS(keep);\n\n  iou3d_nms3d_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);\n}\n\nvoid iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                                float nms_overlap_thresh) {\n  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]\n  // params keep: (N)\n\n  CHECK_CONTIGUOUS(boxes);\n  CHECK_CONTIGUOUS(keep);\n\n  iou3d_nms3d_normal_forward_impl(boxes, keep, keep_num, nms_overlap_thresh);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/knn.cpp",
    "content": "// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2) {\n  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,\n                       dist2);\n}\n\nvoid knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,\n                 Tensor dist2_tensor, int b, int n, int m, int nsample) {\n  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,\n                   dist2_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/masked_conv2d.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w) {\n  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,\n                       col, kernel_h, kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels) {\n  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,\n                       im, height, width, channels);\n}\n\nvoid masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,\n                           const Tensor mask_w_idx, Tensor col,\n                           const int kernel_h, const int kernel_w,\n                           const int pad_h, const int pad_w) {\n  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,\n                             kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,\n                           const Tensor mask_w_idx, Tensor im, int height,\n                           int width, int channels) {\n  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,\n                             channels);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/min_area_polygons.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {\n  DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);\n}\n\nvoid min_area_polygons(const Tensor pointsets, Tensor polygons) {\n  min_area_polygons_impl(pointsets, polygons);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/ball_query_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid ball_query_forward_mlu(int b, int n, int m, float min_radius,\n                            float max_radius, int nsample, const Tensor new_xyz,\n                            const Tensor xyz, Tensor idx) {\n  auto new_xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      new_xyz, new_xyz.suggest_memory_format());\n  auto xyz_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      xyz, new_xyz.suggest_memory_format());\n  auto idx_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      idx, new_xyz.suggest_memory_format());\n\n  MluOpTensorDescriptor new_xyz_desc, xyz_desc, idx_desc;\n  new_xyz_desc.set(new_xyz_contiguous);\n  xyz_desc.set(xyz_contiguous);\n  idx_desc.set(idx_contiguous);\n\n  auto new_xyz_impl = torch_mlu::getMluTensorImpl(new_xyz_contiguous);\n  auto xyz_impl = torch_mlu::getMluTensorImpl(xyz_contiguous);\n  auto idx_impl = torch_mlu::getMluTensorImpl(idx_contiguous);\n  auto new_xyz_ptr = new_xyz_impl->cnnlMalloc();\n  auto xyz_ptr = xyz_impl->cnnlMalloc();\n  auto idx_ptr = idx_impl->cnnlMalloc();\n\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpBallQuery(\n      handle, new_xyz_desc.desc(), new_xyz_ptr, xyz_desc.desc(), xyz_ptr,\n      min_radius, max_radius, nsample, idx_desc.desc(), 
idx_ptr));\n}\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx);\n\nREGISTER_DEVICE_IMPL(ball_query_forward_impl, MLU, ball_query_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n\n#include \"mlu_common_helper.h\"\n\nvoid bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                       const int32_t mode, const bool aligned,\n                       const int32_t offset) {\n  // check dtype\n  TORCH_CHECK(\n      bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,\n      \"Data type of input should be Float or Half. 
But now input type is \",\n      bboxes1.scalar_type(), \".\");\n  TORCH_CHECK(bboxes1.scalar_type() == bboxes2.scalar_type(),\n              \"bboxes1's dtype should be the same with bboxes2's dtype.\");\n\n  // params check\n  TORCH_CHECK(bboxes1.dim() == 2, \"bboxes1 should be a 2d tensor, got \",\n              bboxes1.dim(), \"D\");\n  TORCH_CHECK(bboxes2.dim() == 2, \"bboxes2 should be a 2d tensor, got \",\n              bboxes2.dim(), \"D\");\n\n  auto rows = bboxes1.size(0);\n  auto cols = bboxes2.size(0);\n  auto batch_num_all = rows;\n\n  if (rows * cols == 0) {\n    // return if zero element\n    return;\n  }\n\n  INITIAL_MLU_PARAM_WITH_TENSOR(bboxes1);\n  INITIAL_MLU_PARAM_WITH_TENSOR(bboxes2);\n  INITIAL_MLU_PARAM_WITH_TENSOR(ious);\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n\n  TORCH_MLUOP_CHECK(mluOpBboxOverlaps(\n      handle, mode, aligned, offset, bboxes1_desc.desc(), bboxes1_ptr,\n      bboxes2_desc.desc(), bboxes2_ptr, ious_desc.desc(), ious_ptr));\n}\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\n\nREGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/box_iou_rotated.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid BoxIouRotatedMLUKernelLauncher(const Tensor boxes1, const Tensor boxes2,\n                                    Tensor ious, const int mode_flag,\n                                    const bool aligned) {\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n\n  auto boxes1_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      boxes1, boxes1.suggest_memory_format());\n  auto boxes2_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      boxes2, boxes2.suggest_memory_format());\n  auto ious_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(ious, ious.suggest_memory_format());\n\n  MluOpTensorDescriptor boxes1_desc, boxes2_desc, ious_desc;\n  boxes1_desc.set(boxes1_contiguous);\n  boxes2_desc.set(boxes2_contiguous);\n  ious_desc.set(ious_contiguous);\n\n  auto boxes1_impl = torch_mlu::getMluTensorImpl(boxes1_contiguous);\n  auto boxes2_impl = torch_mlu::getMluTensorImpl(boxes2_contiguous);\n  auto ious_impl = torch_mlu::getMluTensorImpl(ious_contiguous);\n\n  auto boxes1_ptr = boxes1_impl->cnnlMalloc();\n  auto boxes2_ptr = boxes2_impl->cnnlMalloc();\n  auto ious_ptr = ious_impl->cnnlMalloc();\n\n  CNLOG(INFO) << \"Call mluOpBoxIouRotated().\";\n  TORCH_MLUOP_CHECK(mluOpBoxIouRotated(\n      handle, mode_flag, 
aligned, boxes1_desc.desc(), boxes1_ptr,\n      boxes2_desc.desc(), boxes2_ptr, ious_desc.desc(), ious_ptr));\n}\n\nvoid box_iou_rotated_mlu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned) {\n  BoxIouRotatedMLUKernelLauncher(boxes1, boxes2, ious, mode_flag, aligned);\n}\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\n\nREGISTER_DEVICE_IMPL(box_iou_rotated_impl, MLU, box_iou_rotated_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/carafe_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid CARAFEForwardMLUKernelLauncher(const Tensor input, const Tensor mask,\n                                    Tensor rinput, Tensor routput, Tensor rmask,\n                                    Tensor output, const int kernel_size,\n                                    const int group_size,\n                                    const int scale_factor) {\n  // check tensor data type\n  TORCH_CHECK(\n      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,\n      \"Data type of input should be Float or Half. 
But now input type is \",\n      input.scalar_type(), \".\");\n\n  TORCH_CHECK(mask.scalar_type() == input.scalar_type(),\n              \"Data types of input and mask should be the same, but got \",\n              input.scalar_type(), \" and \", mask.scalar_type());\n\n  // check number of dimensions\n  TORCH_CHECK(input.dim() == 4, \"input should be a 4-D tensor, but has \",\n              input.dim(), \"D.\");\n  TORCH_CHECK(mask.dim() == 4, \"mask should be a 4-D tensor, but has \",\n              input.dim(), \"D.\");\n\n  // return fast on zero-element tensor\n  if (output.numel() == 0) {\n    output = at::zeros(output.sizes().vec(), output.options());\n    return;\n  }\n\n  // convert NCHW to NHWC\n  auto memory_format_input_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());\n  auto rinput_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format_input_nhwc);\n\n  auto memory_format_mask_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(mask.dim());\n  auto rmask_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(mask, memory_format_mask_nhwc);\n\n  auto memory_format_output_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(output.dim());\n  auto routput_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format_output_nhwc);\n\n  // set tensor descriptor\n  MluOpTensorDescriptor input_desc, mask_desc, output_desc;\n  input_desc.set_with_layout(rinput_, MLUOP_LAYOUT_NHWC);\n  mask_desc.set_with_layout(rmask_, MLUOP_LAYOUT_NHWC);\n  output_desc.set_with_layout(routput_, MLUOP_LAYOUT_NHWC);\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(rinput_);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto mask_impl = torch_mlu::getMluTensorImpl(rmask_);\n  auto mask_ptr = mask_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(routput_);\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // set op descriptor\n  auto handle = 
mluOpGetCurrentHandle();\n  mluOpCarafeDescriptor_t carafe_desc;\n  TORCH_MLUOP_CHECK(mluOpCreateCarafeDescriptor(&carafe_desc));\n  TORCH_MLUOP_CHECK(mluOpSetCarafeDescriptor(\n      carafe_desc, input.dim(), kernel_size, group_size, scale_factor));\n  // launch kernel\n  TORCH_MLUOP_CHECK(mluOpCarafeForward(handle, carafe_desc, input_desc.desc(),\n                                       input_ptr, mask_desc.desc(), mask_ptr,\n                                       output_desc.desc(), output_ptr));\n  // destroy op descriptor\n  TORCH_MLUOP_CHECK(mluOpDestroyCarafeDescriptor(carafe_desc));\n\n  // copy output from NHWC back into NCHW\n  rinput.copy_(rinput_);\n  output.copy_(routput_);\n}\n\nvoid CARAFEBackwardMLUKernelLauncher(\n    const Tensor grad_output, const Tensor rinput, const Tensor mask,\n    Tensor rgrad_output, Tensor rgrad_input_hs, Tensor rgrad_input,\n    Tensor rgrad_mask, Tensor grad_input, Tensor grad_mask,\n    const int kernel_size, const int group_size, const int scale_factor) {\n  // data type check\n  TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||\n                  grad_output.scalar_type() == at::kHalf,\n              \"grad_output type should be Float or Half, got \",\n              grad_output.scalar_type());\n  TORCH_CHECK(grad_output.scalar_type() == mask.scalar_type(),\n              \"mask should have the same type as grad_output\");\n\n  // dim check\n  TORCH_CHECK(grad_output.dim() == 4, \"grad_output should be a 4d tensor, got \",\n              grad_output.dim(), \"D\");\n\n  // param check\n  TORCH_CHECK(kernel_size < 137, \"kernel_size should be less than 137, got \",\n              kernel_size);\n\n  // convert NCHW to NHWC\n  auto memory_format_input_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(rinput.dim());\n  auto rinput_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(rinput, memory_format_input_nhwc);\n\n  auto memory_format_mask_nhwc =\n      
torch_mlu::cnnl::ops::get_channels_last_memory_format(mask.dim());\n  auto rmask_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(mask, memory_format_mask_nhwc);\n\n  auto memory_format_grad_output_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());\n  auto rgrad_output_ = torch_mlu::cnnl::ops::cnnl_contiguous(\n      grad_output, memory_format_grad_output_nhwc);\n\n  auto memory_format_grad_input_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_input.dim());\n  auto rgrad_input_ = torch_mlu::cnnl::ops::cnnl_contiguous(\n                          grad_input, memory_format_grad_input_nhwc)\n                          .zero_();\n\n  auto memory_format_grad_mask_nhwc =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_mask.dim());\n  auto rgrad_mask_ = torch_mlu::cnnl::ops::cnnl_contiguous(\n      grad_mask, memory_format_grad_mask_nhwc);\n\n  // set tensor descriptor\n  MluOpTensorDescriptor input_desc, mask_desc;\n  input_desc.set_with_layout(rinput_, MLUOP_LAYOUT_NHWC);\n  mask_desc.set_with_layout(rmask_, MLUOP_LAYOUT_NHWC);\n\n  MluOpTensorDescriptor grad_output_desc, grad_input_desc, grad_mask_desc;\n  grad_output_desc.set_with_layout(rgrad_output_, MLUOP_LAYOUT_NHWC);\n  grad_input_desc.set_with_layout(rgrad_input_, MLUOP_LAYOUT_NHWC);\n  grad_mask_desc.set_with_layout(rgrad_mask_, MLUOP_LAYOUT_NHWC);\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(rinput_);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto mask_impl = torch_mlu::getMluTensorImpl(rmask_);\n  auto mask_ptr = mask_impl->cnnlMalloc();\n  auto grad_output_impl = torch_mlu::getMluTensorImpl(rgrad_output_);\n  auto grad_output_ptr = grad_output_impl->cnnlMalloc();\n  auto grad_input_impl = torch_mlu::getMluTensorImpl(rgrad_input_);\n  auto grad_input_ptr = grad_input_impl->cnnlMalloc();\n  auto grad_mask_impl = torch_mlu::getMluTensorImpl(rgrad_mask_);\n  auto grad_mask_ptr = 
grad_mask_impl->cnnlMalloc();\n\n  // set op descriptor\n  auto handle = mluOpGetCurrentHandle();\n  mluOpCarafeDescriptor_t carafe_desc;\n  TORCH_MLUOP_CHECK(mluOpCreateCarafeDescriptor(&carafe_desc));\n  TORCH_MLUOP_CHECK(mluOpSetCarafeDescriptor(\n      carafe_desc, grad_output.dim(), kernel_size, group_size, scale_factor));\n  // launch kernel\n  TORCH_MLUOP_CHECK(mluOpCarafeBackward(\n      handle, carafe_desc, input_desc.desc(), input_ptr, mask_desc.desc(),\n      mask_ptr, grad_output_desc.desc(), grad_output_ptr,\n      grad_input_desc.desc(), grad_input_ptr, grad_mask_desc.desc(),\n      grad_mask_ptr));\n  // destroy op descriptor\n  TORCH_MLUOP_CHECK(mluOpDestroyCarafeDescriptor(carafe_desc));\n\n  // copy output from NHWC back into NCHW\n  grad_input.copy_(rgrad_input_);\n  grad_mask.copy_(rgrad_mask_);\n}\n\nvoid carafe_forward_mlu(Tensor features, Tensor masks, Tensor rfeatures,\n                        Tensor routput, Tensor rmasks, Tensor output,\n                        int kernel_size, int group_size, int scale_factor) {\n  CARAFEForwardMLUKernelLauncher(features, masks, rfeatures, routput, rmasks,\n                                 output, kernel_size, group_size, scale_factor);\n}\n\nvoid carafe_backward_mlu(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                         Tensor rtop_grad, Tensor rbottom_grad_hs,\n                         Tensor rbottom_grad, Tensor rmask_grad,\n                         Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                         int group_size, int scale_factor) {\n  CARAFEBackwardMLUKernelLauncher(top_grad, rfeatures, masks, rtop_grad,\n                                  rbottom_grad_hs, rbottom_grad, rmask_grad,\n                                  bottom_grad, mask_grad, kernel_size,\n                                  group_size, scale_factor);\n}\n\nvoid carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, 
Tensor output,\n                         int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_forward_impl, MLU, carafe_forward_mlu);\nREGISTER_DEVICE_IMPL(carafe_backward_impl, MLU, carafe_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/deform_roi_pool_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid DeformRoIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois,\n                                           Tensor offset, Tensor output,\n                                           int pooled_height, int pooled_width,\n                                           float spatial_scale,\n                                           int sampling_ratio, float gamma) {\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());\n  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);\n  auto rois_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());\n  auto output_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);\n\n  MluOpTensorDescriptor input_desc, rois_desc, offset_desc, output_desc;\n  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);\n  rois_desc.set(rois_contiguous);\n  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);\n\n  mluOpTensorDescriptor_t offset_real_desc = NULL;\n  void *offset_ptr = NULL;\n  if (offset.defined() && offset.numel() > 0) {\n    auto offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n        offset, offset.suggest_memory_format());\n    
offset_desc.set(offset_contiguous);\n    offset_real_desc = offset_desc.desc();\n    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);\n    offset_ptr = offset_impl->cnnlMalloc();\n  }\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(input_);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpDeformRoiPoolForward(\n      handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,\n      offset_real_desc, offset_ptr, pooled_height, pooled_width, spatial_scale,\n      sampling_ratio, gamma, output_desc.desc(), output_ptr));\n\n  output.copy_(output_contiguous);\n}\n\nvoid DeformRoIPoolBackwardMLUKernelLauncher(\n    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,\n    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,\n    float spatial_scale, int sampling_ratio, float gamma) {\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());\n  auto grad_output_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);\n  memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());\n  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);\n  auto rois_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());\n  auto grad_input_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(grad_input, memory_format);\n\n  // get ptr of tensors\n  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);\n  auto grad_output_ptr = grad_output_impl->cnnlMalloc();\n  auto input_impl = torch_mlu::getMluTensorImpl(input_);\n  auto 
input_ptr = input_impl->cnnlMalloc();\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);\n  auto grad_input_ptr = grad_input_impl->cnnlMalloc();\n\n  MluOpTensorDescriptor grad_output_desc, input_desc, rois_desc, offset_desc,\n      grad_input_desc, grad_offset_desc;\n  grad_output_desc.set_with_layout(grad_output_, MLUOP_LAYOUT_NHWC);\n  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);\n  rois_desc.set(rois_contiguous);\n  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);\n  mluOpTensorDescriptor_t offset_real_desc = NULL;\n  void *offset_ptr = NULL;\n  if (offset.defined() && offset.numel() > 0) {\n    auto offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n        offset, offset.suggest_memory_format());\n    offset_desc.set(offset_contiguous);\n    offset_real_desc = offset_desc.desc();\n    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);\n    offset_ptr = offset_impl->cnnlMalloc();\n  }\n  mluOpTensorDescriptor_t grad_offset_real_desc = NULL;\n  void *grad_offset_ptr = NULL;\n  if (grad_offset.defined() && grad_offset.numel() > 0) {\n    auto grad_offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n        grad_offset, grad_offset.suggest_memory_format());\n    grad_offset_desc.set(grad_offset_contiguous);\n    grad_offset_real_desc = grad_offset_desc.desc();\n    auto grad_offset_impl = torch_mlu::getMluTensorImpl(grad_offset_contiguous);\n    grad_offset_ptr = grad_offset_impl->cnnlMalloc();\n  }\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpDeformRoiPoolBackward(\n      handle, grad_output_desc.desc(), grad_output_ptr, input_desc.desc(),\n      input_ptr, rois_desc.desc(), rois_ptr, offset_real_desc, offset_ptr,\n      pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma,\n      grad_input_desc.desc(), 
grad_input_ptr, grad_offset_real_desc,\n      grad_offset_ptr));\n  grad_input.copy_(grad_input_);\n}\n\nvoid deform_roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor offset,\n                                 Tensor output, int pooled_height,\n                                 int pooled_width, float spatial_scale,\n                                 int sampling_ratio, float gamma) {\n  DeformRoIPoolForwardMLUKernelLauncher(input, rois, offset, output,\n                                        pooled_height, pooled_width,\n                                        spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_backward_mlu(Tensor grad_output, Tensor input, Tensor rois,\n                                  Tensor offset, Tensor grad_input,\n                                  Tensor grad_offset, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  DeformRoIPoolBackwardMLUKernelLauncher(\n      grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,\n      pooled_width, spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma);\n\nREGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, MLU,\n                     
deform_roi_pool_forward_mlu);\nREGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, MLU,\n                     deform_roi_pool_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/diff_iou_rotated_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2023 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nTensor diff_iou_rotated_sort_vertices_forward_mlu(Tensor vertices, Tensor mask,\n                                                  Tensor num_valid) {\n  // params check\n  TORCH_CHECK(vertices.scalar_type() == at::kFloat,\n              \"vertices type should be Float, got \", vertices.scalar_type());\n  TORCH_CHECK(mask.scalar_type() == at::kBool, \"mask should be Bool, got \",\n              mask.scalar_type());\n  TORCH_CHECK(num_valid.scalar_type() == at::kInt,\n              \"num_valid type should be Int32, got \", num_valid.scalar_type());\n  TORCH_CHECK(vertices.size(2) == 24, \"vertices.dim(2) should be 24, got \",\n              vertices.size(2));\n  TORCH_CHECK(mask.size(2) == 24, \"mask.dim(2) should be 24, got \",\n              mask.size(2));\n\n  // zero-element check\n  if (vertices.numel() == 0) {\n    return at::empty({0}, num_valid.options().dtype(at::kInt));\n  }\n\n  auto idx = at::empty({vertices.size(0), vertices.size(1), 9},\n                       num_valid.options().dtype(at::kInt));\n\n  INITIAL_MLU_PARAM_WITH_TENSOR(vertices);\n  INITIAL_MLU_PARAM_WITH_TENSOR(mask);\n  INITIAL_MLU_PARAM_WITH_TENSOR(num_valid);\n  INITIAL_MLU_PARAM_WITH_TENSOR(idx);\n\n  // get compute handle\n  auto handle = 
mluOpGetCurrentHandle();\n\n  // launch kernel\n  TORCH_MLUOP_CHECK(mluOpDiffIouRotatedSortVerticesForward(\n      handle, vertices_desc.desc(), vertices_ptr, mask_desc.desc(), mask_ptr,\n      num_valid_desc.desc(), num_valid_ptr, idx_desc.desc(), idx_ptr));\n  return idx;\n}\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid);\n\nREGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, MLU,\n                     diff_iou_rotated_sort_vertices_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include <string>\n#include <vector>\n\n#include \"mlu_common_helper.h\"\n\nvoid sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,\n                                    Tensor output, const float gamma,\n                                    const float alpha) {\n  // params check\n  TORCH_CHECK(gamma >= 0, \"gamma should be greater than or equal to 0. \",\n              \"But now gamma is \", gamma, \".\");\n\n  // check dtype\n  TORCH_CHECK(\n      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,\n      \"Data type of input should be Float or Half. But now input type is \",\n      input.scalar_type(), \".\");\n\n  TORCH_CHECK(\n      (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),\n      \"target type should be Int or Long. \", \"But now target type is \",\n      target.scalar_type(), \".\");\n\n  if (weight.data_ptr() != nullptr) {\n    TORCH_CHECK(weight.scalar_type() == input.scalar_type(),\n                \"Data types of input and weight should be the same. 
But now \"\n                \"input type is \",\n                input.scalar_type(), \", weight type is \", weight.scalar_type(),\n                \".\");\n  } else {\n    CNLOG(INFO) << \"weight is a empty tensor.\";\n  }\n\n  // return if zero-element\n  if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {\n    return;\n  }\n\n  // contiguous\n  auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      input, input.suggest_memory_format());\n  // target only supports int32\n  auto target_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      target.toType(at::kInt), target.suggest_memory_format());\n  auto weight_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      weight, weight.suggest_memory_format());\n  auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      output, output.suggest_memory_format());\n\n  // set tensor descriptor\n  MluOpTensorDescriptor input_desc, target_desc, weight_desc, output_desc;\n  input_desc.set(input_contiguous);\n  target_desc.set(target_contiguous);\n  weight_desc.set(weight_contiguous);\n  output_desc.set(output_contiguous);\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto target_impl = torch_mlu::getMluTensorImpl(target_contiguous);\n  auto target_ptr = target_impl->cnnlMalloc();\n  auto weight_impl = torch_mlu::getMluTensorImpl(weight_contiguous);\n  auto weight_ptr = weight_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // set prefer computation performance and reduction approach\n  mluOpComputationPreference_t prefer = MLUOP_COMPUTATION_FAST;\n  mluOpLossReduction_t reduction = MLUOP_LOSS_REDUCTION_NONE;\n\n  auto handle = mluOpGetCurrentHandle();\n\n  // launch kernel\n  TORCH_MLUOP_CHECK(mluOpFocalLossSigmoidForward(\n      handle, prefer, reduction, input_desc.desc(), 
input_ptr,\n      target_desc.desc(), target_ptr, weight_desc.desc(), weight_ptr, alpha,\n      gamma, output_desc.desc(), output_ptr));\n}\n\nvoid sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, const float gamma,\n                                     const float alpha) {\n  // params check\n  TORCH_CHECK(gamma >= 0, \"gamma should be greater than or equal to 0. \",\n              \"But now gamma is \", gamma, \".\");\n  // check dtype\n  TORCH_CHECK(\n      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,\n      \"Data type of input should be Float or Half. But now input type is \",\n      input.scalar_type(), \".\");\n\n  TORCH_CHECK(\n      (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),\n      \"target type should be Int or Long. \", \"But now target type is \",\n      target.scalar_type(), \".\");\n\n  bool has_weight = false;\n  if (weight.data_ptr() != nullptr) {\n    TORCH_CHECK(weight.scalar_type() == input.scalar_type(),\n                \"Data types of input and weight should be the same. 
But now \"\n                \"input type is \",\n                input.scalar_type(), \", weight type is \", weight.scalar_type(),\n                \".\");\n    has_weight = true;\n  } else {\n    CNLOG(INFO) << \"weight is a empty tensor.\";\n  }\n\n  if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {\n    // return if zero-element\n    return;\n  }\n\n  // contiguous\n  auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      input, input.suggest_memory_format());\n  // target only supports int32\n  auto target_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      target.toType(at::kInt), target.suggest_memory_format());\n  auto weight_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      weight, weight.suggest_memory_format());\n  auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      output, output.suggest_memory_format());\n\n  // set tensor descriptor\n  MluOpTensorDescriptor input_desc, target_desc, weight_desc, output_desc;\n  input_desc.set(input_contiguous);\n  target_desc.set(target_contiguous);\n  weight_desc.set(weight_contiguous);\n  output_desc.set(output_contiguous);\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto target_impl = torch_mlu::getMluTensorImpl(target_contiguous);\n  auto target_ptr = target_impl->cnnlMalloc();\n  auto weight_impl = torch_mlu::getMluTensorImpl(weight_contiguous);\n  auto weight_ptr = weight_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // set prefer computation performance and reduction approach\n  // backward only supports MLUOP_COMPUTATION_HIGH_PRECISION\n  mluOpComputationPreference_t prefer = MLUOP_COMPUTATION_HIGH_PRECISION;\n  mluOpLossReduction_t reduction = MLUOP_LOSS_REDUCTION_NONE;\n\n  auto handle = mluOpGetCurrentHandle();\n\n  // launch kernel\n  
TORCH_MLUOP_CHECK(mluOpFocalLossSigmoidBackward(\n      handle, prefer, reduction, input_desc.desc(), input_ptr,\n      target_desc.desc(), target_ptr, weight_desc.desc(), weight_ptr, alpha,\n      gamma, output_desc.desc(), output_ptr));\n}\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha);\n\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MLU,\n                     sigmoid_focal_loss_forward_mlu);\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MLU,\n                     sigmoid_focal_loss_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n\n#include \"mlu_common_helper.h\"\n\nvoid IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,\n                                 float iou_threshold) {\n  if (boxes.numel() == 0) {\n    return;\n  }\n\n  int input_box_num = boxes.size(0);\n  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);\n  auto output = keep.to(boxes.options().dtype(at::kInt));\n  auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));\n\n  MluOpTensorDescriptor boxes_desc, output_desc;\n  boxes_desc.set(boxes_);\n  output_desc.set(output);\n\n  // workspace\n  size_t workspace_size = 0;\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL,\n                                             &workspace_size));\n  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));\n\n  // get compute queue\n  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);\n  auto boxes_ptr = boxes_impl->cnnlMalloc();\n  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);\n  auto workspace_ptr = workspace_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(keep);\n  auto output_ptr = output_impl->cnnlMalloc();\n  auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);\n  
auto output_size_ptr = output_size_impl->cnnlMalloc();\n\n  // nms desc\n  mluOpNmsDescriptor_t nms_desc;\n  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;\n  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;\n  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;\n  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;\n  const float soft_nms_sigma = 0.0;\n  const float confidence_threshold = 0.0;\n  const int input_layout = 0;\n  const bool pad_to_max_output_size = false;\n  const int max_output_size = input_box_num;\n  const float offset = 0.0;\n\n  TORCH_MLUOP_CHECK(mluOpCreateNmsDescriptor(&nms_desc));\n  TORCH_MLUOP_CHECK(mluOpSetNmsDescriptor(\n      nms_desc, box_mode, output_mode, algo, method_mode, iou_threshold,\n      soft_nms_sigma, max_output_size, confidence_threshold, offset,\n      input_layout, pad_to_max_output_size));\n\n  TORCH_MLUOP_CHECK(mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr,\n                             NULL, NULL, workspace_ptr, workspace_size,\n                             output_desc.desc(), output_ptr, output_size_ptr));\n  TORCH_MLUOP_CHECK(mluOpDestroyNmsDescriptor(nms_desc));\n}\n\nvoid iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,\n                             float nms_overlap_thresh) {\n  IoU3DNMS3DMLUKernelLauncher(boxes, keep, keep_num, nms_overlap_thresh);\n}\n\nvoid iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh);\nREGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, MLU, iou3d_nms3d_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/masked_conv2d_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"pytorch_device_registry.hpp\"\n#include \"pytorch_mlu_helper.hpp\"\n\nvoid KernelMaskedIm2colForward(\n    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,\n    cnrtDataType_t k_dtype, const void *im_ptr, const int height,\n    const int width, const int channels, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const void *mask_h_idx_ptr,\n    const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr);\n\nvoid KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,\n                               cnrtQueue_t queue, cnrtDataType_t k_dtype,\n                               const void *col_ptr, const int height,\n                               const int width, const int channels,\n                               const void *mask_h_idx_ptr,\n                               const void *mask_w_idx_ptr, const int mask_cnt,\n                               void *im_ptr);\n\n// policy function\nstatic void policyFunc(const int mask_cnt, cnrtDim3_t *k_dim,\n                       cnrtFunctionType_t *k_type) {\n  const size_t cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);\n  const size_t core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);\n  const size_t task_dim = CEIL_ALIGN(mask_cnt, 
core_num);\n  k_dim->x = core_num;\n  k_dim->y =\n      (task_dim / core_num) > cluster_num ? cluster_num : (task_dim / core_num);\n  k_dim->z = 1;\n  *k_type = CNRT_FUNC_TYPE_UNION1;\n}\n\nvoid MaskedIm2colForwardMLUKernelLauncher(const Tensor im,\n                                          const Tensor mask_h_idx,\n                                          const Tensor mask_w_idx, Tensor col,\n                                          const int kernel_h,\n                                          const int kernel_w, const int pad_h,\n                                          const int pad_w) {\n  // Check dtype.\n  TORCH_CHECK(im.scalar_type() == at::kFloat || im.scalar_type() == at::kHalf,\n              \"im type should be Float or Half, got \", im.scalar_type(), \".\");\n  TORCH_CHECK(mask_h_idx.scalar_type() == at::kInt ||\n                  mask_h_idx.scalar_type() == at::kLong,\n              \"mask_h_idx type should be Int or Long, got \",\n              mask_h_idx.scalar_type(), \".\");\n  TORCH_CHECK(mask_w_idx.scalar_type() == at::kInt ||\n                  mask_w_idx.scalar_type() == at::kLong,\n              \"mask_w_idx type should be Int or Long, got \",\n              mask_w_idx.scalar_type(), \".\");\n  TORCH_CHECK(kernel_h > 0, \"kernel_h should greater than 0, got \", kernel_h,\n              \".\");\n  TORCH_CHECK(kernel_w > 0, \"kernel_w should greater than 0, got \", kernel_w,\n              \".\");\n\n  // zero element check\n  TORCH_CHECK(im.numel() > 0, \"im.numel should greater than zero, got \",\n              im.numel(), \".\");\n  TORCH_CHECK(col.size(0) > 0, \"col.size(0) should greater than zero, got \",\n              col.size(0), \".\");\n\n  // large tensor check\n  const size_t max_input_num = 2147483648;  // 2^31, 2G num\n  TORCH_CHECK(im.numel() < max_input_num,\n              \"im.numel() should be less than 2147483648, got \", im.numel(),\n              \".\");\n  TORCH_CHECK(col.numel() < max_input_num,\n              
\"col.numel() should be less than 2147483648, got \", col.numel(),\n              \".\");\n\n  const int channels = im.size(1);\n  const int height = im.size(2);\n  const int width = im.size(3);\n  const int mask_cnt = mask_h_idx.size(0);\n\n  // auto im_t = im.permute({0, 2, 3, 1}).contiguous();\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(im.dim());\n  auto im_ = torch_mlu::cnnl::ops::cnnl_contiguous(im, memory_format);\n  auto col_ =\n      at::zeros({mask_cnt, kernel_h * kernel_w, channels}, col.options());\n  // calculate task dimension\n  cnrtDim3_t k_dim;\n  cnrtFunctionType_t k_type;\n  policyFunc(mask_cnt, &k_dim, &k_type);\n\n  // get compute queue\n  auto queue = torch_mlu::getCurQueue();\n  // get ptr of tensors\n  auto im_impl = torch_mlu::getMluTensorImpl(im_);\n  auto im_ptr = im_impl->cnnlMalloc();\n  auto mask_h_idx_impl = torch_mlu::getMluTensorImpl(mask_h_idx);\n  auto mask_h_idx_ptr = mask_h_idx_impl->cnnlMalloc();\n  auto mask_w_idx_impl = torch_mlu::getMluTensorImpl(mask_w_idx);\n  auto mask_w_idx_ptr = mask_w_idx_impl->cnnlMalloc();\n  auto col_impl = torch_mlu::getMluTensorImpl(col_);\n  auto col_ptr = col_impl->cnnlMalloc();\n\n  // get comput dtype of input\n  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(im.dtype());\n\n  // launch kernel\n  CNLOG(INFO) << \"Launch Kernel MLUKernelMaskedIm2colForward<<<\" << k_dim.x\n              << \", \" << k_dim.y << \", \" << k_dim.z << \">>>\";\n  KernelMaskedIm2colForward(k_dim, k_type, queue, data_type, im_ptr, height,\n                            width, channels, kernel_h, kernel_w, pad_h, pad_w,\n                            mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr);\n\n  col.copy_(col_.permute({2, 1, 0})\n                .reshape({channels * kernel_h * kernel_w, mask_cnt})\n                .contiguous());\n}\n\nvoid MaskedCol2imForwardMLUKernelLauncher(const Tensor col,\n                                          const Tensor mask_h_idx,\n      
                                    const Tensor mask_w_idx, Tensor im,\n                                          const int height, const int width,\n                                          const int channels) {\n  // Check dtype.\n  TORCH_CHECK(col.scalar_type() == at::kFloat || col.scalar_type() == at::kHalf,\n              \"col type should be Float or Half, got \", col.scalar_type(), \".\");\n  TORCH_CHECK(mask_h_idx.scalar_type() == at::kInt ||\n                  mask_h_idx.scalar_type() == at::kLong,\n              \"mask_h_idx type should be Int or Long, got \",\n              mask_h_idx.scalar_type(), \".\");\n  TORCH_CHECK(mask_w_idx.scalar_type() == at::kInt ||\n                  mask_w_idx.scalar_type() == at::kLong,\n              \"mask_w_idx type should be Int or Long, got \",\n              mask_w_idx.scalar_type(), \".\");\n\n  // zero element check\n  TORCH_CHECK(im.numel() > 0, \"im.numel should greater than zero, got \",\n              im.numel(), \".\");\n  TORCH_CHECK(col.size(0) > 0, \"col.size(0) should greater than zero, got \",\n              col.size(0), \".\");\n\n  // large tensor check\n  const size_t max_input_num = 2147483648;  // 2^31, 2G num\n  TORCH_CHECK(im.numel() < max_input_num,\n              \"im.numel() should be less than 2147483648, got \", im.numel(),\n              \".\");\n  TORCH_CHECK(col.numel() < max_input_num,\n              \"col.numel() should be less than 2147483648, got \", col.numel(),\n              \".\");\n\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(im.dim());\n  at::Tensor im_ =\n      at::empty({1, channels, height, width}, im.options(), memory_format)\n          .zero_();\n\n  auto col_t = torch_mlu::cnnl::ops::cnnl_contiguous(col.transpose(0, 1));\n\n  const int mask_cnt = mask_h_idx.size(0);\n  // calculate task dimension\n  cnrtDim3_t k_dim;\n  cnrtFunctionType_t k_type;\n  policyFunc(mask_cnt, &k_dim, &k_type);\n\n  // get compute queue\n  auto queue = 
torch_mlu::getCurQueue();\n  // get ptr of tensors\n  auto im_impl = torch_mlu::getMluTensorImpl(im_);\n  auto im_ptr = im_impl->cnnlMalloc();\n  auto mask_h_idx_impl = torch_mlu::getMluTensorImpl(mask_h_idx);\n  auto mask_h_idx_ptr = mask_h_idx_impl->cnnlMalloc();\n  auto mask_w_idx_impl = torch_mlu::getMluTensorImpl(mask_w_idx);\n  auto mask_w_idx_ptr = mask_w_idx_impl->cnnlMalloc();\n  auto col_t_impl = torch_mlu::getMluTensorImpl(col_t);\n  auto col_t_ptr = col_t_impl->cnnlMalloc();\n\n  // get compute dtype of input\n  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(col.dtype());\n\n  // launch kernel\n  CNLOG(INFO) << \"Launch Kernel MLUKernelMaskedCol2imForward<<<\" << k_dim.x\n              << \", \" << k_dim.y << \", \" << k_dim.z << \">>>\";\n\n  KernelMaskedCol2imForward(k_dim, k_type, queue, data_type, col_t_ptr, height,\n                            width, channels, mask_h_idx_ptr, mask_w_idx_ptr,\n                            mask_cnt, im_ptr);\n\n  im.copy_(im_);\n}\n\nvoid masked_im2col_forward_mlu(const Tensor im, const Tensor mask_h_idx,\n                               const Tensor mask_w_idx, Tensor col,\n                               const int kernel_h, const int kernel_w,\n                               const int pad_h, const int pad_w) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedIm2colForwardMLUKernelLauncher(im, mask_h_idx, mask_w_idx, col,\n                                       kernel_h, kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_mlu(const Tensor col, const Tensor mask_h_idx,\n                               const Tensor mask_w_idx, Tensor im, int height,\n                               int width, int channels) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedCol2imForwardMLUKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,\n                                       width, 
channels);\n}\n\nvoid masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w);\n\nvoid masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels);\n\nREGISTER_DEVICE_IMPL(masked_im2col_forward_impl, MLU,\n                     masked_im2col_forward_mlu);\nREGISTER_DEVICE_IMPL(masked_col2im_forward_impl, MLU,\n                     masked_col2im_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\n// Descriptors\nmluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type) {\n  const std::map<std::string, mluOpDataType_t> mapping_type = {\n      {std::string(\"c10::Half\"), MLUOP_DTYPE_HALF},\n      {std::string(\"float\"), MLUOP_DTYPE_FLOAT},\n      {std::string(\"double\"), MLUOP_DTYPE_DOUBLE},\n      {std::string(\"int8\"), MLUOP_DTYPE_INT8},\n      {std::string(\"signed char\"), MLUOP_DTYPE_INT8},\n      {std::string(\"short int\"), MLUOP_DTYPE_INT16},\n      {std::string(\"short\"), MLUOP_DTYPE_INT16},\n      {std::string(\"int\"), MLUOP_DTYPE_INT32},\n      {std::string(\"long int\"), MLUOP_DTYPE_INT64},\n      {std::string(\"long\"), MLUOP_DTYPE_INT64},\n      {std::string(\"unsigned char\"), MLUOP_DTYPE_UINT8},\n      {std::string(\"bool\"), MLUOP_DTYPE_BOOL},\n      {std::string(\"c10::complex<c10::Half>\"), MLUOP_DTYPE_COMPLEX_HALF},\n      {std::string(\"c10::complex<float>\"), MLUOP_DTYPE_COMPLEX_FLOAT}};\n\n  if (mapping_type.find(std::string(data_type.name())) != mapping_type.end()) {\n    return mapping_type.find(std::string(data_type.name()))->second;\n  }\n  return MLUOP_DTYPE_INVALID;\n}\n\n// laytout\nmluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input) {\n  auto suggest_memory_format = 
input.suggest_memory_format();\n  mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY;\n  switch (input.dim()) {\n    case 4:\n      layout = (suggest_memory_format == at::MemoryFormat::ChannelsLast)\n                   ? MLUOP_LAYOUT_NHWC\n                   : MLUOP_LAYOUT_NCHW;\n      break;\n    case 5:\n      layout = (suggest_memory_format == at::MemoryFormat::ChannelsLast3d)\n                   ? MLUOP_LAYOUT_NDHWC\n                   : MLUOP_LAYOUT_NCDHW;\n      break;\n    default:\n      layout = MLUOP_LAYOUT_ARRAY;\n  }\n  return layout;\n}\n\nmluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type) {\n  const std::map<reduce_t, mluOpReduceMode_t> mapping_type = {\n      {reduce_t::MAX, MLUOP_REDUCE_DMAX},\n      {reduce_t::SUM, MLUOP_REDUCE_DSUM},\n      {reduce_t::MEAN, MLUOP_REDUCE_DMEAN}};\n  if (mapping_type.find(reduce_type) != mapping_type.end()) {\n    return mapping_type.find(reduce_type)->second;\n  } else {\n    TORCH_CHECK(false, \"Unsupported reduce type: \", to_string(reduce_type));\n    return MLUOP_REDUCE_DSUM;\n  }\n}\n\nvoid MluOpTensorDescriptor::set(Tensor t) {\n  mluOpDataType_t data_type = getMluOpDataType(t.dtype());\n  mluOpTensorLayout_t layout = getMluOpSuggestLayout(t);\n  int t_dim = t.dim();\n  std::vector<int> dim_array;\n  if (t_dim == 0) {\n    dim_array.push_back(\n        1);  // ScalarTensor(0-dim 1-item Tensor) view like size = 1 as default;\n  } else {\n    for (int i = 0; i < t_dim; i++) {\n      dim_array.push_back(static_cast<int>(t.sizes().vec()[i]));\n    }\n  }\n  set_desc(t, layout, data_type, dim_array);\n}\n\nvoid MluOpTensorDescriptor::set_with_layout(Tensor t,\n                                            mluOpTensorLayout_t layout) {\n  mluOpDataType_t data_type = getMluOpDataType(t.dtype());\n  int t_dim = t.dim();\n  std::vector<int> shape_info = checkUpperBoundAndCastTo<int>(t.sizes().vec());\n  std::vector<int> stride_info =\n      checkUpperBoundAndCastTo<int>(t.strides().vec());\n  if (layout == 
MLUOP_LAYOUT_NHWC || layout == MLUOP_LAYOUT_NDHWC ||\n      layout == MLUOP_LAYOUT_NLC) {\n    convertShapeAndStride(shape_info, stride_info);\n  } else if (layout == MLUOP_LAYOUT_HWCN) {\n    auto convertDepthWiseConvShapeStride = [](const std::vector<int64_t>& vec,\n                                              std::vector<int>& target_vec,\n                                              std::vector<int>& stride_vec) {\n      // NCHW --> HWCN\n      target_vec[0] = static_cast<int>(vec[2]);\n      target_vec[1] = static_cast<int>(vec[3]);\n      target_vec[2] = static_cast<int>(vec[1]);\n      target_vec[3] = static_cast<int>(vec[0]);\n      // Calculate Stride just like contiguous of HWCN.\n      stride_vec[3] = 1;\n      stride_vec[2] = target_vec[3] * stride_vec[3];\n      stride_vec[1] = target_vec[2] * stride_vec[2];\n      stride_vec[0] = target_vec[1] * stride_vec[1];\n    };\n    convertDepthWiseConvShapeStride(t.sizes().vec(), shape_info, stride_info);\n  }\n  TORCH_CHECK(mluOpSetTensorDescriptorEx(\n                  desc_, layout, data_type, t_dim, shape_info.data(),\n                  stride_info.data()) == MLUOP_STATUS_SUCCESS,\n              \"mluOpSetTensorDescriptorEx execution failed.\");\n}\n\nvoid MluOpTensorDescriptor::set_desc(const at::Tensor& t,\n                                     mluOpTensorLayout_t layout,\n                                     mluOpDataType_t dtype,\n                                     std::vector<int>& dims) {\n  int dimNb = dims.size();\n  TORCH_MLUOP_CHECK(\n      mluOpSetTensorDescriptor(desc_, layout, dtype, dimNb, dims.data()));\n}\n\n// Handles\nstd::once_flag mmcv_mluop_init_flag;\nstd::mutex mmcv_mluop_mutex;\nstatic std::vector<MluOpHandle> mmcv_mluop_handles;\n\nmluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index) {\n  std::call_once(mmcv_mluop_init_flag,\n                 []()  // Init mmcv_mluop_handles 1-device <-> 1-handle\n                 {\n                   c10::DeviceIndex num_devices 
= torch_mlu::device_count();\n                   mmcv_mluop_handles.resize(num_devices);\n                 });\n\n  if (device_index == -1) {\n    device_index = torch_mlu::current_device();\n  }\n  std::lock_guard<std::mutex> mmcv_mluop_guard(mmcv_mluop_mutex);\n  auto queue = torch_mlu::getCurrentQueue(device_index).queue();\n  mmcv_mluop_handles[device_index].setQueue(queue);\n  return mmcv_mluop_handles[device_index].handle;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#pragma once\n#include <ATen/ATen.h>\n#include <c10/core/ScalarType.h>\n\n#include \"aten.h\"\n#include \"mlu_op.h\"\n#include \"pytorch_device_registry.hpp\"\n\n#define MLUOP_MAJOR 0\n#define MLUOP_MINOR 8\n#define MLUOP_PATCHLEVEL 1\n\n/*************************************************************************\n * This MACRO contains operations of simple tensor to mlu-tensor.\n * _contiguous, _desc, _impl, _ptr will be automatically generated in\n * this MACRO.\n *************************************************************************/\n#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                         \\\n  auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(    \\\n      NAME, NAME.suggest_memory_format());                          \\\n  MluOpTensorDescriptor NAME##_desc;                                \\\n  NAME##_desc.set(NAME##_contigous);                                \\\n  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous); \\\n  auto NAME##_ptr = NAME##_impl->cnnlMalloc();\n\n#ifndef TORCH_MLUOP_CHECK\n#define TORCH_MLUOP_CHECK(EXPR)                                          \\\n  do {                                                                   \\\n    mluOpStatus_t status = EXPR;                                         \\\n    if (status 
!= MLUOP_STATUS_SUCCESS) {                                \\\n      CNLOG(ERROR) << \"\";                                                \\\n      TORCH_CHECK(false, \"MLUOPS error: \", mluOpGetErrorString(status)); \\\n    }                                                                    \\\n  } while (0);\n#endif\n\nenum class reduce_t { SUM = 0, MEAN = 1, MAX = 2 };\n\ninline std::string to_string(reduce_t reduce_type) {\n  if (reduce_type == reduce_t::MAX) {\n    return \"max\";\n  } else if (reduce_type == reduce_t::MEAN) {\n    return \"mean\";\n  } else if (reduce_type == reduce_t::SUM) {\n    return \"sum\";\n  } else {\n    return \"unknown reduce type\";\n  }\n}\n\nmluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);\nmluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);\nmluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type);\n\nclass MluOpTensorDescriptor {\n public:\n  MluOpTensorDescriptor() {\n    TORCH_MLUOP_CHECK(mluOpCreateTensorDescriptor(&desc_));\n  };\n  ~MluOpTensorDescriptor() {\n    TORCH_MLUOP_CHECK(mluOpDestroyTensorDescriptor(desc_));\n  }\n\n  void set(at::Tensor);\n  void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);\n  mluOpTensorDescriptor_t desc() { return desc_; }\n\n private:\n  mluOpTensorDescriptor_t desc_;\n  void set_desc(const at::Tensor&, mluOpTensorLayout_t, mluOpDataType_t,\n                std::vector<int>& dims);\n};\n\nmluOpHandle_t mluOpGetCurrentHandle(c10::DeviceIndex device_index = -1);\n\nclass MluOpHandle {\n public:\n  MluOpHandle() : handle(nullptr) { TORCH_MLUOP_CHECK(mluOpCreate(&handle)); }\n  ~MluOpHandle() {\n    if (handle) {\n      TORCH_MLUOP_CHECK(mluOpDestroy(handle));\n      handle = nullptr;\n    }\n  }\n  void setQueue(cnrtQueue_t queue) {\n    TORCH_MLUOP_CHECK(mluOpSetQueue(handle, queue));\n  }\n  mluOpHandle_t handle;\n};\n\n// modify tensor size and stride order based on\n// channels_first to channels_last or channels_last_3d.\n// which this 
is not same with pytorch original layout,\n// this real layout is based on data storage real order.\n// example: modify channels_last tensor dim to nhwc tensor desc.\n//            N    C H W  -->   N    H W C\n//          C*H*W  1 W C  --> C*H*W  W C 1\ntemplate <typename T>\nvoid convertShapeAndStride(std::vector<T>& shape_info,\n                           std::vector<T>& stride_info) {\n  TORCH_MLU_CHECK(shape_info.size() == stride_info.size(),\n                  \"shape size need equal to stride size.\");\n  const int dim = shape_info.size();\n  std::vector<T> temp_shape_info(dim);\n  std::vector<T> temp_stride_info(dim);\n  temp_shape_info[0] = shape_info[0];\n  temp_stride_info[0] = stride_info[0];\n  for (size_t i = 0; i < dim - 1; ++i) {\n    const int index = (i + 1) % (dim - 1) + 1;\n    temp_shape_info[i + 1] = shape_info[index];\n    temp_stride_info[i + 1] = stride_info[index];\n  }\n  shape_info.assign(temp_shape_info.begin(), temp_shape_info.end());\n  stride_info.assign(temp_stride_info.begin(), temp_stride_info.end());\n}\n\n// torch tensor provides int64_t type of shape and stride,\n// but mluops descriptor requires type int32.\n// use this function to ensure safe CAST, or report an error.\ntemplate <typename DST_T, typename SRC_T>\nstd::vector<DST_T> checkUpperBoundAndCastTo(const std::vector<SRC_T>& input) {\n  std::vector<DST_T> output;\n  output.reserve(input.size());\n  for (const auto& val : input) {\n    if (val > std::numeric_limits<DST_T>::max()) {\n      TORCH_MLU_CHECK(false, \"Requires dim size not greater than \",\n                      std::numeric_limits<DST_T>::max(), \". But got \", val,\n                      \".\");\n    }\n    output.push_back(static_cast<DST_T>(val));\n  }\n  return output;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n#include \"pytorch_device_registry.hpp\"\n#include \"pytorch_mlu_helper.hpp\"\n\nTensor MsDeformAttnForwardLauncher(const Tensor& value,\n                                   const Tensor& spatial_shapes,\n                                   const Tensor& level_start_index,\n                                   const Tensor& sampling_loc,\n                                   const Tensor& attn_weight,\n                                   const int im2col_step) {\n  auto handle = mluOpGetCurrentHandle();\n  const int batch_size = value.size(0);\n  const int num_heads = value.size(2);\n  const int channels = value.size(3);\n  const int num_queries = sampling_loc.size(1);\n  auto output = at::zeros({batch_size, num_queries, num_heads, channels},\n                          value.options());\n  auto spatial_shapes_int = spatial_shapes.to(at::kInt);\n  auto level_start_index_int = level_start_index.to(at::kInt);\n  INITIAL_MLU_PARAM_WITH_TENSOR(output);\n  INITIAL_MLU_PARAM_WITH_TENSOR(value);\n  INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);\n  INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);\n  INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);\n  INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);\n\n  TORCH_MLUOP_CHECK(mluOpMsDeformAttnForward(\n      
handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),\n      spatial_shapes_int_ptr, level_start_index_int_desc.desc(),\n      level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,\n      attn_weight_desc.desc(), attn_weight_ptr, im2col_step, output_desc.desc(),\n      output_ptr));\n\n  output = output.view({batch_size, num_queries, num_heads * channels});\n  return output;\n}\n\nvoid MsDeformAttnBackwardLauncher(\n    const Tensor& value, const Tensor& spatial_shapes,\n    const Tensor& level_start_index, const Tensor& sampling_loc,\n    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,\n    Tensor& grad_sampling_loc, Tensor& grad_attn_weight,\n    const int im2col_step) {\n  auto handle = mluOpGetCurrentHandle();\n  auto spatial_shapes_int = spatial_shapes.to(at::kInt);\n  auto level_start_index_int = level_start_index.to(at::kInt);\n  const int batch_size = value.size(0);\n  const int num_heads = value.size(2);\n  const int channels = value.size(3);\n  const int num_queries = sampling_loc.size(1);\n\n  auto grad_output_dim4 =\n      grad_output.view({batch_size, num_queries, num_heads, channels});\n  // auto grad_output_dim4 = grad_output.view({batch_size, num_queries,\n  // num_heads, channels}).detach();\n  INITIAL_MLU_PARAM_WITH_TENSOR(value);\n  INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);\n  INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);\n  INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);\n  INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);\n  INITIAL_MLU_PARAM_WITH_TENSOR(grad_output_dim4);\n  // INITIAL_MLU_PARAM_WITH_TENSOR(grad_output);\n  INITIAL_MLU_PARAM_WITH_TENSOR(grad_value);\n  INITIAL_MLU_PARAM_WITH_TENSOR(grad_sampling_loc);\n  INITIAL_MLU_PARAM_WITH_TENSOR(grad_attn_weight);\n\n  mluOpMsDeformAttnBackward(\n      handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),\n      spatial_shapes_int_ptr, level_start_index_int_desc.desc(),\n      level_start_index_int_ptr, 
sampling_loc_desc.desc(), sampling_loc_ptr,\n      attn_weight_desc.desc(), attn_weight_ptr, grad_output_dim4_desc.desc(),\n      grad_output_dim4_ptr, im2col_step, grad_value_desc.desc(), grad_value_ptr,\n      grad_sampling_loc_desc.desc(), grad_sampling_loc_ptr,\n      grad_attn_weight_desc.desc(), grad_attn_weight_ptr);\n\n  return;\n}\n\nTensor ms_deform_attn_mlu_forward(const Tensor& value,\n                                  const Tensor& spatial_shapes,\n                                  const Tensor& level_start_index,\n                                  const Tensor& sampling_loc,\n                                  const Tensor& attn_weight,\n                                  const int im2col_step) {\n  return MsDeformAttnForwardLauncher(value, spatial_shapes, level_start_index,\n                                     sampling_loc, attn_weight, im2col_step);\n}\n\nvoid ms_deform_attn_mlu_backward(\n    const Tensor& value, const Tensor& spatial_shapes,\n    const Tensor& level_start_index, const Tensor& sampling_loc,\n    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,\n    Tensor& grad_sampling_loc, Tensor& grad_attn_weight,\n    const int im2col_step) {\n  return MsDeformAttnBackwardLauncher(value, spatial_shapes, level_start_index,\n                                      sampling_loc, attn_weight, grad_output,\n                                      grad_value, grad_sampling_loc,\n                                      grad_attn_weight, im2col_step);\n}\n\nTensor ms_deform_attn_impl_forward(const Tensor& value,\n                                   const Tensor& spatial_shapes,\n                                   const Tensor& level_start_index,\n                                   const Tensor& sampling_loc,\n                                   const Tensor& attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor& value, const Tensor& spatial_shapes,\n    const 
Tensor& level_start_index, const Tensor& sampling_loc,\n    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,\n    Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step);\n\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, MLU,\n                     ms_deform_attn_mlu_forward);\n\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, MLU,\n                     ms_deform_attn_mlu_backward);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n\n#include \"mlu_common_helper.h\"\n\nTensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,\n                            int offset) {\n  if (boxes.numel() == 0) {\n    return at::empty({0}, boxes.options().dtype(at::kLong));\n  }\n\n  int max_output_boxes = boxes.size(0);\n\n  // transpose boxes (n, 4) to (4, n) for better performance\n  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);\n  auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);\n  auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kInt));\n  auto output_size = at::empty({1}, scores.options().dtype(at::kInt));\n\n  MluOpTensorDescriptor boxes_desc, scores_desc, output_desc;\n  boxes_desc.set(boxes_);\n  scores_desc.set(scores_);\n  output_desc.set(output);\n\n  // workspace\n  size_t workspace_size = 0;\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpGetNmsWorkspaceSize(\n      handle, boxes_desc.desc(), scores_desc.desc(), &workspace_size));\n  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));\n\n  // get compute queue\n  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);\n  auto boxes_ptr = boxes_impl->cnnlMalloc();\n  auto scores_impl = torch_mlu::getMluTensorImpl(scores_);\n  auto scores_ptr = 
scores_impl->cnnlMalloc();\n  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);\n  auto workspace_ptr = workspace_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output);\n  auto output_ptr = output_impl->cnnlMalloc();\n  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);\n  auto output_size_ptr = output_size_impl->cnnlMalloc();\n\n  // nms desc\n  mluOpNmsDescriptor_t nms_desc;\n  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;\n  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;\n  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;\n  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;\n  const float soft_nms_sigma = 0.0;\n  const float confidence_threshold = 0.0;\n  const int input_layout = 0;\n  const bool pad_to_max_output_size = false;\n  const int max_output_size = max_output_boxes;\n\n  TORCH_MLUOP_CHECK(mluOpCreateNmsDescriptor(&nms_desc));\n  TORCH_MLUOP_CHECK(mluOpSetNmsDescriptor(\n      nms_desc, box_mode, output_mode, algo, method_mode, iou_threshold,\n      soft_nms_sigma, max_output_size, confidence_threshold, (float)offset,\n      input_layout, pad_to_max_output_size));\n\n  TORCH_MLUOP_CHECK(mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr,\n                             scores_desc.desc(), scores_ptr, workspace_ptr,\n                             workspace_size, output_desc.desc(), output_ptr,\n                             output_size_ptr));\n  TORCH_MLUOP_CHECK(mluOpDestroyNmsDescriptor(nms_desc));\n  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());\n  auto ret = output.to(boxes.options().dtype(at::kLong));\n  return ret.slice(0, 0, output_num);\n}\n\nTensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset);\n}\n\nTensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);\nREGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/nms_rotated_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nTensor nms_rotated_mlu(Tensor boxes, Tensor scores, float iou_threshold) {\n  if (boxes.numel() == 0) {\n    return at::empty({0}, boxes.options().dtype(at::kLong));\n  }\n\n  int boxes_num = boxes.size(0);\n  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);\n  auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);\n  auto output = at::empty({boxes_num}, boxes.options().dtype(at::kInt));\n  auto output_size = at::empty({1}, scores.options().dtype(at::kInt));\n\n  MluOpTensorDescriptor boxes_desc, scores_desc, output_desc;\n  boxes_desc.set(boxes_);\n  scores_desc.set(scores_);\n  output_desc.set(output);\n\n  // workspace\n  size_t workspace_size = 0;\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpGetNmsRotatedWorkspaceSize(handle, boxes_desc.desc(),\n                                                    &workspace_size));\n  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));\n\n  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);\n  auto boxes_ptr = boxes_impl->cnnlMalloc();\n  auto scores_impl = torch_mlu::getMluTensorImpl(scores_);\n  auto scores_ptr = scores_impl->cnnlMalloc();\n  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);\n  auto workspace_ptr = 
workspace_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output);\n  auto output_ptr = output_impl->cnnlMalloc();\n  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);\n  auto output_size_ptr = output_size_impl->cnnlMalloc();\n\n  TORCH_MLUOP_CHECK(mluOpNmsRotated(\n      handle, iou_threshold, boxes_desc.desc(), boxes_ptr, scores_desc.desc(),\n      scores_ptr, workspace_ptr, workspace_size, output_desc.desc(), output_ptr,\n      (int *)output_size_ptr));\n  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());\n  auto ret = output.to(boxes.options().dtype(at::kLong));\n  return ret.slice(0, 0, output_num);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/psamask_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,\n                                     Tensor y, const int num_,\n                                     const int h_feature, const int w_feature,\n                                     const int h_mask, const int w_mask,\n                                     const int half_h_mask,\n                                     const int half_w_mask) {\n  int y_c = y.size(1);\n\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(x.dim());\n  auto x_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(x, memory_format);\n  at::Tensor y_tmp =\n      at::empty({num_, y_c, h_feature, w_feature}, x.options(), memory_format);\n\n  MluOpTensorDescriptor x_desc, y_desc;\n  x_desc.set_with_layout(x_tensor, MLUOP_LAYOUT_NHWC);\n  y_desc.set_with_layout(y_tmp, MLUOP_LAYOUT_NHWC);\n\n  auto handle = mluOpGetCurrentHandle();\n  auto x_impl = torch_mlu::getMluTensorImpl(x_tensor);\n  auto x_ptr = x_impl->cnnlMalloc();\n  auto y_impl = torch_mlu::getMluTensorImpl(y_tmp);\n  auto y_ptr = y_impl->cnnlMalloc();\n\n  TORCH_MLUOP_CHECK(mluOpPsamaskForward(handle, psa_type, x_desc.desc(), x_ptr,\n                                        h_mask, w_mask, y_desc.desc(), y_ptr));\n\n  
y.copy_(y_tmp);\n}\n\nvoid PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,\n                                      Tensor dx, const int num_,\n                                      const int h_feature, const int w_feature,\n                                      const int h_mask, const int w_mask,\n                                      const int half_h_mask,\n                                      const int half_w_mask) {\n  int dx_c = dx.size(1);\n\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(dy.dim());\n  auto dy_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(dy, memory_format);\n  at::Tensor dx_tmp = at::empty({num_, dx_c, h_feature, w_feature},\n                                dy.options(), memory_format);\n\n  MluOpTensorDescriptor dy_desc, dx_tmp_desc;\n  dy_desc.set_with_layout(dy_tensor, MLUOP_LAYOUT_NHWC);\n  dx_tmp_desc.set_with_layout(dx_tmp, MLUOP_LAYOUT_NHWC);\n\n  auto handle = mluOpGetCurrentHandle();\n\n  // get ptr of tensors\n  auto dx_impl = torch_mlu::getMluTensorImpl(dx_tmp);\n  auto dx_ptr = dx_impl->cnnlMalloc();\n  auto dy_impl = torch_mlu::getMluTensorImpl(dy_tensor);\n  auto dy_ptr = dy_impl->cnnlMalloc();\n\n  TORCH_MLUOP_CHECK(mluOpPsamaskBackward(handle, psa_type, dy_desc.desc(),\n                                         dy_ptr, h_mask, w_mask,\n                                         dx_tmp_desc.desc(), dx_ptr));\n\n  dx.copy_(dx_tmp);\n}\n\nvoid psamask_forward_mlu(const int psa_type, const Tensor input, Tensor output,\n                         const int num_, const int h_feature,\n                         const int w_feature, const int h_mask,\n                         const int w_mask, const int half_h_mask,\n                         const int half_w_mask) {\n  PSAMaskForwardMLUKernelLauncher(psa_type, input, output, num_, h_feature,\n                                  w_feature, h_mask, w_mask, half_h_mask,\n                                  half_w_mask);\n}\n\nvoid 
psamask_backward_mlu(const int psa_type, const Tensor grad_output,\n                          Tensor grad_input, const int num_,\n                          const int h_feature, const int w_feature,\n                          const int h_mask, const int w_mask,\n                          const int half_h_mask, const int half_w_mask) {\n  PSAMaskBackwardMLUKernelLauncher(psa_type, grad_output, grad_input, num_,\n                                   h_feature, w_feature, h_mask, w_mask,\n                                   half_h_mask, half_w_mask);\n}\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask);\n\nREGISTER_DEVICE_IMPL(psamask_forward_impl, MLU, psamask_forward_mlu);\nREGISTER_DEVICE_IMPL(psamask_backward_impl, MLU, psamask_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2021 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                      Tensor argmax_y, Tensor argmax_x,\n                                      int aligned_height, int aligned_width,\n                                      float spatial_scale, int sampling_ratio,\n                                      int pool_mode, bool aligned) {\n  // params check\n  TORCH_CHECK(pool_mode == 1, \"pool_mode only supports 'avg' currently\");\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());\n  auto input_tensor =\n      torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);\n\n  auto num_rois = rois.size(0);\n  auto channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  auto output_contiguous =\n      at::empty({num_rois, channels, aligned_height, aligned_width},\n                input.options(), memory_format);\n  // get tensor impl\n  auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois);\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n\n  MluOpTensorDescriptor input_desc, rois_desc, argmax_y_desc, argmax_x_desc,\n      output_desc;\n  
input_desc.set_with_layout(input_tensor, MLUOP_LAYOUT_NHWC);\n  rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);\n  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);\n\n  // get the mlu ptr\n  auto self_ptr = self_impl->cnnlMalloc();\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  mluOpRoiAlignForwardDescriptor_t roialign_desc;\n  TORCH_MLUOP_CHECK(mluOpCreateRoiAlignForwardDescriptor(&roialign_desc));\n  TORCH_MLUOP_CHECK(mluOpSetRoiAlignForwardDescriptor_v2(\n      roialign_desc, aligned_height, aligned_width, sampling_ratio,\n      spatial_scale, pool_mode, aligned));\n\n  auto handle = mluOpGetCurrentHandle();\n  if (pool_mode == 0) {\n    auto argmax_y_contiguous =\n        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);\n    auto argmax_x_contiguous =\n        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);\n    auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);\n    auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);\n    auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();\n    auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();\n    argmax_y_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);\n    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);\n    TORCH_MLUOP_CHECK(mluOpRoiAlignForward_v2(\n        handle, roialign_desc, input_desc.desc(), self_ptr, rois_desc.desc(),\n        rois_ptr, output_desc.desc(), output_ptr, argmax_x_desc.desc(),\n        argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr));\n    argmax_x.copy_(argmax_x_contiguous);\n    argmax_y.copy_(argmax_y_contiguous);\n  } else {\n    TORCH_MLUOP_CHECK(mluOpRoiAlignForward_v2(\n        handle, roialign_desc, input_desc.desc(), self_ptr, rois_desc.desc(),\n        rois_ptr, output_desc.desc(), output_ptr, NULL, NULL, NULL, NULL));\n  }\n  TORCH_MLUOP_CHECK(mluOpDestroyRoiAlignForwardDescriptor(roialign_desc));\n  
output.copy_(output_contiguous);\n}\n\nvoid ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       Tensor grad_input, int aligned_height,\n                                       int aligned_width, float spatial_scale,\n                                       int sampling_ratio, int pool_mode,\n                                       bool aligned) {\n  // params check\n  TORCH_CHECK(pool_mode == 1, \"pool_mode only supports 'avg' currently\");\n  int batch_size = grad_input.size(0);\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad.dim());\n  auto grad_ = torch_mlu::cnnl::ops::cnnl_contiguous(grad, memory_format);\n  auto grad_input_ = at::empty({batch_size, channels, height, width},\n                               grad.options(), memory_format)\n                         .zero_();\n\n  int boxes_num = rois.size(0);\n  int hi = grad.size(2);\n  int wi = grad.size(3);\n  int c = grad.size(1);\n\n  int no = grad_input.size(0);\n  int ho = grad_input.size(2);\n  int wo = grad_input.size(3);\n\n  // get tensor impl\n  auto grad_impl = torch_mlu::getMluTensorImpl(grad_);\n  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois);\n\n  // get the mlu ptr\n  auto grad_ptr = grad_impl->cnnlMalloc();\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto grad_input_ptr = grad_input_impl->cnnlMalloc();\n\n  MluOpTensorDescriptor grads_desc, rois_desc, argmax_y_desc, argmax_x_desc,\n      grad_input_desc;\n  grads_desc.set_with_layout(grad_, MLUOP_LAYOUT_NHWC);\n  rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);\n  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);\n\n  auto handle = mluOpGetCurrentHandle();\n  if (pool_mode == 0) {\n    
auto argmax_y_contiguous =\n        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);\n    auto argmax_x_contiguous =\n        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);\n    auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);\n    auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);\n    auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();\n    auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();\n    argmax_y_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);\n    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);\n    TORCH_MLUOP_CHECK(mluOpRoiAlignBackward_v2(\n        handle, grads_desc.desc(), grad_ptr, rois_desc.desc(), rois_ptr,\n        argmax_y_desc.desc(), argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr,\n        spatial_scale, sampling_ratio, aligned, pool_mode,\n        grad_input_desc.desc(), grad_input_ptr));\n  } else {\n    TORCH_MLUOP_CHECK(mluOpRoiAlignBackward_v2(\n        handle, grads_desc.desc(), grad_ptr, rois_desc.desc(), rois_ptr, NULL,\n        NULL, NULL, NULL, spatial_scale, sampling_ratio, aligned, pool_mode,\n        grad_input_desc.desc(), grad_input_ptr));\n  }\n  grad_input.copy_(grad_input_);\n}\n\nvoid roi_align_forward_mlu(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                           int aligned_width, float spatial_scale,\n                           int sampling_ratio, int pool_mode, bool aligned) {\n  ROIAlignForwardMLUKernelLauncher(input, rois, output, argmax_y, argmax_x,\n                                   aligned_height, aligned_width, spatial_scale,\n                                   sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                            Tensor argmax_x, Tensor grad_input,\n                            int aligned_height, int aligned_width,\n     
                       float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignBackwardMLUKernelLauncher(\n      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,\n      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, MLU, roi_align_forward_mlu);\nREGISTER_DEVICE_IMPL(roi_align_backward_impl, MLU, roi_align_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/roi_align_rotated_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,\n                                             Tensor output, int pooled_height,\n                                             int pooled_width,\n                                             float spatial_scale,\n                                             int sampling_ratio, bool aligned,\n                                             bool clockwise) {\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());\n  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);\n  auto rois_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());\n  auto output_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);\n\n  MluOpTensorDescriptor input_desc, rois_desc, output_desc;\n  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);\n  rois_desc.set(rois_contiguous);\n  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(input_);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);\n  auto rois_ptr = 
rois_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpRoiAlignRotatedForward(\n      handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,\n      pooled_height, pooled_width, sampling_ratio, spatial_scale, aligned,\n      clockwise, output_desc.desc(), output_ptr));\n\n  output.copy_(output_contiguous);\n}\n\nvoid ROIAlignRotatedBackwardMLUKernelLauncher(\n    Tensor top_grad, Tensor rois, Tensor bottom_grad, int pooled_height,\n    int pooled_width, float spatial_scale, int sampling_ratio, bool aligned,\n    bool clockwise) {\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());\n  auto top_grad_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);\n  auto rois_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());\n  auto bottom_grad_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(bottom_grad, memory_format);\n\n  // get ptr of tensors\n  auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_);\n  auto top_grad_ptr = top_grad_impl->cnnlMalloc();\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_);\n  auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();\n\n  MluOpTensorDescriptor top_grad_desc, rois_desc, bottom_grad_desc;\n  top_grad_desc.set_with_layout(top_grad_, MLUOP_LAYOUT_NHWC);\n  rois_desc.set(rois_contiguous);\n  bottom_grad_desc.set_with_layout(bottom_grad_, MLUOP_LAYOUT_NHWC);\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpRoiAlignRotatedBackward(\n      handle, top_grad_desc.desc(), top_grad_ptr, rois_desc.desc(), rois_ptr,\n      pooled_height, pooled_width, sampling_ratio, 
spatial_scale, aligned,\n      clockwise, bottom_grad_desc.desc(), bottom_grad_ptr));\n  bottom_grad.copy_(bottom_grad_);\n}\n\nvoid roi_align_rotated_forward_mlu(Tensor input, Tensor rois, Tensor output,\n                                   int aligned_height, int aligned_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   bool aligned, bool clockwise) {\n  ROIAlignRotatedForwardMLUKernelLauncher(input, rois, output, aligned_height,\n                                          aligned_width, spatial_scale,\n                                          sampling_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward_mlu(Tensor top_grad, Tensor rois,\n                                    Tensor bottom_grad, int aligned_height,\n                                    int aligned_width, float spatial_scale,\n                                    int sampling_ratio, bool aligned,\n                                    bool clockwise) {\n  ROIAlignRotatedBackwardMLUKernelLauncher(\n      top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,\n      sampling_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise);\n\nREGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MLU,\n                     
roi_align_rotated_forward_mlu);\nREGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MLU,\n                     roi_align_rotated_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/roi_pool_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"pytorch_device_registry.hpp\"\n#include \"pytorch_mlu_helper.hpp\"\n\nvoid KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,\n                          cnrtQueue_t queue, cnrtDataType_t data_type,\n                          const void *input_data, const void *input_rois,\n                          const int batch, const int channels, const int height,\n                          const int width, const int pooled_height,\n                          const int pooled_width, const int rois_num,\n                          const float spatial_scale, void *output_data,\n                          int *argmax);\n\nvoid KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,\n                           cnrtQueue_t queue, cnrtDataType_t k_dtype,\n                           const void *grad_output_ptr, const void *rois_ptr,\n                           const int *argmax_ptr, void *grad_input_ptr,\n                           const int box_num, const int pooled_height,\n                           const int pooled_width, const int channels,\n                           const int batch, const int height, const int width,\n                           const float spatial_scale);\n\n// policy function for forward\nstatic void policyFuncForward(const int bin_num, 
cnrtDim3_t *k_dim,\n                              cnrtFunctionType_t *k_type) {\n  auto core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);\n  auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);\n  *k_type = CNRT_FUNC_TYPE_UNION1;\n  k_dim->x = core_num;\n  unsigned int use_cluster = bin_num / core_num + (bin_num % core_num > 0);\n  k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;\n  k_dim->z = 1;\n}\n\nvoid ROIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                     Tensor argmax, int pooled_height,\n                                     int pooled_width, float spatial_scale) {\n  // Check dtype.\n  TORCH_CHECK(\n      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,\n      \"input type should be Float or Half, got \", input.scalar_type());\n  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),\n              \"rois should have the same type as input\");\n\n  // Check dtype relationship.\n  TORCH_CHECK(\n      argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,\n      \"argmax type should be Int or Long, got \", argmax.scalar_type());\n\n  // Check shape.\n  TORCH_CHECK(input.dim() == 4, \"input should be 4d tensor, got \", input.dim(),\n              \"D\");\n  TORCH_CHECK(rois.dim() == 2, \"rois should be 2d tensor, got \", rois.dim(),\n              \"D\");\n  TORCH_CHECK(argmax.dim() == 4, \"argmax should be 4d tensor, got \",\n              argmax.dim(), \"D\");\n\n  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,\n              \"spatial_scale should be within (0, 1], got \", spatial_scale);\n\n  // compute kernel params\n  auto batch = input.size(0);\n  auto height = input.size(2);\n  auto width = input.size(3);\n  auto channels = input.size(1);\n  auto rois_num = output.size(0);\n\n  if (output.numel() == 0) {\n    output = at::zeros({rois_num, channels, pooled_height, pooled_width},\n                       
input.options());\n    return;\n  }\n  if (argmax.numel() == 0) {\n    argmax = at::zeros({rois_num, channels, pooled_height, pooled_width},\n                       argmax.options());\n    return;\n  }\n\n  // zero element check\n  if (input.numel() == 0 || rois.numel() == 0 || output.numel() == 0 ||\n      argmax.numel() == 0) {\n    return;\n  }\n\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());\n  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);\n\n  at::Tensor output_ =\n      at::empty({rois_num, channels, pooled_height, pooled_width},\n                input.options(), memory_format);\n  at::Tensor argmax_ =\n      at::empty({rois_num, channels, pooled_height, pooled_width},\n                argmax.options(), memory_format);\n\n  // calculate task dimension\n  cnrtDim3_t k_dim;\n  cnrtFunctionType_t k_type;\n  policyFuncForward(rois_num * pooled_height * pooled_width, &k_dim, &k_type);\n\n  // get compute queue\n  auto queue = torch_mlu::getCurQueue();\n\n  // get ptr of tensors\n  auto input_impl = torch_mlu::getMluTensorImpl(input_);\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois);\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output_);\n  auto output_ptr = output_impl->cnnlMalloc();\n  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);\n  auto argmax_ptr = argmax_impl->cnnlMalloc();\n\n  // get comput dtype of input\n  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input_.dtype());\n\n  // launch kernel\n  CNLOG(INFO) << \"Launch Kernel MLUKernelRoiPoolForward<<<\" << k_dim.x << \", \"\n              << k_dim.y << \", \" << k_dim.z << \">>>\";\n\n  KernelRoiPoolForward(k_dim, k_type, queue, data_type, input_ptr, rois_ptr,\n                       batch, channels, height, width, pooled_height,\n                       pooled_width, rois_num, spatial_scale, output_ptr,\n          
             (int *)argmax_ptr);\n  output.copy_(output_);\n  argmax.copy_(argmax_);\n}\n\n// policy function for backward\nstatic void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {\n  *k_type = CNRT_FUNC_TYPE_UNION1;\n  k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);\n  k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);\n  k_dim->z = 1;\n}\n\nvoid ROIPoolBackwardMLUKernelLauncher(Tensor grad_output, Tensor rois,\n                                      Tensor argmax, Tensor grad_input,\n                                      int pooled_height, int pooled_width,\n                                      float spatial_scale) {\n  // Check dtype.\n  TORCH_CHECK(\n      argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,\n      \"argmax type should be Int or Long, got \", argmax.scalar_type());\n  TORCH_CHECK((grad_output.scalar_type() == at::kFloat ||\n               grad_output.scalar_type() == at::kHalf),\n              \"grad_output type should be FLoat or Half, got \",\n              grad_output.scalar_type());\n\n  // Check dtype relationship.\n  TORCH_CHECK((rois.scalar_type() == grad_output.scalar_type()),\n              \"rois should have the same type as grad_output\");\n\n  // Check shape.\n  TORCH_CHECK(grad_output.dim() == 4, \"grad_output should be 4d tensor, got \",\n              grad_output.dim(), \"D\");\n  TORCH_CHECK(rois.dim() == 2, \"rois should be 2d tensor, got \", rois.dim(),\n              \"D\");\n  TORCH_CHECK(argmax.dim() == 4, \"argmax should be 4d tensor, got \",\n              argmax.dim(), \"D\");\n\n  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,\n              \"spatial_scale should be within (0, 1], got \", spatial_scale);\n\n  // Check relationship between tensor.\n  // Check the relationship of n.\n  TORCH_CHECK(grad_output.size(0) == rois.size(0),\n              \"grad_output.size(0) = \", grad_output.size(0),\n              \", while rois.size(0) = \", 
rois.size(0),\n              \". They should be the same.\");\n\n  // Check the relationship of channels.\n  TORCH_CHECK(grad_output.size(1) == argmax.size(1),\n              \"grad_output.size(1) = \", grad_output.size(1),\n              \", while argmax.size(1) = \", argmax.size(1),\n              \". They should be the same.\");\n\n  // Check the relationship of height and width.\n  TORCH_CHECK(grad_output.size(2) == argmax.size(2),\n              \"argmax.size(2) = \", argmax.size(2),\n              \", while grad_output.size(2) = \", grad_output.size(2),\n              \". They should be the same.\");\n  TORCH_CHECK(grad_output.size(3) == argmax.size(3),\n              \"argmax.size(3) = \", argmax.size(3),\n              \", while grad_output.size(3) = \", grad_output.size(3),\n              \". They should be the same.\");\n\n  // Check zero element.\n  if (grad_output.numel() == 0 || rois.numel() == 0 || argmax.numel() == 0 ||\n      grad_input.numel() == 0) {\n    // return if zero-element\n    return;\n  }\n\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());\n  auto grad_output_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);\n  auto argmax_ = torch_mlu::cnnl::ops::cnnl_contiguous(argmax, memory_format);\n\n  int boxes_num = grad_output.size(0);\n  int no = grad_input.size(0);\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n  auto grad_input_ = at::empty({no, channels, height, width},\n                               grad_input.options(), memory_format)\n                         .zero_();\n\n  // get tensor impl\n  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois);\n  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);\n  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);\n\n  // get compute queue\n  auto queue = 
torch_mlu::getCurQueue();\n\n  // get mlu ptr\n  auto grad_output_ptr = grad_output_impl->cnnlMalloc();\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto argmax_ptr = argmax_impl->cnnlMalloc();\n  auto grad_input_ptr = grad_input_impl->cnnlMalloc();\n\n  // calculate task dimension\n  cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad_input.dtype());\n  cnrtDim3_t k_dim;\n  cnrtFunctionType_t k_type;\n  policyFuncBackward(&k_dim, &k_type);\n\n  CNLOG(INFO) << \"Launch Kernel MLUKernelRoiPoolBackward<<<\" << k_dim.x << \", \"\n              << k_dim.y << \", \" << k_dim.z << \">>>\";\n\n  KernelRoiPoolBackward(k_dim, k_type, queue, k_dtype, grad_output_ptr,\n                        rois_ptr, (int *)argmax_ptr, grad_input_ptr, boxes_num,\n                        pooled_height, pooled_width, channels, no, height,\n                        width, spatial_scale);\n\n  grad_input.copy_(grad_input_);\n}\n\nvoid roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor output,\n                          Tensor argmax, int pooled_height, int pooled_width,\n                          float spatial_scale) {\n  ROIPoolForwardMLUKernelLauncher(input, rois, output, argmax, pooled_height,\n                                  pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax,\n                           Tensor grad_input, int pooled_height,\n                           int pooled_width, float spatial_scale) {\n  ROIPoolBackwardMLUKernelLauncher(grad_output, rois, argmax, grad_input,\n                                   pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale);\n\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                   
         int pooled_width, float spatial_scale);\n\nREGISTER_DEVICE_IMPL(roi_pool_forward_impl, MLU, roi_pool_forward_mlu);\nREGISTER_DEVICE_IMPL(roi_pool_backward_impl, MLU, roi_pool_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/roiaware_pool3d_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid RoiawarePool3dForwardMLUKernelLauncher(\n    const int pool_method, const int boxes_num, const int pts_num,\n    const int channels, const int max_pts_each_voxel, const int out_x,\n    const int out_y, const int out_z, const Tensor rois, const Tensor pts,\n    const Tensor pts_feature, Tensor pts_idx_of_voxels, Tensor pooled_features,\n    Tensor argmax) {\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n\n  auto rois_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());\n  auto pts_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(pts, pts.suggest_memory_format());\n  auto pts_feature_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pts_feature, pts_feature.suggest_memory_format());\n  auto argmax_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      argmax, argmax.suggest_memory_format());\n  auto pts_idx_of_voxels_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pts_idx_of_voxels, pts_idx_of_voxels.suggest_memory_format());\n  auto pooled_features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pooled_features, pooled_features.suggest_memory_format());\n\n  MluOpTensorDescriptor rois_desc, pts_desc, pts_feature_desc, argmax_desc,\n  
    pts_idx_of_voxels_desc, pooled_features_desc;\n  rois_desc.set(rois_contiguous);\n  pts_desc.set(pts_contiguous);\n  pts_feature_desc.set(pts_feature_contiguous);\n  argmax_desc.set(argmax_contiguous);\n  pts_idx_of_voxels_desc.set(pts_idx_of_voxels_contiguous);\n  pooled_features_desc.set(pooled_features_contiguous);\n\n  // allocate extra space for workspace\n  size_t workspace_size = 0;\n  TORCH_MLUOP_CHECK(mluOpGetRoiawarePool3dForwardWorkspaceSize(\n      handle, rois_desc.desc(), pts_desc.desc(), pts_feature_desc.desc(),\n      &workspace_size));\n\n  auto workspace = at::empty(workspace_size, rois.options().dtype(at::kByte));\n  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);\n  auto workspace_ptr = workspace_impl->cnnlMalloc();\n\n  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);\n  auto pts_impl = torch_mlu::getMluTensorImpl(pts_contiguous);\n  auto pts_feature_impl = torch_mlu::getMluTensorImpl(pts_feature_contiguous);\n  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_contiguous);\n  auto pts_idx_of_voxels_impl =\n      torch_mlu::getMluTensorImpl(pts_idx_of_voxels_contiguous);\n  auto pooled_features_impl =\n      torch_mlu::getMluTensorImpl(pooled_features_contiguous);\n\n  auto rois_ptr = rois_impl->cnnlMalloc();\n  auto pts_ptr = pts_impl->cnnlMalloc();\n  auto pts_feature_ptr = pts_feature_impl->cnnlMalloc();\n  auto argmax_ptr = argmax_impl->cnnlMalloc();\n  auto pts_idx_of_voxels_ptr = pts_idx_of_voxels_impl->cnnlMalloc();\n  auto pooled_features_ptr = pooled_features_impl->cnnlMalloc();\n\n  CNLOG(INFO) << \"Call mluOpRoiawarePool3dForward().\";\n  TORCH_MLUOP_CHECK(mluOpRoiawarePool3dForward(\n      handle, pool_method, boxes_num, pts_num, channels, rois_desc.desc(),\n      rois_ptr, pts_desc.desc(), pts_ptr, pts_feature_desc.desc(),\n      pts_feature_ptr, workspace_ptr, workspace_size, max_pts_each_voxel, out_x,\n      out_y, out_z, argmax_desc.desc(), argmax_ptr,\n      pts_idx_of_voxels_desc.desc(), 
pts_idx_of_voxels_ptr,\n      pooled_features_desc.desc(), pooled_features_ptr));\n}\n\nvoid roiaware_pool3d_forward_mlu(int boxes_num, int pts_num, int channels,\n                                 int max_pts_each_voxel, int out_x, int out_y,\n                                 int out_z, const Tensor rois, const Tensor pts,\n                                 const Tensor pts_feature, Tensor argmax,\n                                 Tensor pts_idx_of_voxels,\n                                 Tensor pooled_features, int pool_method) {\n  RoiawarePool3dForwardMLUKernelLauncher(\n      pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,\n      out_y, out_z, rois, pts, pts_feature, pts_idx_of_voxels, pooled_features,\n      argmax);\n}\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method);\n\nREGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, MLU,\n                     roiaware_pool3d_forward_mlu);\n\nvoid RoiawarePool3dBackwardMLUKernelLauncher(\n    int pool_method, int boxes_num, int out_x, int out_y, int out_z,\n    int channels, int max_pts_each_voxel, const Tensor pts_idx_of_voxels,\n    const Tensor argmax, const Tensor grad_out, Tensor grad_in) {\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n  auto pts_idx_of_voxels_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pts_idx_of_voxels, pts_idx_of_voxels.suggest_memory_format());\n  auto argmax_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      argmax, argmax.suggest_memory_format());\n  auto grad_out_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      
grad_out, grad_out.suggest_memory_format());\n  auto grad_in_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      grad_in, grad_in.suggest_memory_format());\n\n  MluOpTensorDescriptor pts_idx_of_voxels_desc, argmax_desc, grad_out_desc,\n      grad_in_desc;\n\n  pts_idx_of_voxels_desc.set(pts_idx_of_voxels_contiguous);\n  argmax_desc.set(argmax_contiguous);\n  grad_out_desc.set(grad_out_contiguous);\n  grad_in_desc.set(grad_in_contiguous);\n\n  auto pts_idx_of_voxels_impl =\n      torch_mlu::getMluTensorImpl(pts_idx_of_voxels_contiguous);\n  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_contiguous);\n  auto grad_out_impl = torch_mlu::getMluTensorImpl(grad_out_contiguous);\n  auto grad_in_impl = torch_mlu::getMluTensorImpl(grad_in_contiguous);\n\n  auto pts_idx_of_voxels_ptr = pts_idx_of_voxels_impl->cnnlMalloc();\n  auto argmax_ptr = argmax_impl->cnnlMalloc();\n  auto grad_out_ptr = grad_out_impl->cnnlMalloc();\n  auto grad_in_ptr = grad_in_impl->cnnlMalloc();\n\n  CNLOG(INFO) << \"Call mluOpRoiawarePool3dBackward().\";\n  TORCH_MLUOP_CHECK(mluOpRoiawarePool3dBackward(\n      handle, pool_method, boxes_num, out_x, out_y, out_z, channels,\n      max_pts_each_voxel, pts_idx_of_voxels_desc.desc(), pts_idx_of_voxels_ptr,\n      argmax_desc.desc(), argmax_ptr, grad_out_desc.desc(), grad_out_ptr,\n      grad_in_desc.desc(), grad_in_ptr));\n}\n\nvoid roiaware_pool3d_backward_mlu(int boxes_num, int out_x, int out_y,\n                                  int out_z, int channels,\n                                  int max_pts_each_voxel,\n                                  const Tensor pts_idx_of_voxels,\n                                  const Tensor argmax, const Tensor grad_out,\n                                  Tensor grad_in, int pool_method) {\n  RoiawarePool3dBackwardMLUKernelLauncher(\n      pool_method, boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,\n      pts_idx_of_voxels, argmax, grad_out, grad_in);\n}\n\nvoid 
roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method);\n\nREGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, MLU,\n                     roiaware_pool3d_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/roipoint_pool3d_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid RoIPointPool3dForwardMLUKernelLauncher(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,\n    const Tensor pts_feature, Tensor pooled_features,\n    Tensor pooled_empty_flag) {\n  // check datatype\n  TORCH_CHECK(((xyz.scalar_type() == pooled_features.scalar_type()) &&\n               (boxes3d.scalar_type() == pooled_features.scalar_type()) &&\n               (pts_feature.scalar_type() == pooled_features.scalar_type())),\n              \"data types of xyz, boxes3d, pts_feature and pooled_features \"\n              \"should be the same, \",\n              \"but now xyz type is \", xyz.scalar_type(), \", boxes3d type is \",\n              boxes3d.scalar_type(), \", pts_feature type is \",\n              pts_feature.scalar_type(), \", pooled_features type is \",\n              pooled_features.scalar_type(), \".\");\n  TORCH_CHECK(\n      (xyz.scalar_type() == at::kFloat || xyz.scalar_type() == at::kHalf),\n      \"xyz type should be Float or Half, got \", xyz.scalar_type(), \".\");\n  TORCH_CHECK((pooled_empty_flag.scalar_type() == at::kInt),\n              \"pooled_empty_flag type should be Int, got \",\n              pooled_empty_flag.scalar_type(), 
\".\");\n\n  // check shape\n  TORCH_CHECK(boxes3d.dim() == 3, \"boxes3d should be a 3d tensor, got \",\n              boxes3d.dim(), \"D.\");\n  TORCH_CHECK(pts_feature.dim() == 3, \"pts_feature should be a 3d tensor, got \",\n              pts_feature.dim(), \"D.\");\n\n  TORCH_CHECK(boxes3d.size(2) == 7,\n              \"the 3rd dimensions of boxes3d should be 7, got \",\n              boxes3d.size(2), \".\");\n  TORCH_CHECK((boxes3d.size(0) == batch_size),\n              \"the 1st dimensions of boxes3d should be batch_size, \",\n              \"but now the 1st dimension of boxes3d is \", boxes3d.size(0),\n              \", and batch_size is \", batch_size, \".\");\n  TORCH_CHECK((pts_feature.size(0) == batch_size),\n              \"the 1st dimensions of pts_feature should be batch_size, \",\n              \"but now the 1st dimension of pts_feature is \",\n              pts_feature.size(0), \", and batch_size is \", batch_size, \".\");\n  TORCH_CHECK((pts_feature.size(1) == pts_num),\n              \"the 2nd dimensions of pts_feature should be pts_num, \",\n              \"but now the 2nd dimension of pts_feature is \",\n              pts_feature.size(1), \", and pts_num is \", pts_num, \".\");\n\n  // check zero element\n  if (xyz.numel() == 0 || pts_feature.numel() == 0 || boxes3d.numel() == 0 ||\n      pooled_features.numel() == 0 || pooled_empty_flag.numel() == 0) {\n    return;\n  }\n\n  // large tensor check\n  const size_t max_input_size = 2147483648;\n  TORCH_CHECK(xyz.numel() < max_input_size,\n              \"xyz element num should be less than 2^31, got \", xyz.numel(),\n              \".\");\n  TORCH_CHECK(boxes3d.numel() < max_input_size,\n              \"boxes3d element num should be less than 2^31, got \",\n              boxes3d.numel(), \".\");\n  TORCH_CHECK(pts_feature.numel() < max_input_size,\n              \"pts_feature element num should be less than 2^31, got \",\n              pts_feature.numel(), \".\");\n\n  // set contiguous\n  auto 
xyz_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(xyz, xyz.suggest_memory_format());\n  auto pts_feature_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pts_feature, pts_feature.suggest_memory_format());\n  auto boxes3d_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      boxes3d, boxes3d.suggest_memory_format());\n  auto pooled_features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pooled_features, pooled_features.suggest_memory_format());\n  auto pooled_empty_flag_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      pooled_empty_flag, pooled_empty_flag.suggest_memory_format());\n\n  // get ptr of tensors\n  auto xyz_impl = torch_mlu::getMluTensorImpl(xyz_contiguous);\n  auto xyz_ptr = xyz_impl->cnnlMalloc();\n  auto pts_feature_impl = torch_mlu::getMluTensorImpl(pts_feature_contiguous);\n  auto pts_feature_ptr = pts_feature_impl->cnnlMalloc();\n  auto boxes3d_impl = torch_mlu::getMluTensorImpl(boxes3d_contiguous);\n  auto boxes3d_ptr = boxes3d_impl->cnnlMalloc();\n  auto pooled_features_impl =\n      torch_mlu::getMluTensorImpl(pooled_features_contiguous);\n  auto pooled_features_ptr = pooled_features_impl->cnnlMalloc();\n  auto pooled_empty_flag_impl =\n      torch_mlu::getMluTensorImpl(pooled_empty_flag_contiguous);\n  auto pooled_empty_flag_ptr = pooled_empty_flag_impl->cnnlMalloc();\n\n  // create tensor descriptors\n  MluOpTensorDescriptor xyz_desc, pts_feature_desc, boxes3d_desc,\n      pooled_features_desc, pooled_empty_flag_desc;\n  xyz_desc.set(xyz_contiguous);\n  pts_feature_desc.set(pts_feature_contiguous);\n  boxes3d_desc.set(boxes3d_contiguous);\n  pooled_features_desc.set(pooled_features_contiguous);\n  pooled_empty_flag_desc.set(pooled_empty_flag_contiguous);\n\n  // get workspace\n  size_t workspace_size = 0;\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpGetRoiPointPool3dWorkspaceSize(\n      handle, batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,\n      
xyz_desc.desc(), pts_feature_desc.desc(), boxes3d_desc.desc(),\n      pooled_features_desc.desc(), pooled_empty_flag_desc.desc(),\n      &workspace_size));\n\n  auto workspace = at::empty(workspace_size, xyz.options().dtype(at::kByte));\n  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);\n  auto workspace_ptr = workspace_impl->cnnlMalloc();\n  TORCH_MLUOP_CHECK(mluOpRoiPointPool3d(\n      handle, batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,\n      xyz_desc.desc(), xyz_ptr, pts_feature_desc.desc(), pts_feature_ptr,\n      boxes3d_desc.desc(), boxes3d_ptr, workspace_ptr, workspace_size,\n      pooled_features_desc.desc(), pooled_features_ptr,\n      pooled_empty_flag_desc.desc(), (int *)pooled_empty_flag_ptr));\n}\n\nvoid roipoint_pool3d_forward_mlu(int batch_size, int pts_num, int boxes_num,\n                                 int feature_in_len, int sampled_pts_num,\n                                 const Tensor xyz, const Tensor boxes3d,\n                                 const Tensor pts_feature,\n                                 Tensor pooled_features,\n                                 Tensor pooled_empty_flag) {\n  RoIPointPool3dForwardMLUKernelLauncher(\n      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,\n      boxes3d, pts_feature, pooled_features, pooled_empty_flag);\n}\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag);\n\nREGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, MLU,\n                     roipoint_pool3d_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/rotated_feature_align_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid RotatedFeatureAlignForwardMLUKernelLauncher(const Tensor features,\n                                                 const Tensor best_bboxes,\n                                                 const float spatial_scale,\n                                                 const int points,\n                                                 Tensor output) {\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(features.dim());\n  auto features_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(features, memory_format);\n  auto best_bboxes_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      best_bboxes, best_bboxes.suggest_memory_format());\n  auto output_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);\n\n  MluOpTensorDescriptor features_desc, best_bboxes_desc, output_desc;\n  features_desc.set_with_layout(features_, MLUOP_LAYOUT_NHWC);\n  best_bboxes_desc.set(best_bboxes_contiguous);\n  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);\n\n  // get ptr of tensors\n  auto features_impl = torch_mlu::getMluTensorImpl(features_);\n  auto features_ptr = features_impl->cnnlMalloc();\n  auto best_bboxes_impl = torch_mlu::getMluTensorImpl(best_bboxes_contiguous);\n  
auto best_bboxes_ptr = best_bboxes_impl->cnnlMalloc();\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpRotatedFeatureAlignForward(\n      handle, features_desc.desc(), features_ptr, best_bboxes_desc.desc(),\n      best_bboxes_ptr, spatial_scale, points, output_desc.desc(), output_ptr));\n\n  output.copy_(output_contiguous);\n}\n\nvoid RotatedFeatureAlignBackwardMLUKernelLauncher(const Tensor top_grad,\n                                                  const Tensor best_bboxes,\n                                                  const float spatial_scale,\n                                                  const int points,\n                                                  Tensor bottom_grad) {\n  auto memory_format =\n      torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());\n  auto top_grad_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);\n  auto best_bboxes_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      best_bboxes, best_bboxes.suggest_memory_format());\n  auto bottom_grad_ =\n      torch_mlu::cnnl::ops::cnnl_contiguous(bottom_grad, memory_format);\n\n  // get ptr of tensors\n  auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_);\n  auto top_grad_ptr = top_grad_impl->cnnlMalloc();\n  auto best_bboxes_impl = torch_mlu::getMluTensorImpl(best_bboxes_contiguous);\n  auto best_bboxes_ptr = best_bboxes_impl->cnnlMalloc();\n  auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_);\n  auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();\n\n  MluOpTensorDescriptor top_grad_desc, best_bboxes_desc, bottom_grad_desc;\n  top_grad_desc.set_with_layout(top_grad_, MLUOP_LAYOUT_NHWC);\n  best_bboxes_desc.set(best_bboxes_contiguous);\n  bottom_grad_desc.set_with_layout(bottom_grad_, MLUOP_LAYOUT_NHWC);\n\n  // get compute handle\n  auto handle 
= mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpRotatedFeatureAlignBackward(\n      handle, top_grad_desc.desc(), top_grad_ptr, best_bboxes_desc.desc(),\n      best_bboxes_ptr, spatial_scale, points, bottom_grad_desc.desc(),\n      bottom_grad_ptr));\n  bottom_grad.copy_(bottom_grad_);\n}\n\nvoid rotated_feature_align_forward_mlu(const Tensor features,\n                                       const Tensor best_bboxes,\n                                       const float spatial_scale,\n                                       const int points, Tensor output) {\n  RotatedFeatureAlignForwardMLUKernelLauncher(features, best_bboxes,\n                                              spatial_scale, points, output);\n}\n\nvoid rotated_feature_align_backward_mlu(const Tensor top_grad,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor bottom_grad) {\n  RotatedFeatureAlignBackwardMLUKernelLauncher(\n      top_grad, best_bboxes, spatial_scale, points, bottom_grad);\n}\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output);\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad);\n\nREGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, MLU,\n                     rotated_feature_align_forward_mlu);\nREGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, MLU,\n                     rotated_feature_align_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/scatter_points_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2023 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nstd::vector<Tensor> dynamic_point_to_voxel_forward_mlu(\n    const Tensor &feats, const Tensor &coors, const reduce_t reduce_type) {\n  // params check\n  TORCH_CHECK(feats.scalar_type() == at::kFloat,\n              \"feats type should be Float, got \", feats.scalar_type());\n  TORCH_CHECK(coors.scalar_type() == at::kInt,\n              \"coors type should be Int32, got \", coors.scalar_type());\n  TORCH_CHECK(feats.size(0) == coors.size(0),\n              \"feats.dim(0) and coors.dim(0) should be same, got \",\n              feats.size(0), \" vs \", coors.size(0));\n\n  const int num_input = feats.size(0);\n  const int num_feats = feats.size(1);\n  // zero-element check\n  if (num_input == 0)\n    return {feats.clone().detach(), coors.clone().detach(),\n            coors.new_empty({0}, torch::kInt32),\n            coors.new_empty({0}, torch::kInt32)};\n\n  auto mlu_reduce_type = getMluOpReduceMode(reduce_type);\n  auto reduced_feats = at::empty({num_input, num_feats}, feats.options());\n  auto out_coors = at::empty({num_input, 3}, coors.options());\n  auto coors_map = at::empty({num_input}, coors.options());\n  auto reduce_count = at::empty({num_input}, coors.options());\n  auto voxel_num = at::empty({1}, coors.options());\n\n  
INITIAL_MLU_PARAM_WITH_TENSOR(feats);\n  INITIAL_MLU_PARAM_WITH_TENSOR(coors);\n  INITIAL_MLU_PARAM_WITH_TENSOR(reduced_feats);\n  INITIAL_MLU_PARAM_WITH_TENSOR(out_coors);\n  INITIAL_MLU_PARAM_WITH_TENSOR(coors_map);\n  INITIAL_MLU_PARAM_WITH_TENSOR(reduce_count);\n  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num);\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n\n  size_t workspace_size;\n  TORCH_MLUOP_CHECK(mluOpGetDynamicPointToVoxelForwardWorkspaceSize(\n      handle, feats_desc.desc(), coors_desc.desc(), &workspace_size));\n  auto workspace_tensor =\n      at::empty(workspace_size, feats.options().dtype(at::kByte));\n  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);\n\n  // launch kernel\n  TORCH_MLUOP_CHECK(mluOpDynamicPointToVoxelForward(\n      handle, mlu_reduce_type, feats_desc.desc(), feats_ptr, coors_desc.desc(),\n      coors_ptr, workspace_tensor_ptr, workspace_size,\n      reduced_feats_desc.desc(), reduced_feats_ptr, out_coors_desc.desc(),\n      out_coors_ptr, coors_map_desc.desc(), coors_map_ptr,\n      reduce_count_desc.desc(), reduce_count_ptr, voxel_num_desc.desc(),\n      voxel_num_ptr));\n\n  int voxel_num_value = *static_cast<int *>(voxel_num.cpu().data_ptr());\n  TORCH_CHECK(voxel_num_value <= feats.size(0),\n              \"voxel_num should be less than or equal to feats_num, got \",\n              voxel_num_value, \" vs \", feats.size(0));\n  return {reduced_feats.slice(0, 0, voxel_num_value),\n          out_coors.slice(0, 0, voxel_num_value), coors_map,\n          reduce_count.slice(0, 0, voxel_num_value)};\n}\n\nvoid dynamic_point_to_voxel_backward_mlu(\n    Tensor &grad_feats, const Tensor &grad_reduced_feats, const Tensor &feats,\n    const Tensor &reduced_feats, const Tensor &coors_idx,\n    const Tensor &reduce_count, const reduce_t reduce_type) {\n  // params check\n  TORCH_CHECK(grad_reduced_feats.scalar_type() == at::kFloat,\n              \"grad_reduced_feats type should be Float, got \",\n              
grad_reduced_feats.scalar_type());\n  TORCH_CHECK(feats.scalar_type() == at::kFloat,\n              \"feats type should be Float, got \", feats.scalar_type());\n  TORCH_CHECK(reduced_feats.scalar_type() == at::kFloat,\n              \"reduced_feats type should be Float, got \",\n              reduced_feats.scalar_type());\n  TORCH_CHECK(coors_idx.scalar_type() == at::kInt,\n              \"coors_idx type should be Int32, got \", coors_idx.scalar_type());\n  TORCH_CHECK(reduce_count.scalar_type() == at::kInt,\n              \"reduce_count type should be Int32, got \",\n              reduce_count.scalar_type());\n\n  const int num_input = feats.size(0);\n  const int num_reduced = reduced_feats.size(0);\n  const int num_feats = feats.size(1);\n\n  grad_feats.fill_(0);\n\n  // zero-element check\n  if (num_input == 0 || num_reduced == 0) return;\n\n  // TODO(miaochen): remove this after mlu-ops supports other mode of reduce.\n  TORCH_CHECK(reduce_type == reduce_t::MAX,\n              \"only supports max reduce in current version, got \",\n              to_string(reduce_type));\n\n  int voxel_num_value = reduced_feats.size(0);\n  auto opts = torch::TensorOptions().dtype(torch::kInt32);\n  auto voxel_num =\n      torch::from_blob(&voxel_num_value, {1}, opts).clone().to(at::kMLU);\n  auto mlu_reduce_type = getMluOpReduceMode(reduce_type);\n\n  INITIAL_MLU_PARAM_WITH_TENSOR(grad_feats);\n  INITIAL_MLU_PARAM_WITH_TENSOR(grad_reduced_feats);\n  INITIAL_MLU_PARAM_WITH_TENSOR(feats);\n  INITIAL_MLU_PARAM_WITH_TENSOR(reduced_feats);\n  INITIAL_MLU_PARAM_WITH_TENSOR(coors_idx);\n  INITIAL_MLU_PARAM_WITH_TENSOR(reduce_count);\n  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num);\n\n  // get compute handle\n  auto handle = mluOpGetCurrentHandle();\n\n  size_t workspace_size;\n  TORCH_MLUOP_CHECK(mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(\n      handle, mlu_reduce_type, grad_feats_desc.desc(), feats_desc.desc(),\n      grad_reduced_feats_desc.desc(), coors_idx_desc.desc(),\n      
reduce_count_desc.desc(), voxel_num_desc.desc(), &workspace_size));\n  auto workspace_tensor =\n      at::empty(workspace_size, feats.options().dtype(at::kByte));\n  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);\n\n  // launch kernel\n  TORCH_MLUOP_CHECK(mluOpDynamicPointToVoxelBackward(\n      handle, mlu_reduce_type, grad_reduced_feats_desc.desc(),\n      grad_reduced_feats_ptr, feats_desc.desc(), feats_ptr,\n      reduced_feats_desc.desc(), reduced_feats_ptr, coors_idx_desc.desc(),\n      coors_idx_ptr, reduce_count_desc.desc(), reduce_count_ptr,\n      voxel_num_desc.desc(), voxel_num_ptr, workspace_tensor_ptr,\n      workspace_size, grad_feats_desc.desc(), grad_feats_ptr));\n}\n\nstd::vector<Tensor> dynamic_point_to_voxel_forward_impl(\n    const Tensor &feats, const Tensor &coors, const reduce_t reduce_type);\n\nvoid dynamic_point_to_voxel_backward_impl(\n    Tensor &grad_feats, const Tensor &grad_reduced_feats, const Tensor &feats,\n    const Tensor &reduced_feats, const Tensor &coors_idx,\n    const Tensor &reduce_count, const reduce_t reduce_type);\n\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, MLU,\n                     dynamic_point_to_voxel_forward_mlu);\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, MLU,\n                     dynamic_point_to_voxel_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/sparse_conv_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include <torch/script.h>\n\n#include <vector>\n\n#include \"mlu_common_helper.h\"\n#include \"pytorch_device_registry.hpp\"\n#include \"pytorch_mlu_helper.hpp\"\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  // The following code is copied from\n  // mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu to ensure the output is\n  // available for network train. 
The outputs of this function have correct\n  // shape but wrong value.\n  auto numAct = indices.size(0);\n  auto kernelVolume = kernelSize[0];\n  int sub_m = (int)_subM;\n  int transpose = (int)_transpose;\n  int batch = (int)batchSize;\n  auto coorDim = indices.size(1) - 1;\n\n  for (int i = 1; i < kernelSize.size(); ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n\n  auto outputVolume = outSpatialShape[0];\n  for (int i = 1; i < outSpatialShape.size(); ++i) {\n    outputVolume *= outSpatialShape[i];\n  }\n  torch::Tensor indicePairs = at::full({kernelVolume, 2, numAct}, -1,\n                                       indices.options().dtype(at::kInt));\n  torch::Tensor indiceNum =\n      at::zeros({kernelVolume}, indices.options().dtype(at::kInt));\n  int out_size = sub_m == 1\n                     ? numAct\n                     : std::min(numAct * kernelVolume, batch * outputVolume);\n  torch::Tensor out_indices =\n      at::zeros({out_size, coorDim + 1}, indices.options().dtype(at::kInt));\n  auto indices_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      indices, at::MemoryFormat::Contiguous);\n  auto indicePairs_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      indicePairs, at::MemoryFormat::Contiguous);\n  auto indiceNum_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      indiceNum, at::MemoryFormat::Contiguous);\n  auto out_indices_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      out_indices, at::MemoryFormat::Contiguous);\n\n  std::vector<int> input_space;\n  std::vector<int> filter_space;\n  std::vector<int> output_space;\n  std::vector<int> padding32;\n  std::vector<int> stride32;\n  std::vector<int> dilation32;\n  for (int i = 0; i < NDim; i++) {\n    input_space.push_back(spatialShape[i]);\n    filter_space.push_back(kernelSize[i]);\n    output_space.push_back(outSpatialShape[i]);\n    padding32.push_back(padding[i]);\n    stride32.push_back(stride[i]);\n    dilation32.push_back(dilation[i]);\n  }\n  MluOpTensorDescriptor 
indices_desc, out_indices_desc, indicePairs_desc,\n      indiceNum_desc;\n  indices_desc.set(indices_contiguous);\n  indicePairs_desc.set(indicePairs_contiguous);\n  indiceNum_desc.set(indiceNum_contiguous);\n  out_indices_desc.set(out_indices_contiguous);\n  {\n    mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY;\n    mluOpDataType_t dtype = MLUOP_DTYPE_INT32;\n    std::vector<int> dims;\n    dims = {numAct, coorDim + 1};\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        indices_desc.desc(), layout, dtype, dims.size(), dims.data()));\n    dims = {kernelVolume, 2, numAct};\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        indicePairs_desc.desc(), layout, dtype, dims.size(), dims.data()));\n    dims = {kernelVolume};\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        indiceNum_desc.desc(), layout, dtype, dims.size(), dims.data()));\n    dims = {out_size, coorDim + 1};\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        out_indices_desc.desc(), layout, dtype, dims.size(), dims.data()));\n  }\n\n  mluOpSparseConvolutionDescriptor_t sparse_conv_desc;\n  TORCH_MLUOP_CHECK(mluOpCreateSparseConvolutionDescriptor(&sparse_conv_desc));\n  TORCH_MLUOP_CHECK(mluOpSetSparseConvolutionDescriptor(\n      sparse_conv_desc, NDim + 2, batch, padding32.data(), stride32.data(),\n      dilation32.data(), input_space.data(), filter_space.data(),\n      output_space.data(), sub_m, transpose, 0));\n\n  auto handle = mluOpGetCurrentHandle();\n  size_t workspace_size = 0;\n  TORCH_MLUOP_CHECK(mluOpGetIndicePairsWorkspaceSize(\n      handle, sparse_conv_desc, indices_desc.desc(), indicePairs_desc.desc(),\n      out_indices_desc.desc(), indiceNum_desc.desc(), &workspace_size));\n  auto indice_workspace_size =\n      at::empty(workspace_size, indices.options().dtype(at::kByte));\n\n  auto indices_impl = torch_mlu::getMluTensorImpl(indices_contiguous);\n  auto out_indices_impl = torch_mlu::getMluTensorImpl(out_indices_contiguous);\n  auto indicePairs_impl = 
torch_mlu::getMluTensorImpl(indicePairs_contiguous);\n  auto indiceNum_impl = torch_mlu::getMluTensorImpl(indiceNum_contiguous);\n  auto indice_workspace_impl =\n      torch_mlu::getMluTensorImpl(indice_workspace_size);\n\n  auto indices_ptr = indices_impl->cnnlMalloc();\n  auto out_indices_ptr = out_indices_impl->cnnlMalloc();\n  auto indicePairs_ptr = indicePairs_impl->cnnlMalloc();\n  auto indiceNum_ptr = indiceNum_impl->cnnlMalloc();\n  auto indice_workspace_ptr = indice_workspace_impl->cnnlMalloc();\n\n  TORCH_MLUOP_CHECK(mluOpGetIndicePairs(\n      handle, sparse_conv_desc, indices_desc.desc(), indices_ptr,\n      indice_workspace_ptr, workspace_size, indicePairs_desc.desc(),\n      indicePairs_ptr, out_indices_desc.desc(), out_indices_ptr,\n      indiceNum_desc.desc(), indiceNum_ptr));\n  int num_act_out = 0;\n  TORCH_MLUOP_CHECK(\n      mluOpGetSparseConvolutionNumActOut(sparse_conv_desc, &num_act_out));\n  TORCH_MLUOP_CHECK(mluOpDestroySparseConvolutionDescriptor(sparse_conv_desc));\n  if (!sub_m) {\n    return {out_indices.slice(0, 0, num_act_out), indicePairs, indiceNum};\n  } else {\n    return {indices, indicePairs, indiceNum};\n  }\n}\n\ntorch::Tensor IndiceConvForwardMLUKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,\n    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,\n    int64_t _subM) {\n  auto indice_num_cpu = indiceNum.to({torch::kCPU});\n  auto indice_num_cpu_64 = indice_num_cpu.to(torch::kInt64);\n  auto indice_num = indice_num_cpu_64.data_ptr<int64_t>();\n\n  // generate empty output\n  int C = filters.dim() == 4 ? 
filters.size(3) : filters.size(4);\n  torch::Tensor output =\n      at::zeros({numActOut, C}, features.options().dtype(at::kFloat));\n  // generate descriptor\n  auto features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      features, at::MemoryFormat::Contiguous);\n  auto filters_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      filters, at::MemoryFormat::Contiguous);\n  auto indice_pairs_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      indicePairs, at::MemoryFormat::Contiguous);\n  auto output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      output, at::MemoryFormat::Contiguous);\n\n  MluOpTensorDescriptor features_desc, filters_desc, indice_pairs_desc,\n      output_desc;\n  features_desc.set(features_contiguous);\n  filters_desc.set(filters_contiguous);\n  indice_pairs_desc.set(indice_pairs_contiguous);\n  output_desc.set(output_contiguous);\n\n  // set layout\n  {\n    mluOpTensorLayout_t layout;\n    mluOpDataType_t dtype;\n    int dim;\n    int dims[8];\n\n    // features_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(features_desc.desc(), &layout,\n                                               &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        features_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n\n    // filters_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(filters_desc.desc(), &layout,\n                                               &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        filters_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n\n    // indice_pairs_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(indice_pairs_desc.desc(),\n                                               &layout, &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        indice_pairs_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n\n    // output_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(output_desc.desc(), 
&layout,\n                                               &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        output_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n  }\n\n  auto handle = mluOpGetCurrentHandle();\n  size_t workspace_size = 0;\n  TORCH_MLUOP_CHECK(mluOpGetIndiceConvolutionForwardWorkspaceSize(\n      handle, features_desc.desc(), filters_desc.desc(),\n      indice_pairs_desc.desc(), output_desc.desc(), indice_num, numActOut,\n      _inverse, _subM, &workspace_size));\n\n  auto workspace =\n      at::empty(workspace_size, features.options().dtype(at::kByte));\n\n  auto features_impl = torch_mlu::getMluTensorImpl(features_contiguous);\n  auto filters_impl = torch_mlu::getMluTensorImpl(filters_contiguous);\n  auto indice_pairs_impl = torch_mlu::getMluTensorImpl(indice_pairs_contiguous);\n  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);\n\n  auto features_ptr = features_impl->cnnlMalloc();\n  auto filters_ptr = filters_impl->cnnlMalloc();\n  auto indice_pairs_ptr = indice_pairs_impl->cnnlMalloc();\n  auto workspace_ptr = workspace_impl->cnnlMalloc();\n\n  //  outputs\n  auto output_impl = torch_mlu::getMluTensorImpl(output);\n  auto output_ptr = output_impl->cnnlMalloc();\n  TORCH_MLUOP_CHECK(mluOpIndiceConvolutionForward(\n      handle, features_desc.desc(), features_ptr, filters_desc.desc(),\n      filters_ptr, indice_pairs_desc.desc(), indice_pairs_ptr, indice_num,\n      numActOut, _inverse, _subM, workspace_ptr, workspace_size,\n      output_desc.desc(), output_ptr));\n\n  return output;\n}\n\nstd::vector<torch::Tensor> IndiceConvBackwardMLUKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  auto indice_num_cpu = indiceNum.to({torch::kCPU});\n  auto indice_num_cpu_64 = indice_num_cpu.to(torch::kInt64);\n  auto indice_num = indice_num_cpu_64.data_ptr<int64_t>();\n\n 
 // generate empty input_grad\n  torch::Tensor input_grad = at::zeros({features.size(0), features.size(1)},\n                                       features.options().dtype(at::kFloat));\n  torch::Tensor filters_grad;\n  if (filters.dim() == 4) {\n    int h = filters.size(0);\n    int w = filters.size(1);\n    int c = filters.size(2);\n    int n = filters.size(3);\n    filters_grad = at::zeros({h, w, c, n}, filters.options().dtype(at::kFloat));\n  } else if (filters.dim() == 5) {\n    int d = filters.size(0);\n    int h = filters.size(1);\n    int w = filters.size(2);\n    int c = filters.size(3);\n    int n = filters.size(4);\n    filters_grad =\n        at::zeros({d, h, w, c, n}, filters.options().dtype(at::kFloat));\n  }\n\n  auto features_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      features, at::MemoryFormat::Contiguous);\n  auto filters_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      filters, at::MemoryFormat::Contiguous);\n  auto output_grad_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      outGrad, at::MemoryFormat::Contiguous);\n  auto indice_pairs_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      indicePairs, at::MemoryFormat::Contiguous);\n  auto input_grad_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      features, at::MemoryFormat::Contiguous);\n  auto filters_grad_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      filters, at::MemoryFormat::Contiguous);\n\n  MluOpTensorDescriptor features_desc, output_grad_desc, filters_desc,\n      indice_pairs_desc, input_grad_desc, filters_grad_desc;\n  features_desc.set(features_contiguous);\n  filters_desc.set(filters_contiguous);\n  output_grad_desc.set(output_grad_contiguous);\n  indice_pairs_desc.set(indice_pairs_contiguous);\n  input_grad_desc.set(input_grad_contiguous);\n  filters_grad_desc.set(filters_grad_contiguous);\n\n  // need to set desc layout with mluOp functions\n  {\n    mluOpTensorLayout_t layout;\n    mluOpDataType_t dtype;\n    int 
dim;\n    int dims[8];\n\n    // features_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(features_desc.desc(), &layout,\n                                               &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        features_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n\n    // filters_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(filters_desc.desc(), &layout,\n                                               &dtype, &dim, dims));\n    if (dim == 4) {\n      TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n          filters_desc.desc(), MLUOP_LAYOUT_HWCN, dtype, dim, dims));\n    } else {\n      TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n          filters_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n    }\n\n    // output_grad_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(output_grad_desc.desc(), &layout,\n                                               &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        output_grad_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n\n    // indice_pairs_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(indice_pairs_desc.desc(),\n                                               &layout, &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        indice_pairs_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n\n    // input_grad_desc\n    TORCH_MLUOP_CHECK(mluOpGetTensorDescriptor(input_grad_desc.desc(), &layout,\n                                               &dtype, &dim, dims));\n    TORCH_MLUOP_CHECK(mluOpSetTensorDescriptor(\n        input_grad_desc.desc(), MLUOP_LAYOUT_ARRAY, dtype, dim, dims));\n  }\n\n  auto handle = mluOpGetCurrentHandle();\n  size_t data_workspace_size = 0;\n  mluOpGetIndiceConvolutionBackwardDataWorkspaceSize(\n      handle, output_grad_desc.desc(), filters_desc.desc(),\n      indice_pairs_desc.desc(), input_grad_desc.desc(), indice_num, _inverse,\n      &data_workspace_size);\n\n  
size_t filters_workspace_size = 0;\n  TORCH_MLUOP_CHECK(mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize(\n      handle, features_desc.desc(), output_grad_desc.desc(),\n      indice_pairs_desc.desc(), filters_grad_desc.desc(), indice_num, _inverse,\n      _subM, &filters_workspace_size));\n\n  auto indice_convbpdata_workspace =\n      at::empty(data_workspace_size, features.options().dtype(at::kByte));\n  auto indice_convbpfilter_workspace =\n      at::empty(filters_workspace_size, filters.options().dtype(at::kByte));\n\n  auto features_impl = torch_mlu::getMluTensorImpl(features_contiguous);\n  auto filters_impl = torch_mlu::getMluTensorImpl(filters_contiguous);\n  auto output_grad_impl = torch_mlu::getMluTensorImpl(output_grad_contiguous);\n  auto indice_pairs_impl = torch_mlu::getMluTensorImpl(indice_pairs_contiguous);\n  auto indice_convbpdata_workspace_impl =\n      torch_mlu::getMluTensorImpl(indice_convbpdata_workspace);\n  auto indice_convbpfilter_workspace_impl =\n      torch_mlu::getMluTensorImpl(indice_convbpfilter_workspace);\n\n  auto features_ptr = features_impl->cnnlMalloc();\n  auto filters_ptr = filters_impl->cnnlMalloc();\n  auto output_grad_ptr = output_grad_impl->cnnlMalloc();\n  auto indice_pairs_ptr = indice_pairs_impl->cnnlMalloc();\n  auto indice_convbpdata_workspace_ptr =\n      indice_convbpdata_workspace_impl->cnnlMalloc();\n  auto indice_convbpfilter_workspace_ptr =\n      indice_convbpfilter_workspace_impl->cnnlMalloc();\n\n  // outputs\n  auto input_grad_impl = torch_mlu::getMluTensorImpl(input_grad);\n  auto input_grad_ptr = input_grad_impl->cnnlMalloc();\n  auto filters_grad_impl = torch_mlu::getMluTensorImpl(filters_grad);\n  auto filters_grad_ptr = filters_grad_impl->cnnlMalloc();\n\n  TORCH_MLUOP_CHECK(mluOpIndiceConvolutionBackwardData(\n      handle, output_grad_desc.desc(), output_grad_ptr, filters_desc.desc(),\n      filters_ptr, indice_pairs_desc.desc(), indice_pairs_ptr, indice_num,\n      _inverse, _subM, 
indice_convbpdata_workspace_ptr, data_workspace_size,\n      input_grad_desc.desc(), input_grad_ptr));\n\n  TORCH_MLUOP_CHECK(mluOpIndiceConvolutionBackwardFilter(\n      handle, features_desc.desc(), features_ptr, output_grad_desc.desc(),\n      output_grad_ptr, indice_pairs_desc.desc(), indice_pairs_ptr, indice_num,\n      _inverse, _subM, indice_convbpfilter_workspace_ptr,\n      filters_workspace_size, filters_grad_desc.desc(), filters_grad_ptr));\n\n  std::vector<torch::Tensor> result;\n  result.push_back(input_grad);\n  result.push_back(filters_grad);\n  return result;\n}\n\ntorch::Tensor indice_conv_forward_mlu(torch::Tensor features,\n                                      torch::Tensor filters,\n                                      torch::Tensor indicePairs,\n                                      torch::Tensor indiceNum,\n                                      int64_t numActOut, int64_t _inverse,\n                                      int64_t _subM) {\n  return IndiceConvForwardMLUKernelLauncher(\n      features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);\n}\n\nstd::vector<torch::Tensor> indice_conv_backward_mlu(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  return IndiceConvBackwardMLUKernelLauncher(\n      features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);\n}\n\ntorch::Tensor indice_conv_forward_impl(torch::Tensor features,\n                                       torch::Tensor filters,\n                                       torch::Tensor indicePairs,\n                                       torch::Tensor indiceNum,\n                                       int64_t numActOut, int64_t _inverse,\n                                       int64_t _subM);\n\nstd::vector<torch::Tensor> indice_conv_backward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM);\n\nREGISTER_DEVICE_IMPL(indice_conv_forward_impl, MLU, indice_conv_forward_mlu);\nREGISTER_DEVICE_IMPL(indice_conv_backward_impl, MLU, indice_conv_backward_mlu);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher<2>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher<3>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher<4>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/three_nn_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid ThreeNNMLUKernelLauncher(int b, int n, int m, const Tensor unknown,\n                              const Tensor known, Tensor dist2, Tensor idx) {\n  auto unknown_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      unknown, unknown.suggest_memory_format());\n  auto known_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      known, known.suggest_memory_format());\n  auto dist2_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      dist2, dist2.suggest_memory_format());\n  auto idx_contiguous =\n      torch_mlu::cnnl::ops::cnnl_contiguous(idx, idx.suggest_memory_format());\n\n  MluOpTensorDescriptor unknown_desc, known_desc, dist2_desc, idx_desc;\n  unknown_desc.set(unknown_contiguous);\n  known_desc.set(known_contiguous);\n  dist2_desc.set(dist2_contiguous);\n  idx_desc.set(idx_contiguous);\n\n  auto handle = mluOpGetCurrentHandle();\n  size_t workspace_size = 0;\n  TORCH_MLUOP_CHECK(mluOpGetThreeNNForwardWorkspaceSize(\n      handle, known_desc.desc(), &workspace_size));\n  auto known_workspace =\n      at::empty(workspace_size, known.options().dtype(at::kByte));\n\n  auto unknown_impl = torch_mlu::getMluTensorImpl(unknown_contiguous);\n  auto known_impl = torch_mlu::getMluTensorImpl(known_contiguous);\n  auto dist2_impl = 
torch_mlu::getMluTensorImpl(dist2_contiguous);\n  auto idx_impl = torch_mlu::getMluTensorImpl(idx_contiguous);\n  auto workspace_impl = torch_mlu::getMluTensorImpl(known_workspace);\n  auto unknown_ptr = unknown_impl->cnnlMalloc();\n  auto known_ptr = known_impl->cnnlMalloc();\n  auto dist2_ptr = dist2_impl->cnnlMalloc();\n  auto idx_ptr = idx_impl->cnnlMalloc();\n  auto workspace_ptr = workspace_impl->cnnlMalloc();\n\n  TORCH_MLUOP_CHECK(mluOpThreeNNForward(\n      handle, unknown_desc.desc(), unknown_ptr, known_desc.desc(), known_ptr,\n      workspace_ptr, workspace_size, dist2_desc.desc(), dist2_ptr,\n      idx_desc.desc(), idx_ptr));\n}\n\nvoid three_nn_forward_mlu(int b, int n, int m, const Tensor unknown,\n                          const Tensor known, Tensor dist2, Tensor idx) {\n  ThreeNNMLUKernelLauncher(b, n, m, unknown, known, dist2, idx);\n}\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx);\n\nREGISTER_DEVICE_IMPL(three_nn_forward_impl, MLU, three_nn_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\nvoid TINShiftForwardMLUKernelLauncher(Tensor input, Tensor shift,\n                                      Tensor output) {\n  // params check\n  TORCH_CHECK(\n      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,\n      \"input type should be Float or Half, got \", input.scalar_type(), \".\");\n  TORCH_CHECK(input.dim() == 4, \"input should be a 4d tensor, got \",\n              input.dim(), \"d.\");\n  TORCH_CHECK(shift.dim() == 2, \"shift should be a 2d tensor, got \",\n              shift.dim(), \"d.\");\n  TORCH_CHECK(\n      input.size(0) == shift.size(0),\n      \"input batch size should be the same as shift's, input batch size is \",\n      input.size(0), \" and shift batch size is \", shift.size(0), \".\");\n  TORCH_CHECK(input.size(0) != 0, \"Input batch size should not be zero.\");\n  TORCH_CHECK(input.size(3) != 0,\n              \"The last dim size of input should not be zero.\");\n  if (input.size(1) == 0) {\n    return;\n  }\n\n  // set contiguous\n  auto input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      input, input.suggest_memory_format());\n  auto shift_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      shift, shift.suggest_memory_format());\n  auto output_contiguous = 
torch_mlu::cnnl::ops::cnnl_contiguous(\n      output, output.suggest_memory_format());\n\n  // get tensor impl\n  auto input_impl = torch_mlu::getMluTensorImpl(input_contiguous);\n  auto shift_impl = torch_mlu::getMluTensorImpl(shift_contiguous);\n  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);\n\n  // get the mlu ptr\n  auto input_ptr = input_impl->cnnlMalloc();\n  auto shift_ptr = shift_impl->cnnlMalloc();\n  auto output_ptr = output_impl->cnnlMalloc();\n\n  // set tensor descriptor\n  MluOpTensorDescriptor input_desc, shift_desc, output_desc;\n  input_desc.set(input_contiguous);\n  shift_desc.set(shift_contiguous);\n  output_desc.set(output_contiguous);\n\n  // get current handle\n  auto handle = mluOpGetCurrentHandle();\n\n  TORCH_MLUOP_CHECK(mluOpTinShiftForward(handle, input_desc.desc(), input_ptr,\n                                         shift_desc.desc(), shift_ptr,\n                                         output_desc.desc(), output_ptr));\n}\n\nvoid TINShiftBackwardMLUKernelLauncher(Tensor grad_output, Tensor shift,\n                                       Tensor grad_input) {\n  // params check\n  TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||\n                  grad_output.scalar_type() == at::kHalf,\n              \"grad_output type should be Float or Half, got \",\n              grad_output.scalar_type(), \".\");\n  TORCH_CHECK(grad_output.dim() == 4, \"grad_output should be a 4d tensor, got \",\n              grad_output.dim(), \"d.\");\n  TORCH_CHECK(shift.dim() == 2, \"shift should be a 2d tensor, got \",\n              shift.dim(), \"d.\");\n  TORCH_CHECK(grad_output.size(0) == shift.size(0),\n              \"grad_output batch size should be the same as shift's, \"\n              \"grad_output batch size is \",\n              grad_output.size(0), \", shift batch size is \", shift.size(0),\n              \".\");\n  TORCH_CHECK(grad_output.size(0) != 0,\n              \"grad_output batch size should not be zero.\");\n  
TORCH_CHECK(grad_output.size(3) != 0,\n              \"The last dim size of grad_output should not be zero.\");\n  if (grad_output.size(1) == 0) {\n    return;\n  }\n\n  // set contiguous\n  auto grad_output_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      grad_output, grad_output.suggest_memory_format());\n  auto shift_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      shift, shift.suggest_memory_format());\n  auto grad_input_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(\n      grad_input, grad_input.suggest_memory_format());\n\n  // get tensor impl\n  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_contiguous);\n  auto shift_impl = torch_mlu::getMluTensorImpl(shift_contiguous);\n  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_contiguous);\n\n  // get the mlu ptr\n  auto grad_output_ptr = grad_output_impl->cnnlMalloc();\n  auto shift_ptr = shift_impl->cnnlMalloc();\n  auto grad_input_ptr = grad_input_impl->cnnlMalloc();\n\n  // set tensor descriptor\n  MluOpTensorDescriptor grad_output_desc, shift_desc, grad_input_desc;\n  grad_output_desc.set(grad_output_contiguous);\n  shift_desc.set(shift_contiguous);\n  grad_input_desc.set(grad_input_contiguous);\n\n  // get current handle\n  auto handle = mluOpGetCurrentHandle();\n\n  TORCH_MLUOP_CHECK(mluOpTinShiftBackward(\n      handle, grad_output_desc.desc(), grad_output_ptr, shift_desc.desc(),\n      shift_ptr, grad_input_desc.desc(), grad_input_ptr));\n}\n\nvoid tin_shift_forward_mlu(Tensor input, Tensor shift, Tensor output) {\n  TINShiftForwardMLUKernelLauncher(input, shift, output);\n}\n\nvoid tin_shift_backward_mlu(Tensor grad_output, Tensor shift,\n                            Tensor grad_input) {\n  TINShiftBackwardMLUKernelLauncher(grad_output, shift, grad_input);\n}\n\nvoid tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);\n\nvoid tin_shift_backward_impl(Tensor grad_output, Tensor shift,\n                             Tensor 
grad_input);\n\nREGISTER_DEVICE_IMPL(tin_shift_forward_impl, MLU, tin_shift_forward_mlu);\nREGISTER_DEVICE_IMPL(tin_shift_backward_impl, MLU, tin_shift_backward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mlu/voxelization_mlu.cpp",
    "content": "/*************************************************************************\n * Copyright (C) 2022 by Cambricon.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n *************************************************************************/\n#include \"mlu_common_helper.h\"\n\n/*************************************************************************\n * This MACRO contains operations of simple tensor to mlu-tensor.\n * _contiguous, _desc, _impl, _ptr will be automatically generated in\n * this MACRO.\n *************************************************************************/\n#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                         \\\n  auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(    \\\n      NAME, NAME.suggest_memory_format());                          \\\n  MluOpTensorDescriptor NAME##_desc;                                \\\n  NAME##_desc.set(NAME##_contigous);                                \\\n  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous); \\\n  auto NAME##_ptr = NAME##_impl->cnnlMalloc();\n\nint HardVoxelizeForwardMLUKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  std::vector<float> _voxel_size(voxel_size.begin(), voxel_size.end());\n  std::vector<float> _coors_range(coors_range.begin(), coors_range.end());\n  auto opts = 
torch::TensorOptions().dtype(torch::kFloat32);\n  auto voxel_size_tensor =\n      torch::from_blob(_voxel_size.data(), {int64_t(_voxel_size.size())}, opts)\n          .clone()\n          .to(at::kMLU);\n  auto coors_range_tensor =\n      torch::from_blob(_coors_range.data(), {int64_t(_coors_range.size())},\n                       opts)\n          .clone()\n          .to(at::kMLU);\n  INITIAL_MLU_PARAM_WITH_TENSOR(points);\n  INITIAL_MLU_PARAM_WITH_TENSOR(voxels);\n  INITIAL_MLU_PARAM_WITH_TENSOR(coors);\n  INITIAL_MLU_PARAM_WITH_TENSOR(num_points_per_voxel);\n  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_size_tensor);\n  INITIAL_MLU_PARAM_WITH_TENSOR(coors_range_tensor);\n\n  auto voxel_num_tensor = at::empty({1}, points.options().dtype(torch::kInt32));\n  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num_tensor);\n\n  size_t workspace_size;\n  auto handle = mluOpGetCurrentHandle();\n  TORCH_MLUOP_CHECK(mluOpGetVoxelizationWorkspaceSize(\n      handle, points_desc.desc(), voxel_size_tensor_desc.desc(),\n      coors_range_tensor_desc.desc(), max_points, max_voxels, NDim, true,\n      voxels_desc.desc(), coors_desc.desc(), num_points_per_voxel_desc.desc(),\n      voxel_num_tensor_desc.desc(), &workspace_size));\n  auto workspace_tensor =\n      at::empty(workspace_size, points.options().dtype(at::kByte));\n  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);\n\n  TORCH_MLUOP_CHECK(mluOpVoxelization(\n      handle, points_desc.desc(), points_ptr, voxel_size_tensor_desc.desc(),\n      voxel_size_tensor_ptr, coors_range_tensor_desc.desc(),\n      coors_range_tensor_ptr, max_points, max_voxels, NDim, true,\n      workspace_tensor_ptr, workspace_size, voxels_desc.desc(), voxels_ptr,\n      coors_desc.desc(), coors_ptr, num_points_per_voxel_desc.desc(),\n      num_points_per_voxel_ptr, voxel_num_tensor_desc.desc(),\n      voxel_num_tensor_ptr));\n  auto voxel_num_cpu = voxel_num_tensor.to(at::kCPU);\n  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];\n  return voxel_num_int;\n}\n\nint 
hard_voxelize_forward_mlu(const at::Tensor &points, at::Tensor &voxels,\n                              at::Tensor &coors,\n                              at::Tensor &num_points_per_voxel,\n                              const std::vector<float> voxel_size,\n                              const std::vector<float> coors_range,\n                              const int max_points, const int max_voxels,\n                              const int NDim) {\n  return HardVoxelizeForwardMLUKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n}\n\nint hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim);\n\nREGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, MLU,\n                     hard_voxelize_forward_mlu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#ifdef MMCV_WITH_DIOPI\n#include <diopi/diopirt.h>\n#include <diopi/functions.h>\n#include <diopi/functions_mmcv.h>\n#include <torch/csrc/utils/pybind.h>\n\n#include \"csrc_dipu/diopirt/diopirt_impl.h\"\n#include \"csrc_dipu/runtime/device/deviceapis.h\"\n#include \"csrc_dipu/utils/helpfunc.hpp\"\n\nusing dipu::VENDOR_TYPE;\nusing dipu::diopi_helper::toDiopiScalar;\nusing dipu::diopi_helper::toDiopiTensorHandle;\n#endif\n\nvoid modulated_deformable_im2col_impl(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col) {\n  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,\n                       data_mask, batch_size, channels, height_im, width_im,\n                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,\n                       stride_h, stride_w, dilation_h, dilation_w,\n                       deformable_group, data_col);\n}\n\nvoid modulated_deformable_col2im_impl(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im) {\n  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,\n                       data_mask, 
batch_size, channels, height_im, width_im,\n                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,\n                       stride_h, stride_w, dilation_h, dilation_w,\n                       deformable_group, grad_im);\n}\n\nvoid modulated_deformable_col2im_coord_impl(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask) {\n  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,\n                       data_im, data_offset, data_mask, batch_size, channels,\n                       height_im, width_im, height_col, width_col, kernel_h,\n                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,\n                       dilation_w, deformable_group, grad_offset, grad_mask);\n}\n\nvoid modulated_deform_conv_forward_fallthrough(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,\n    const int stride_h, const int stride_w, const int pad_h, const int pad_w,\n    const int dilation_h, const int dilation_w, const int group,\n    const int deformable_group, const bool with_bias) {\n  at::DeviceGuard guard(input.device());\n\n  const int batch = input.size(0);\n  const int channels = input.size(1);\n  const int height = input.size(2);\n  const int width = input.size(3);\n\n  const int channels_out = weight.size(0);\n  const int channels_kernel = weight.size(1);\n  const int kernel_h_ = weight.size(2);\n  const int kernel_w_ = weight.size(3);\n\n  if (kernel_h_ != kernel_h || kernel_w_ != 
kernel_w)\n    AT_ERROR(\"Input shape and kernel shape won't match: (%d x %d vs %d x %d).\",\n             kernel_h_, kernel_w, kernel_h_, kernel_w_);\n  if (channels != channels_kernel * group)\n    AT_ERROR(\"Input shape and kernel channels won't match: (%d vs %d).\",\n             channels, channels_kernel * group);\n\n  const int height_out =\n      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;\n  const int width_out =\n      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;\n\n  if (ones.ndimension() != 2 ||\n      ones.size(0) * ones.size(1) < height_out * width_out) {\n    // Resize plane and fill with ones...\n    ones = at::ones({height_out, width_out}, input.options());\n  }\n\n  // resize output\n  output = output.view({batch, channels_out, height_out, width_out}).zero_();\n  // resize temporary columns\n  columns =\n      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},\n                input.options());\n\n  output = output.view({output.size(0), group, output.size(1) / group,\n                        output.size(2), output.size(3)});\n\n  for (int b = 0; b < batch; b++) {\n    modulated_deformable_im2col_impl(\n        input[b], offset[b], mask[b], 1, channels, height, width, height_out,\n        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n        dilation_h, dilation_w, deformable_group, columns);\n\n    // divide into group\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n\n    for (int g = 0; g < group; g++) {\n      output[b][g] = output[b][g]\n                         .flatten(1)\n                         .addmm_(weight[g].flatten(1), columns[g])\n                         .view_as(output[b][g]);\n    }\n\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n            
              weight.size(3), weight.size(4)});\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n  }\n\n  output = output.view({output.size(0), output.size(1) * output.size(2),\n                        output.size(3), output.size(4)});\n\n  if (with_bias) {\n    output += bias.view({1, bias.size(0), 1, 1});\n  }\n}\n\nvoid modulated_deform_conv_backward_fallthrough(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,\n    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,\n    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,\n    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,\n    const bool with_bias) {\n  at::DeviceGuard guard(input.device());\n\n  const int batch = input.size(0);\n  const int channels = input.size(1);\n  const int height = input.size(2);\n  const int width = input.size(3);\n\n  const int channels_kernel = weight.size(1);\n  const int kernel_h_ = weight.size(2);\n  const int kernel_w_ = weight.size(3);\n  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)\n    AT_ERROR(\"Input shape and kernel shape won't match: (%d x %d vs %d x %d).\",\n             kernel_h_, kernel_w, kernel_h_, kernel_w_);\n  if (channels != channels_kernel * group)\n    AT_ERROR(\"Input shape and kernel channels won't match: (%d vs %d).\",\n             channels, channels_kernel * group);\n\n  const int height_out =\n      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;\n  const int width_out =\n      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;\n\n  if (ones.ndimension() != 2 ||\n      ones.size(0) * ones.size(1) < height_out * width_out) {\n    // Resize plane and fill with ones...\n    ones = at::ones({height_out, width_out}, input.options());\n  }\n\n  grad_input = grad_input.view({batch, channels, height, 
width});\n  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},\n                      input.options());\n\n  grad_output =\n      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,\n                        grad_output.size(2), grad_output.size(3)});\n\n  for (int b = 0; b < batch; b++) {\n    // divide int group\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    weight = weight.view({group, weight.size(0) / group, weight.size(1),\n                          weight.size(2), weight.size(3)});\n\n    for (int g = 0; g < group; g++) {\n      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),\n                        grad_output[b][g].flatten(1), 0.0f, 1.0f);\n    }\n\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),\n                          weight.size(3), weight.size(4)});\n\n    // gradient w.r.t. input coordinate data\n    modulated_deformable_col2im_coord_impl(\n        columns, input[b], offset[b], mask[b], 1, channels, height, width,\n        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,\n        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],\n        grad_mask[b]);\n    // gradient w.r.t. input data\n    modulated_deformable_col2im_impl(\n        columns, offset[b], mask[b], 1, channels, height, width, height_out,\n        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n        dilation_h, dilation_w, deformable_group, grad_input[b]);\n\n    // gradient w.r.t. 
weight, dWeight should accumulate across the batch and\n    // group\n    modulated_deformable_im2col_impl(\n        input[b], offset[b], mask[b], 1, channels, height, width, height_out,\n        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n        dilation_h, dilation_w, deformable_group, columns);\n\n    columns = columns.view({group, columns.size(0) / group, columns.size(1)});\n    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,\n                                    grad_weight.size(1), grad_weight.size(2),\n                                    grad_weight.size(3)});\n    if (with_bias)\n      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});\n\n    for (int g = 0; g < group; g++) {\n      grad_weight[g] =\n          grad_weight[g]\n              .flatten(1)\n              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))\n              .view_as(grad_weight[g]);\n      if (with_bias) {\n        grad_bias[g] =\n            grad_bias[g]\n                .view({-1, 1})\n                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))\n                .view(-1);\n      }\n    }\n\n    columns =\n        columns.view({columns.size(0) * columns.size(1), columns.size(2)});\n    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),\n                                    grad_weight.size(2), grad_weight.size(3),\n                                    grad_weight.size(4)});\n    if (with_bias)\n      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});\n  }\n  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),\n                                  grad_output.size(2), grad_output.size(3),\n                                  grad_output.size(4)});\n}\n\n#ifdef MMCV_WITH_DIOPI\nvoid modulated_deform_conv_forward_diopi(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor output, Tensor columns, int 
kernel_h, int kernel_w,\n    const int stride_h, const int stride_w, const int pad_h, const int pad_w,\n    const int dilation_h, const int dilation_w, const int group,\n    const int deformable_group, const bool with_bias) {\n  auto input_p = toDiopiTensorHandle(input);\n  diopiDevice_t device;\n  diopiGetTensorDevice(input_p, &device);\n  if (device == diopi_host) {\n    modulated_deform_conv_forward_fallthrough(\n        input, weight, bias, ones, offset, mask, output, columns, kernel_h,\n        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,\n        group, deformable_group, with_bias);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto weight_p = toDiopiTensorHandle(weight);\n  auto bias_p = toDiopiTensorHandle(bias);\n  auto ones_p = toDiopiTensorHandle(ones);\n  auto offset_p = toDiopiTensorHandle(offset);\n  auto mask_p = toDiopiTensorHandle(mask);\n  auto output_p = toDiopiTensorHandle(output);\n  auto columns_p = toDiopiTensorHandle(columns);\n  if (reinterpret_cast<void*>(diopiModulatedDeformConvMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiModulatedDeformConvMmcv(\n          ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,\n          mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,\n          dilation_h, dilation_w, group, deformable_group, with_bias);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiModulatedDeformConvMmcv(\n          ch, output_p, columns_p, ones_p, input_p, weight_p, bias_p, offset_p,\n          mask_p, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,\n          dilation_h, dilation_w, group, deformable_group, with_bias);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op modulated_deform_conv_forward\";\n  auto 
input_cpu = input.cpu();\n  auto weight_cpu = weight.cpu();\n  auto bias_cpu = bias.cpu();\n  auto ones_cpu = ones.cpu();\n  auto offset_cpu = offset.cpu();\n  auto mask_cpu = mask.cpu();\n  auto output_cpu = output.cpu();\n  auto columns_cpu = columns.cpu();\n  modulated_deform_conv_forward_fallthrough(\n      input_cpu, weight_cpu, bias_cpu, ones_cpu, offset_cpu, mask_cpu,\n      output_cpu, columns_cpu, kernel_h, kernel_w, stride_h, stride_w, pad_h,\n      pad_w, dilation_h, dilation_w, group, deformable_group, with_bias);\n  output.copy_(output_cpu);\n  return;\n}\n\nvoid modulated_deform_conv_backward_diopi(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,\n    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,\n    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,\n    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,\n    const bool with_bias) {\n  auto input_p = toDiopiTensorHandle(input);\n  diopiDevice_t device;\n  diopiGetTensorDevice(input_p, &device);\n  if (device == diopi_host) {\n    modulated_deform_conv_backward_fallthrough(\n        input, weight, bias, ones, offset, mask, columns, grad_input,\n        grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,\n        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,\n        group, deformable_group, with_bias);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto weight_p = toDiopiTensorHandle(weight);\n  auto bias_p = toDiopiTensorHandle(bias);\n  auto ones_p = toDiopiTensorHandle(ones);\n  auto offset_p = toDiopiTensorHandle(offset);\n  auto mask_p = toDiopiTensorHandle(mask);\n  auto columns_p = toDiopiTensorHandle(columns);\n  auto grad_input_p = toDiopiTensorHandle(grad_input);\n  auto grad_weight_p = 
toDiopiTensorHandle(grad_weight);\n  auto grad_bias_p = toDiopiTensorHandle(grad_bias);\n  auto grad_offset_p = toDiopiTensorHandle(grad_offset);\n  auto grad_mask_p = toDiopiTensorHandle(grad_mask);\n  auto grad_output_p = toDiopiTensorHandle(grad_output);\n\n  if (reinterpret_cast<void*>(diopiModulatedDeformConvBackwardMmcv) !=\n      nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiModulatedDeformConvBackwardMmcv(\n          ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,\n          grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,\n          columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w,\n          pad_h, pad_w, dilation_h, dilation_w, group, deformable_group,\n          with_bias);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiModulatedDeformConvBackwardMmcv(\n          ch, grad_input_p, grad_weight_p, grad_bias_p, grad_offset_p,\n          grad_mask_p, input_p, weight_p, bias_p, ones_p, offset_p, mask_p,\n          columns_p, grad_output_p, kernel_h, kernel_w, stride_h, stride_w,\n          pad_h, pad_w, dilation_h, dilation_w, group, deformable_group,\n          with_bias);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op modulated_deform_conv_backward\";\n  auto input_cpu = input.cpu();\n  auto weight_cpu = weight.cpu();\n  auto bias_cpu = bias.cpu();\n  auto ones_cpu = ones.cpu();\n  auto offset_cpu = offset.cpu();\n  auto mask_cpu = mask.cpu();\n  auto columns_cpu = columns.cpu();\n  auto grad_input_cpu = grad_input.cpu();\n  auto grad_weight_cpu = grad_weight.cpu();\n  auto grad_bias_cpu = grad_bias.cpu();\n  auto grad_offset_cpu = grad_offset.cpu();\n  auto grad_mask_cpu = grad_mask.cpu();\n  auto grad_output_cpu = grad_output.cpu();\n  modulated_deform_conv_backward_fallthrough(\n      input_cpu, weight_cpu, bias_cpu, ones_cpu, 
offset_cpu, mask_cpu,\n      columns_cpu, grad_input_cpu, grad_weight_cpu, grad_bias_cpu,\n      grad_offset_cpu, grad_mask_cpu, grad_output_cpu, kernel_h, kernel_w,\n      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n  grad_input.copy_(grad_input_cpu);\n  grad_weight.copy_(grad_weight_cpu);\n  grad_bias.copy_(grad_bias_cpu);\n  grad_offset.copy_(grad_offset_cpu);\n  grad_mask.copy_(grad_mask_cpu);\n  return;\n}\n#endif\n\nvoid modulated_deform_conv_forward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,\n    const int stride_h, const int stride_w, const int pad_h, const int pad_w,\n    const int dilation_h, const int dilation_w, const int group,\n    const int deformable_group, const bool with_bias) {\n#ifdef MMCV_WITH_DIOPI\n  modulated_deform_conv_forward_diopi(\n      input, weight, bias, ones, offset, mask, output, columns, kernel_h,\n      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n#else\n  modulated_deform_conv_forward_fallthrough(\n      input, weight, bias, ones, offset, mask, output, columns, kernel_h,\n      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n#endif\n}\n\nvoid modulated_deform_conv_backward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,\n    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,\n    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,\n    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,\n    const bool with_bias) {\n#ifdef MMCV_WITH_DIOPI\n  modulated_deform_conv_backward_diopi(\n      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,\n      
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,\n      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n#else\n  modulated_deform_conv_backward_fallthrough(\n      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,\n      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,\n      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,\n      deformable_group, with_bias);\n#endif\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/mps/bbox_overlaps_mps.mm",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"pytorch_device_registry.hpp\"\n\n#include \"MPSLibrary.h\"\n#include \"MPSStream.h\"\n#include \"MPSUtils.h\"\n\nusing at::Tensor;\n\nconst static std::string kSourceCode = R\"(\n#include <metal_math>\n#include <metal_stdlib>\nusing namespace metal;\n\nkernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,\n                       constant const float4* bboxes2,\n                       device float* ious,\n                       constant int& num_bbox1,\n                       constant int& num_bbox2,\n                       constant int& mode,\n                       constant bool& aligned,\n                       constant int& offset,\n                       uint index [[thread_position_in_grid]])\n{\n    int base1 = index;\n    int base2 = index;\n    if(!aligned){\n      base1 = index / num_bbox2;\n      base2 = index % num_bbox2;\n    }\n\n    const float f_offset = float(offset);\n\n    const float4 b1 = bboxes1[base1];\n    const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);\n\n    const float4 b2 = bboxes2[base2];\n    const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);\n\n    const float2 left_top = fmax(b1.xy, b2.xy);\n    const float2 right_bottom = fmin(b1.zw, b2.zw);\n    const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);\n    const float interS = wh.x * wh.y;\n\n    const float baseS =\n        fmax(mode == 0 ? 
b1_area + b2_area - interS : b1_area, f_offset);\n    ious[index] = interS / baseS;\n}\n)\";\n\nvoid BBoxOverlapsMPSKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                                   const int mode, const bool aligned, const int offset) {\n  // get stream\n  auto stream = at::mps::getCurrentMPSStream();\n  auto library_manager = MPSLibraryManager::getInstance();\n  MPSLibrary* library;\n  const static std::string kLibraryName = \"bbox_overlap\";\n  if (library_manager->hasLibrary(kLibraryName))\n    library = library_manager->getLibrary(kLibraryName);\n  else\n    library = library_manager->createLibraryFromSouce(kLibraryName, kSourceCode);\n  auto func_pso = library->getComputePipelineState(\"bbox_overlap_mps_kernel\");\n\n  // create command buffer and encoder\n  MTLCommandBuffer_t command_buffer = stream->commandBuffer();\n  MTLComputeCommandEncoder_t compute_encoder = [command_buffer computeCommandEncoder];\n\n  // set pso and buffer\n  int output_size = ious.numel();\n  int num_bbox1 = bboxes1.size(0);\n  int num_bbox2 = bboxes2.size(0);\n  int num_elements = output_size;\n  setMTLArgs(compute_encoder, func_pso, bboxes1, bboxes2, ious, num_bbox1, num_bbox2, mode, aligned,\n             offset);\n\n  // set grid size\n  MTLSize grid_size = MTLSizeMake(num_elements, 1, 1);\n  NSUInteger thread_group_size_x = func_pso.maxTotalThreadsPerThreadgroup;\n  if (thread_group_size_x > num_elements) {\n    thread_group_size_x = num_elements;\n  }\n  MTLSize thread_group_size = MTLSizeMake(thread_group_size_x, 1, 1);\n\n  // encoding\n  [compute_encoder dispatchThreads:grid_size threadsPerThreadgroup:thread_group_size];\n  [compute_encoder endEncoding];\n\n  // commit, not sure if flush is required\n  stream->commit(false);\n}\n\nvoid bbox_overlaps_mps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,\n                       const bool aligned, const int offset) {\n  BBoxOverlapsMPSKernelLauncher(bboxes1, 
bboxes2, ious, mode, aligned, offset);\n}\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,\n                        const bool aligned, const int offset);\nREGISTER_DEVICE_IMPL(bbox_overlaps_impl, MPS, bbox_overlaps_mps);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/ms_deform_attn.cpp",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from\n*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nTensor ms_deform_attn_impl_forward(const Tensor &value,\n                                   const Tensor &spatial_shapes,\n                                   const Tensor &level_start_index,\n                                   const Tensor &sampling_loc,\n                                   const Tensor &attn_weight,\n                                   const int im2col_step) {\n  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,\n                              spatial_shapes, level_start_index, sampling_loc,\n                              attn_weight, im2col_step);\n}\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,\n    const int im2col_step) {\n  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,\n                       level_start_index, sampling_loc, attn_weight,\n                       grad_output, grad_value, grad_sampling_loc,\n                       grad_attn_weight, im2col_step);\n}\n\nTensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,\n                              const Tensor &level_start_index,\n    
                          const Tensor &sampling_loc,\n                              const Tensor &attn_weight,\n                              const int im2col_step) {\n  at::DeviceGuard guard(value.device());\n  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,\n                                     sampling_loc, attn_weight, im2col_step);\n}\n\nvoid ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,\n                             const Tensor &level_start_index,\n                             const Tensor &sampling_loc,\n                             const Tensor &attn_weight,\n                             const Tensor &grad_output, Tensor &grad_value,\n                             Tensor &grad_sampling_loc,\n                             Tensor &grad_attn_weight, const int im2col_step) {\n  at::DeviceGuard guard(value.device());\n  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,\n                               sampling_loc, attn_weight, grad_output,\n                               grad_value, grad_sampling_loc, grad_attn_weight,\n                               im2col_step);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/active_rotated_filter_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/musa/ActiveRotatingFilter_musa.cu\n#include \"active_rotated_filter_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid ActiveRotatedFilterForwardMUSAKernelLauncher(const Tensor input,\n                                                  const Tensor indices,\n                                                  Tensor output) {\n  int num_output_planes = input.size(0);\n  int num_input_planes = input.size(1);\n  int num_orientations = input.size(2);\n  int kH = input.size(3);\n  int kW = input.size(4);\n  int num_rotations = indices.size(3);\n  int nEntry = num_orientations * kH * kW;\n  int output_size = input.numel();\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      input.scalar_type(), \"active_rotated_filter_forward_musa_kernel\", [&] {\n        active_rotated_filter_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                indices.data_ptr<int>(), num_input_planes, num_output_planes,\n                num_orientations, num_rotations, nEntry,\n                output.data_ptr<scalar_t>());\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ActiveRotatedFilterBackwardMUSAKernelLauncher(const Tensor grad_out,\n                                                   const Tensor indices,\n                                                   Tensor grad_in) {\n  int num_orientations = indices.size(0);\n  int kH = indices.size(1);\n  int kW = indices.size(2);\n  int num_rotations = indices.size(3);\n  int num_output_planes = grad_out.size(0) / num_rotations;\n  int num_input_planes = grad_out.size(1) / num_orientations;\n  int nEntry = num_orientations * kH * kW;\n  int output_size = 
grad_in.numel();\n\n  c10::musa::MUSAGuard device_guard(indices.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_out.scalar_type(), \"active_rotated_filter_backward_musa_kernel\",\n      [&] {\n        active_rotated_filter_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_out.data_ptr<scalar_t>(),\n                indices.data_ptr<int>(), num_input_planes, num_output_planes,\n                num_orientations, num_rotations, nEntry,\n                grad_in.data_ptr<scalar_t>());\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/assign_score_withk_musa.mu",
    "content": "// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"assign_score_withk_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid AssignScoreWithKForwardMUSAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& points, const Tensor& centers, const Tensor& scores,\n    const Tensor& knn_idx, Tensor& output) {\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      points.scalar_type(), \"assign_score_withk_forward_musa_kernel\", [&] {\n        assign_score_withk_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                B, N0, N1, M, K, O, aggregate, points.data_ptr<scalar_t>(),\n                centers.data_ptr<scalar_t>(), scores.data_ptr<scalar_t>(),\n                knn_idx.data_ptr<int64_t>(), output.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid AssignScoreWithKBackwardMUSAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores) {\n  c10::musa::MUSAGuard device_guard(grad_out.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));\n  dim3 threads1(THREADS_PER_BLOCK);\n  dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));\n  dim3 threads2(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_out.scalar_type(), \"assign_score_withk_points_backward_musa_kernel\",\n      [&] {\n        
assign_score_withk_points_backward_musa_kernel<scalar_t>\n            <<<blocks1, threads1, 0, stream>>>(\n                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),\n                scores.data_ptr<scalar_t>(), knn_idx.data_ptr<int64_t>(),\n                grad_points.data_ptr<scalar_t>(),\n                grad_centers.data_ptr<scalar_t>());\n      });\n\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_out.scalar_type(), \"assign_score_withk_scores_backward_musa_kernel\",\n      [&] {\n        assign_score_withk_scores_backward_musa_kernel<scalar_t>\n            <<<blocks2, threads2, 0, stream>>>(\n                B, N0, N1, M, K, O, aggregate, grad_out.data_ptr<scalar_t>(),\n                points.data_ptr<scalar_t>(), centers.data_ptr<scalar_t>(),\n                knn_idx.data_ptr<int64_t>(), grad_scores.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/ball_query_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"ball_query_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid BallQueryForwardMUSAKernelLauncher(int b, int n, int m, float min_radius,\n                                        float max_radius, int nsample,\n                                        const Tensor new_xyz, const Tensor xyz,\n                                        Tensor idx) {\n  // new_xyz: (B, M, 3)\n  // xyz: (B, N, 3)\n  // output:\n  //      idx: (B, M, nsample)\n\n  c10::musa::MUSAGuard device_guard(new_xyz.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      new_xyz.scalar_type(), \"ball_query_forward_musa_kernel\", [&] {\n        ball_query_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, n, m, min_radius, max_radius, nsample,\n                new_xyz.data_ptr<scalar_t>(), xyz.data_ptr<scalar_t>(),\n                idx.data_ptr<int>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/bbox_overlaps_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"bbox_overlaps_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\n\ntemplate <>\n__global__ void bbox_overlaps_musa_kernel<at::Half>(\n    const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,\n    const int num_bbox1, const int num_bbox2, const int mode,\n    const bool aligned, const int offset) {\n  bbox_overlaps_musa_kernel_half(reinterpret_cast<const __half*>(bbox1),\n                                 reinterpret_cast<const __half*>(bbox2),\n                                 reinterpret_cast<__half*>(ious), num_bbox1,\n                                 num_bbox2, mode, aligned, offset);\n}\n\n\nvoid BBoxOverlapsMUSAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,\n                                    Tensor ious, const int mode,\n                                    const bool aligned, const int offset) {\n  int output_size = ious.numel();\n  int num_bbox1 = bboxes1.size(0);\n  int num_bbox2 = bboxes2.size(0);\n\n  c10::musa::MUSAGuard device_guard(bboxes1.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      bboxes1.scalar_type(), \"bbox_overlaps_musa_kernel\", ([&] {\n        bbox_overlaps_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),\n                ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,\n                offset);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/bezier_align_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"bezier_align_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid BezierAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois,\n                                          Tensor output, int aligned_height,\n                                          int aligned_width,\n                                          float spatial_scale,\n                                          int sampling_ratio, bool aligned) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"bezier_align_forward_musa_kernel\", [&] {\n        bezier_align_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,\n                channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid BezierAlignBackwardMUSAKernelLauncher(\n    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,\n    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"bezier_align_backward_musa_kernel\", [&] {\n        bezier_align_backward_musa_kernel<scalar_t>\n        
    <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, aligned,\n                channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/bias_act_musa.mu",
    "content": "// Modified from\n// https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.cpp\n\n// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n#include <c10/util/Half.h>\n#include <musa_runtime.h>\n#include <torch/types.h>\n\n#include \"pytorch_musa_helper.hpp\"\n\nstruct bias_act_kernel_params {\n  const void *x;     // [sizeX]\n  const void *b;     // [sizeB] or NULL\n  const void *xref;  // [sizeX] or NULL\n  const void *yref;  // [sizeX] or NULL\n  const void *dy;    // [sizeX] or NULL\n  void *y;           // [sizeX]\n\n  int grad;\n  int act;\n  float alpha;\n  float gain;\n  float clamp;\n\n  int sizeX;\n  int sizeB;\n  int stepB;\n  int loopX;\n};\n\n// MUSA kernel selection.\n\ntemplate <class T>\nvoid *choose_bias_act_kernel(const bias_act_kernel_params &p);\n//------------------------------------------------------------------------\n// Helpers.\n\ntemplate <class T>\nstruct InternalType;\ntemplate <>\nstruct InternalType<double> {\n  typedef double scalar_t;\n};\ntemplate <>\nstruct InternalType<float> {\n  typedef float scalar_t;\n};\ntemplate <>\nstruct InternalType<c10::Half> {\n  typedef float scalar_t;\n};\n\n//------------------------------------------------------------------------\n// MUSA kernel.\n\ntemplate <class T, int A>\n__global__ void bias_act_kernel(bias_act_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n  int G = p.grad;\n  scalar_t alpha = (scalar_t)p.alpha;\n  scalar_t gain = (scalar_t)p.gain;\n  scalar_t clamp = (scalar_t)p.clamp;\n  scalar_t one = (scalar_t)1;\n  scalar_t two = 
(scalar_t)2;\n  scalar_t expRange = (scalar_t)80;\n  scalar_t halfExpRange = (scalar_t)40;\n  scalar_t seluScale = (scalar_t)1.0507009873554804934193349852946;\n  scalar_t seluAlpha = (scalar_t)1.6732632423543772848170429916717;\n\n  // Loop over elements.\n  int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;\n  for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX;\n       loopIdx++, xi += blockDim.x) {\n    // Load.\n    scalar_t x = (scalar_t)((const T *)p.x)[xi];\n    scalar_t b =\n        (p.b) ? (scalar_t)((const T *)p.b)[(xi / p.stepB) % p.sizeB] : 0;\n    scalar_t xref = (p.xref) ? (scalar_t)((const T *)p.xref)[xi] : 0;\n    scalar_t yref = (p.yref) ? (scalar_t)((const T *)p.yref)[xi] : 0;\n    scalar_t dy = (p.dy) ? (scalar_t)((const T *)p.dy)[xi] : one;\n    scalar_t yy = (gain != 0) ? yref / gain : 0;\n    scalar_t y = 0;\n\n    // Apply bias.\n    ((G == 0) ? x : xref) += b;\n\n    // linear\n    if (A == 1) {\n      if (G == 0) y = x;\n      if (G == 1) y = x;\n    }\n\n    // relu\n    if (A == 2) {\n      if (G == 0) y = (x > 0) ? x : 0;\n      if (G == 1) y = (yy > 0) ? x : 0;\n    }\n\n    // lrelu\n    if (A == 3) {\n      if (G == 0) y = (x > 0) ? x : x * alpha;\n      if (G == 1) y = (yy > 0) ? x : x * alpha;\n    }\n\n    // tanh\n    if (A == 4) {\n      if (G == 0) {\n        scalar_t c = exp(x);\n        scalar_t d = one / c;\n        y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d);\n      }\n      if (G == 1) y = x * (one - yy * yy);\n      if (G == 2) y = x * (one - yy * yy) * (-two * yy);\n    }\n\n    // sigmoid\n    if (A == 5) {\n      if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);\n      if (G == 1) y = x * yy * (one - yy);\n      if (G == 2) y = x * yy * (one - yy) * (one - two * yy);\n    }\n\n    // elu\n    if (A == 6) {\n      if (G == 0) y = (x >= 0) ? x : exp(x) - one;\n      if (G == 1) y = (yy >= 0) ? x : x * (yy + one);\n      if (G == 2) y = (yy >= 0) ? 
0 : x * (yy + one);\n    }\n\n    // selu\n    if (A == 7) {\n      if (G == 0)\n        y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);\n      if (G == 1)\n        y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);\n      if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);\n    }\n\n    // softplus\n    if (A == 8) {\n      if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);\n      if (G == 1) y = x * (one - exp(-yy));\n      if (G == 2) {\n        scalar_t c = exp(-yy);\n        y = x * c * (one - c);\n      }\n    }\n\n    // swish\n    if (A == 9) {\n      if (G == 0)\n        y = (x < -expRange) ? 0 : x / (exp(-x) + one);\n      else {\n        scalar_t c = exp(xref);\n        scalar_t d = c + one;\n        if (G == 1)\n          y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);\n        else\n          y = (xref > halfExpRange)\n                  ? 0\n                  : x * c * (xref * (two - d) + two * d) / (d * d * d);\n        yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;\n      }\n    }\n\n    // Apply gain.\n    y *= gain * dy;\n\n    // Clamp.\n    if (clamp >= 0) {\n      if (G == 0)\n        y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;\n      else\n        y = (yref > -clamp & yref < clamp) ? 
y : 0;\n    }\n\n    // Store.\n    ((T *)p.y)[xi] = (T)y;\n  }\n}\n\n//------------------------------------------------------------------------\n// MUSA kernel selection.\n\ntemplate <class T>\nvoid *choose_bias_act_kernel(const bias_act_kernel_params &p) {\n  if (p.act == 1) return (void *)bias_act_kernel<T, 1>;\n  if (p.act == 2) return (void *)bias_act_kernel<T, 2>;\n  if (p.act == 3) return (void *)bias_act_kernel<T, 3>;\n  if (p.act == 4) return (void *)bias_act_kernel<T, 4>;\n  if (p.act == 5) return (void *)bias_act_kernel<T, 5>;\n  if (p.act == 6) return (void *)bias_act_kernel<T, 6>;\n  if (p.act == 7) return (void *)bias_act_kernel<T, 7>;\n  if (p.act == 8) return (void *)bias_act_kernel<T, 8>;\n  if (p.act == 9) return (void *)bias_act_kernel<T, 9>;\n  return NULL;\n}\n\n//------------------------------------------------------------------------\n\nstatic bool has_same_layout(torch::Tensor x, torch::Tensor y) {\n  if (x.dim() != y.dim()) return false;\n  for (int64_t i = 0; i < x.dim(); i++) {\n    if (x.size(i) != y.size(i)) return false;\n    if (x.size(i) >= 2 && x.stride(i) != y.stride(i)) return false;\n  }\n  return true;\n}\n\n//------------------------------------------------------------------------\ntorch::Tensor bias_act_op(const torch::Tensor &x, const torch::Tensor &b,\n                          const torch::Tensor &xref, const torch::Tensor &yref,\n                          const torch::Tensor &dy, int grad, int dim, int act,\n                          float alpha, float gain, float clamp) {\n  // Validate arguments.\n  TORCH_CHECK(x.is_privateuseone(), \"x must reside on MUSA device\");\n  TORCH_CHECK(\n      b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()),\n      \"b must have the same dtype and device as x\");\n  TORCH_CHECK(xref.numel() == 0 ||\n                  (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() &&\n                   xref.device() == x.device()),\n              \"xref must have the same 
shape, dtype, and device as x\");\n  TORCH_CHECK(yref.numel() == 0 ||\n                  (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() &&\n                   yref.device() == x.device()),\n              \"yref must have the same shape, dtype, and device as x\");\n  TORCH_CHECK(\n      dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() &&\n                          dy.device() == x.device()),\n      \"dy must have the same dtype and device as x\");\n  TORCH_CHECK(x.numel() <= INT_MAX, \"x is too large\");\n  TORCH_CHECK(b.dim() == 1, \"b must have rank 1\");\n  TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()),\n              \"dim is out of bounds\");\n  TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim),\n              \"b has wrong number of elements\");\n  TORCH_CHECK(grad >= 0, \"grad must be non-negative\");\n\n  // Validate layout.\n  TORCH_CHECK(x.is_non_overlapping_and_dense(),\n              \"x must be non-overlapping and dense\");\n  TORCH_CHECK(b.is_contiguous(), \"b must be contiguous\");\n  TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x),\n              \"xref must have the same layout as x\");\n  TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x),\n              \"yref must have the same layout as x\");\n  TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x),\n              \"dy must have the same layout as x\");\n\n  // Create output tensor.\n  const at::musa::OptionalMUSAGuard device_guard(device_of(x));\n  torch::Tensor y = torch::empty_like(x);\n  TORCH_CHECK(has_same_layout(y, x), \"y must have the same layout as x\");\n\n  // Initialize MUSA kernel parameters.\n  bias_act_kernel_params p;\n  p.x = x.data_ptr();\n  p.b = (b.numel()) ? b.data_ptr() : NULL;\n  p.xref = (xref.numel()) ? xref.data_ptr() : NULL;\n  p.yref = (yref.numel()) ? yref.data_ptr() : NULL;\n  p.dy = (dy.numel()) ? 
dy.data_ptr() : NULL;\n  p.y = y.data_ptr();\n  p.grad = grad;\n  p.act = act;\n  p.alpha = alpha;\n  p.gain = gain;\n  p.clamp = clamp;\n  p.sizeX = (int)x.numel();\n  p.sizeB = (int)b.numel();\n  p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;\n\n  // Choose MUSA kernel.\n  void *kernel;\n  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), \"upfirdn2d_musa\", [&] {\n    kernel = choose_bias_act_kernel<scalar_t>(p);\n  });\n  TORCH_CHECK(kernel, \"no MUSA kernel found for the specified activation func\");\n\n  // Launch MUSA kernel.\n  p.loopX = 4;\n  int blockSize = 4 * 32;\n  int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;\n  void *args[] = {&p};\n#ifdef MMCV_WITH_HIP\n  AT_MUSA_CHECK(hipLaunchKernel(kernel, gridSize, blockSize, args, 0,\n                                c10::musa::getCurrentMUSAStream()));\n#else\n  AT_MUSA_CHECK(musaLaunchKernel(kernel, gridSize, blockSize, args, 0,\n                                 c10::musa::getCurrentMUSAStream()));\n#endif\n\n  return y;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/border_align_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"border_align_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid BorderAlignForwardMUSAKernelLauncher(const Tensor &input,\n                                          const Tensor &boxes, Tensor output,\n                                          Tensor argmax_idx,\n                                          const int pool_size) {\n  // shape assertion\n  AT_ASSERTM(input.ndimension() == 4,\n             \"non-empty 4D(batch mode) tensor expected for input feature\");\n  AT_ASSERTM(boxes.ndimension() == 3,\n             \"boxes must be 3D tensor with size of [B, H*W, 4]\");\n\n  int batch_size = input.size(0);\n  int feat_channels = input.size(1);\n  int channels = feat_channels / 4;\n  int height = input.size(2);\n  int width = input.size(3);\n  // shape [N, box_size, 4] for boxes. (x1, y1, x2, y2) format\n  int box_size = boxes.size(1);\n  // shape [N, channels, box_size, 4] for output\n  int nthreads = batch_size * channels * box_size;\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  dim3 block(128, 4);\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"border_align_forward_musa_kernel\", [&] {\n        border_align_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(\n                nthreads, input.data_ptr<scalar_t>(),\n                boxes.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                argmax_idx.data_ptr<int>(), channels, box_size, height, width,\n                pool_size);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid BorderAlignBackwardMUSAKernelLauncher(const Tensor &grad_output,\n                                           const Tensor &boxes,\n                                           const Tensor &argmax_idx,\n                                           Tensor grad_input,\n                                    
       const int pool_size) {\n  int batch_size = grad_input.size(0);\n  int feat_channels = grad_input.size(1);\n  int channels = feat_channels / 4;\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n  int box_size = boxes.size(1);\n  int nthreads = batch_size * channels * box_size;\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  dim3 block(128, 4);\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"border_align_backward_musa_kernel\", [&] {\n        border_align_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(nthreads), block, 0, stream>>>(\n                nthreads, grad_output.data_ptr<scalar_t>(),\n                boxes.data_ptr<scalar_t>(), argmax_idx.data_ptr<int>(),\n                grad_input.data_ptr<scalar_t>(), channels, box_size, height,\n                width, pool_size);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/box_iou_quadri_musa.mu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"box_iou_quadri_musa.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid box_iou_quadri_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned) {\n  using scalar_t = float;\n  AT_ASSERTM(boxes1.is_privateuseone(), \"boxes1 must be a MUSA tensor\");\n  AT_ASSERTM(boxes2.is_privateuseone(), \"boxes2 must be a MUSA tensor\");\n\n  int output_size = ious.numel();\n  int num_boxes1 = boxes1.size(0);\n  int num_boxes2 = boxes2.size(0);\n\n  c10::musa::MUSAGuard device_guard(boxes1.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  box_iou_quadri_musa_kernel<scalar_t>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),\n          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),\n          mode_flag, aligned);\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/box_iou_rotated_musa.mu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu\n#include \"box_iou_rotated_musa.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid box_iou_rotated_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned) {\n  using scalar_t = float;\n  AT_ASSERTM(boxes1.is_privateuseone(), \"boxes1 must be a MUSA tensor\");\n  AT_ASSERTM(boxes2.is_privateuseone(), \"boxes2 must be a MUSA tensor\");\n\n  int output_size = ious.numel();\n  int num_boxes1 = boxes1.size(0);\n  int num_boxes2 = boxes2.size(0);\n\n  c10::musa::MUSAGuard device_guard(boxes1.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  box_iou_rotated_musa_kernel<scalar_t>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),\n          boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),\n          mode_flag, aligned);\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/carafe_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"carafe_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\n#include <iostream>\n\n#if MUSA_ARCH > 21\nvoid CARAFEForwardMUSAKernelLauncher(const Tensor features, const Tensor masks,\n                                     Tensor rfeatures, Tensor routput,\n                                     Tensor rmasks, Tensor output,\n                                     const int kernel_size,\n                                     const int group_size,\n                                     const int scale_factor) {\n  const int batch_size = output.size(0);\n  const int channels = output.size(1);\n  const int output_height = output.size(2);\n  const int output_width = output.size(3);\n\n  const int input_height = features.size(2);\n  const int input_width = features.size(3);\n\n  const int mask_channels = masks.size(1);\n\n  rfeatures.resize_({batch_size, input_height, input_width, channels});\n  routput.resize_({batch_size, output_height, output_width, channels});\n  rmasks.resize_({batch_size, output_height, output_width, mask_channels});\n\n  // one warp per pixel\n  c10::musa::MUSAGuard device_guard(features.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"NCHW2NHWC_Feature\", ([&] {\n        const scalar_t *bottom_data = features.data_ptr<scalar_t>();\n        scalar_t *top_data = rfeatures.data_ptr<scalar_t>();\n        const int dh = divideUP(channels, kTileDim);\n        const int dw = divideUP(input_height * input_width, kTileDim);\n        BatchTranspose2DMUSAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, channels, input_height * input_width, dh, dw,\n                bottom_data, top_data);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"NCHW2NHWC_Masks\", ([&] {\n        const 
scalar_t *bottom_data = masks.data_ptr<scalar_t>();\n        scalar_t *top_data = rmasks.data_ptr<scalar_t>();\n        const int dh = divideUP(mask_channels, kTileDim);\n        const int dw = divideUP(output_height * output_width, kTileDim);\n        BatchTranspose2DMUSAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, mask_channels, output_height * output_width, dh, dw,\n                bottom_data, top_data);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"CARAFELaucherForward\", ([&] {\n        const int num_kernels =\n            batch_size * output_height * output_width * THREADS_PER_PIXEL;\n        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();\n        const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();\n        scalar_t *top_data = routput.data_ptr<scalar_t>();\n        CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),\n                                  THREADS_PER_BLOCK, 0, stream>>>(\n            num_kernels, bottom_data, bottom_masks, kernel_size, group_size,\n            scale_factor, channels, input_height, input_width, output_height,\n            output_width, mask_channels, top_data);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features.scalar_type(), \"NHWC2NCHW\", ([&] {\n        const scalar_t *bottom_data = routput.data_ptr<scalar_t>();\n        scalar_t *top_data = output.data_ptr<scalar_t>();\n        const int dh = divideUP(output_height * output_width, kTileDim);\n        const int dw = divideUP(channels, kTileDim);\n        BatchTranspose2DMUSAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, output_height * output_width, channels, dh, dw,\n                bottom_data, top_data);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid CARAFEBackwardMUSAKernelLauncher(\n    const Tensor 
top_grad, const Tensor rfeatures, const Tensor masks,\n    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,\n    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,\n    const int kernel_size, const int group_size, const int scale_factor) {\n  const int batch_size = top_grad.size(0);\n  const int channels = top_grad.size(1);\n  const int output_height = top_grad.size(2);\n  const int output_width = top_grad.size(3);\n\n  const int input_height = bottom_grad.size(2);\n  const int input_width = bottom_grad.size(3);\n\n  const int mask_channels = masks.size(1);\n\n  rtop_grad.resize_({batch_size, output_height, output_width, channels});\n  rbottom_grad.resize_({batch_size, input_height, input_width, channels});\n  rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});\n  rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});\n\n  c10::musa::MUSAGuard device_guard(top_grad.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"NCHW2NHWC_Top_Grad\", ([&] {\n        const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();\n        scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();\n        const int dh = divideUP(channels, kTileDim);\n        const int dw = divideUP(output_height * output_width, kTileDim);\n        BatchTranspose2DMUSAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, channels, output_height * output_width, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"CARAFELaucherBackward_Feature\", ([&] {\n        const int num_kernels =\n            batch_size * output_height * output_width * THREADS_PER_PIXEL;\n        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();\n        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();\n   
     scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();\n\n        CARAFEBackward_Feature<scalar_t>\n            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,\n               stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,\n                         group_size, scale_factor, channels, input_height,\n                         input_width, output_height, output_width,\n                         mask_channels, bottom_diff);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"FeatureSum\", ([&] {\n        const int num_kernels =\n            batch_size * input_height * input_width * THREADS_PER_PIXEL;\n        const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();\n        scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();\n\n        FeatureSum<scalar_t>\n            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,\n               stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,\n                         input_height, input_width, bottom_diff);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"NHWC2NCHW_Bottom_Grad\", ([&] {\n        const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();\n        scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();\n        const int dh = divideUP(input_height * input_width, kTileDim);\n        const int dw = divideUP(channels, kTileDim);\n        BatchTranspose2DMUSAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, input_height * input_width, channels, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"CARAFELaucherBackward_Mask\", ([&] {\n        const int num_kernels = batch_size * output_height * output_width *\n                                mask_channels * WARP_SIZE;\n        const scalar_t 
*top_diff = rtop_grad.data_ptr<scalar_t>();\n        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();\n        scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();\n\n        CARAFEBackward_Mask<scalar_t>\n            <<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,\n               stream>>>(num_kernels, top_diff, bottom_data, kernel_size,\n                         group_size, scale_factor, channels, input_height,\n                         input_width, output_height, output_width,\n                         mask_channels, mask_diff);\n      }));\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"NHWC2NCHW_Mask_Grad\", ([&] {\n        const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();\n        scalar_t *top_data = mask_grad.data_ptr<scalar_t>();\n        const int dh = divideUP(output_height * output_width, kTileDim);\n        const int dw = divideUP(mask_channels, kTileDim);\n        BatchTranspose2DMUSAKernel<scalar_t>\n            <<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(\n                batch_size, output_height * output_width, mask_channels, dh, dw,\n                bottom_data, top_data);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n#endif //MUSA_ARCH\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/carafe_naive_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"carafe_naive_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid CARAFENAIVEForwardMUSAKernelLauncher(const Tensor features,\n                                          const Tensor masks, Tensor output,\n                                          const int kernel_size,\n                                          const int group_size,\n                                          const int scale_factor) {\n  int output_size = output.numel();\n  int channels = output.size(1);\n  int height = output.size(2);\n  int width = output.size(3);\n\n  c10::musa::MUSAGuard device_guard(features.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      features.scalar_type(), \"CARAFENAIVEForward\", ([&] {\n        carafe_naive_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, features.data_ptr<scalar_t>(),\n                masks.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                kernel_size, group_size, scale_factor, channels, height, width);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid CARAFENAIVEBackwardMUSAKernelLauncher(\n    const Tensor top_grad, const Tensor features, const Tensor masks,\n    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,\n    const int group_size, const int scale_factor) {\n  int output_size = top_grad.numel();\n  int channels = top_grad.size(1);\n  int height = top_grad.size(2);\n  int width = top_grad.size(3);\n\n  c10::musa::MUSAGuard device_guard(top_grad.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      top_grad.scalar_type(), \"CARAFENAIVEBackward\", ([&] {\n        carafe_naive_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, 
top_grad.data_ptr<scalar_t>(),\n                features.data_ptr<scalar_t>(), masks.data_ptr<scalar_t>(),\n                bottom_grad.data_ptr<scalar_t>(),\n                mask_grad.data_ptr<scalar_t>(), kernel_size, group_size,\n                scale_factor, channels, height, width);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/chamfer_distance_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp\n#include \"chamfer_distance_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n#if MUSA_ARCH > 21\nvoid ChamferDistanceForwardMUSAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,\n    const Tensor dist2, const Tensor idx1, const Tensor idx2) {\n  int batch_size = xyz1.size(0);\n  int n = xyz1.size(1);\n  int m = xyz2.size(1);\n\n  c10::musa::MUSAGuard device_guard(xyz1.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz1.scalar_type(), \"chamfer_distance_forward_musa_kernel\", [&] {\n        chamfer_distance_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(\n                batch_size, n, xyz1.data_ptr<scalar_t>(), m,\n                xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),\n                idx1.data_ptr<int>());\n      });\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz1.scalar_type(), \"chamfer_distance_forward_musa_kernel\", [&] {\n        chamfer_distance_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(\n                batch_size, m, xyz2.data_ptr<scalar_t>(), n,\n                xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),\n                idx2.data_ptr<int>());\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ChamferDistanceBackwardMUSAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,\n    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2) {\n  int batch_size = xyz1.size(0);\n  int n = xyz1.size(1);\n  int m = xyz2.size(1);\n\n  c10::musa::MUSAGuard device_guard(xyz1.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  
AT_DISPATCH_FLOATING_TYPES(\n      xyz1.scalar_type(), \"chamfer_distance_backward_musa_kernel\", [&] {\n        chamfer_distance_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                batch_size, m, xyz1.data_ptr<scalar_t>(), n,\n                xyz2.data_ptr<scalar_t>(), grad_dist1.data_ptr<scalar_t>(),\n                idx1.data_ptr<int>(), grad_xyz1.data_ptr<scalar_t>(),\n                grad_xyz2.data_ptr<scalar_t>());\n      });\n  AT_DISPATCH_FLOATING_TYPES(\n      xyz1.scalar_type(), \"chamfer_distance_backward_musa_kernel\", [&] {\n        chamfer_distance_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                batch_size, n, xyz2.data_ptr<scalar_t>(), m,\n                xyz1.data_ptr<scalar_t>(), grad_dist2.data_ptr<scalar_t>(),\n                idx2.data_ptr<int>(), grad_xyz2.data_ptr<scalar_t>(),\n                grad_xyz1.data_ptr<scalar_t>());\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n#else\n#warning \"chamfer_distance is supported when MUSA_ARCH > 21\"\n#endif  //MUSA_ARCH\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/convex_iou.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu\n#include \"convex_iou_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid ConvexIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                 Tensor ious) {\n  int output_size = ious.numel();\n  int num_pointsets = pointsets.size(0);\n  int num_polygons = polygons.size(0);\n\n  c10::musa::MUSAGuard device_guard(pointsets.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      pointsets.scalar_type(), \"convex_iou_musa_kernel\", ([&] {\n        convex_iou_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),\n                polygons.data_ptr<scalar_t>(), ious.data_ptr<scalar_t>());\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ConvexGIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                  Tensor output) {\n  int output_size = output.numel();\n  int num_pointsets = pointsets.size(0);\n  int num_polygons = polygons.size(0);\n\n  c10::musa::MUSAGuard device_guard(pointsets.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      pointsets.scalar_type(), \"convex_giou_musa_kernel\", ([&] {\n        convex_giou_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(\n                num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),\n                polygons.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/correlation_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu\n// Original licence: Under MIT License\n\n#include \"correlation_musa.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid CorrelationForwardMUSAKernelLauncher(Tensor input1, Tensor input2,\n                                          Tensor output, int kH, int kW,\n                                          int patchH, int patchW, int padH,\n                                          int padW, int dilationH,\n                                          int dilationW, int dilation_patchH,\n                                          int dilation_patchW, int dH, int dW) {\n  const int batch_size = input1.size(0);\n  const int iH = input1.size(2);\n  const int iW = input1.size(3);\n  const int dilatedKH = (kH - 1) * dilationH + 1;\n  const int dilatedKW = (kW - 1) * dilationW + 1;\n\n  const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;\n  const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;\n\n  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();\n  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();\n\n  const dim3 threads(WARP_SIZE, 4, 4);\n  const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);\n\n  c10::musa::MUSAGuard device_guard(input1.device());\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input1.scalar_type(), \"correlation_forward_musa\", ([&] {\n        TensorAcc4R trInput1_acc =\n            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R trInput2_acc =\n            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc5R output_acc =\n            output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();\n\n        correlation_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, c10::musa::getCurrentMUSAStream()>>>(\n                trInput1_acc, 
trInput2_acc, output_acc, kH, kW, patchH, patchW,\n                padH, padW, dilationH, dilationW, dilation_patchH,\n                dilation_patchW, dH, dW, oH, oW);\n      }));\n}\n\nvoid CorrelationBackwardMUSAKernelLauncher(\n    Tensor grad_output, Tensor input1, Tensor input2, Tensor grad_input1,\n    Tensor grad_input2, int kH, int kW, int patchH, int patchW, int padH,\n    int padW, int dilationH, int dilationW, int dilation_patchH,\n    int dilation_patchW, int dH, int dW) {\n  const int batch_size = input1.size(0);\n  const int iH = input1.size(2);\n  const int iW = input1.size(3);\n  const int C = input1.size(1);\n\n  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();\n  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();\n  const dim3 blocks(batch_size, iH, iW);\n  const dim3 threads(THREADS_PER_BLOCK);\n\n  c10::musa::MUSAGuard device_guard(input1.device());\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input1.scalar_type(), \"correlation_backward_musa\", ([&] {\n        const int grad_cache_size = patchH * patchW * sizeof(scalar_t);\n        TensorAcc4R input1_acc =\n            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R input2_acc =\n            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R grad_input1_acc =\n            grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc4R grad_input2_acc =\n            grad_input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();\n        TensorAcc5R grad_output_acc =\n            grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();\n\n        correlation_backward_musa_kernel_input1<scalar_t>\n            <<<blocks, threads, grad_cache_size,\n               c10::musa::getCurrentMUSAStream()>>>(\n                grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,\n                patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n                
dilation_patchW, dH, dW);\n\n        correlation_backward_musa_kernel_input2<scalar_t>\n            <<<blocks, threads, grad_cache_size,\n               c10::musa::getCurrentMUSAStream()>>>(\n                grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,\n                patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n                dilation_patchW, dH, dW);\n      }));\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/deform_conv_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"deform_conv_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid deformable_im2col_musa(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col) {\n  // num_axes should be smaller than block size\n  // todo: check parallel_imgs is correctly passed in\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels = channels * height_col * width_col * parallel_imgs;\n  int channel_per_deformable_group = channels / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_im.scalar_type(), \"deformable_im2col_gpu\", ([&] {\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n\n        deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels),\n                                       THREADS_PER_BLOCK, 0,\n                                       c10::musa::getCurrentMUSAStream()>>>(\n            num_kernels, data_im_, data_offset_, height, width, ksize_h,\n            ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,\n            channel_per_deformable_group, parallel_imgs, channels,\n            deformable_group, height_col, width_col, data_col_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid 
deformable_col2im_musa(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im) {\n  // todo: make sure parallel_imgs is passed in correctly\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels =\n      channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;\n  int channel_per_deformable_group = channels / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"deformable_col2im_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();\n\n        deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels),\n                                       THREADS_PER_BLOCK, 0,\n                                       c10::musa::getCurrentMUSAStream()>>>(\n            num_kernels, data_col_, data_offset_, channels, height, width,\n            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,\n            dilation_w, channel_per_deformable_group, parallel_imgs,\n            deformable_group, height_col, width_col, grad_im_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid deformable_col2im_coord_musa(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, 
const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset) {\n  int height_col =\n      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;\n  int width_col =\n      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;\n  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *\n                    deformable_group * parallel_imgs;\n  int channel_per_deformable_group =\n      channels * ksize_h * ksize_w / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"deformable_col2im_coord_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();\n\n        deformable_col2im_coord_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            c10::musa::getCurrentMUSAStream()>>>(\n            num_kernels, data_col_, data_im_, data_offset_, channels, height,\n            width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,\n            dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,\n            2 * ksize_h * ksize_w * deformable_group, deformable_group,\n            height_col, width_col, grad_offset_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/deform_roi_pool_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"deform_roi_pool_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid DeformRoIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,\n                                            Tensor offset, Tensor output,\n                                            int pooled_height, int pooled_width,\n                                            float spatial_scale,\n                                            int sampling_ratio, float gamma) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      input.scalar_type(), \"deform_roi_pool_forward_musa_kernel\", [&] {\n        deform_roi_pool_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(),\n                output.data_ptr<scalar_t>(), pooled_height, pooled_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio,\n                static_cast<scalar_t>(gamma), channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid DeformRoIPoolBackwardMUSAKernelLauncher(\n    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,\n    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,\n    float spatial_scale, int sampling_ratio, float gamma) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      
grad_output.scalar_type(), \"deform_roi_pool_backward_musa_kernel\", [&] {\n        deform_roi_pool_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),\n                offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n                grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio,\n                static_cast<scalar_t>(gamma), channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/diff_iou_rotated_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Adapted from\n// https://github.com/lilanxiao/Rotated_IoU/musa_op/sort_vert_kernel.cu  # noqa\n#include \"diff_iou_rotated_musa_kernel.muh\"\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_musa_helper.hpp\"\n\nat::Tensor DiffIoURotatedSortVerticesMUSAKernelLauncher(at::Tensor vertices,\n                                                        at::Tensor mask,\n                                                        at::Tensor num_valid) {\n  c10::musa::MUSAGuard device_guard(vertices.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  CHECK_CONTIGUOUS(vertices);\n  CHECK_CONTIGUOUS(mask);\n  CHECK_CONTIGUOUS(num_valid);\n  CHECK_MUSA(vertices);\n  CHECK_MUSA(mask);\n  CHECK_MUSA(num_valid);\n\n  int b = vertices.size(0);\n  int n = vertices.size(1);\n  int m = vertices.size(2);\n  at::Tensor idx =\n      torch::zeros({b, n, MAX_NUM_VERT_IDX},\n                   at::device(vertices.device()).dtype(at::ScalarType::Int));\n\n  diff_iou_rotated_sort_vertices_forward_musa_kernel<<<b, opt_n_thread(n), 0,\n                                                       stream>>>(\n      b, n, m, vertices.data_ptr<float>(), mask.data_ptr<bool>(),\n      num_valid.data_ptr<int>(), idx.data_ptr<int>());\n  AT_MUSA_CHECK(musaGetLastError());\n\n  return idx;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/filtered_lrelu.mu",
    "content": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n#include <c10/util/Half.h>\n#include <musa_runtime.h>\n#include <torch/types.h>\n\n#include <cstdint>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\n//------------------------------------------------------------------------\n// MUSA kernel parameters.\n\nstruct filtered_lrelu_kernel_params {\n  // These parameters decide which kernel to use.\n  int up;        // upsampling ratio (1, 2, 4)\n  int down;      // downsampling ratio (1, 2, 4)\n  int2 fuShape;  // [size, 1] | [size, size]\n  int2 fdShape;  // [size, 1] | [size, size]\n\n  int _dummy;  // Alignment.\n\n  // Rest of the parameters.\n  const void *x;     // Input tensor.\n  void *y;           // Output tensor.\n  const void *b;     // Bias tensor.\n  unsigned char *s;  // Sign tensor in/out. 
NULL if unused.\n  const float *fu;   // Upsampling filter.\n  const float *fd;   // Downsampling filter.\n\n  int2 pad0;    // Left/top padding.\n  float gain;   // Additional gain factor.\n  float slope;  // Leaky ReLU slope on negative side.\n  float clamp;  // Clamp after nonlinearity.\n  int flip;     // Filter kernel flip for gradient computation.\n\n  int tilesXdim;  // Original number of horizontal output tiles.\n  int tilesXrep;  // Number of horizontal tiles per CTA.\n  int blockZofs;  // Block z offset to support large minibatch, channel\n                  // dimensions.\n\n  int4 xShape;  // [width, height, channel, batch]\n  int4 yShape;  // [width, height, channel, batch]\n  int2 sShape;  // [width, height] - width is in bytes. Contiguous. Zeros if\n                // unused.\n  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.\n  int swLimit;  // Active width of sign tensor in bytes.\n\n  longlong4 xStride;   // Strides of all tensors except signs, same component\n                       // order as shapes.\n  longlong4 yStride;   //\n  int64_t bStride;     //\n  longlong3 fuStride;  //\n  longlong3 fdStride;  //\n};\n\nstruct filtered_lrelu_act_kernel_params {\n  void *x;           // Input/output, modified in-place.\n  unsigned char *s;  // Sign tensor in/out. NULL if unused.\n\n  float gain;   // Additional gain factor.\n  float slope;  // Leaky ReLU slope on negative side.\n  float clamp;  // Clamp after nonlinearity.\n\n  int4 xShape;        // [width, height, channel, batch]\n  longlong4 xStride;  // Input/output tensor strides, same order as in shape.\n  int2 sShape;  // [width, height] - width is in elements. Contiguous. 
Zeros if\n                // unused.\n  int2 sOfs;  // [ofs_x, ofs_y] - offset between upsampled data and sign tensor.\n};\n\n//------------------------------------------------------------------------\n// MUSA kernel specialization.\n\nstruct filtered_lrelu_kernel_spec {\n  void *setup;   // Function for filter kernel setup.\n  void *exec;    // Function for main operation.\n  int2 tileOut;  // Width/height of launch tile.\n  int numWarps;  // Number of warps per thread block, determines launch block\n                 // size.\n  int xrep;      // For processing multiple horizontal tiles per thread block.\n  int dynamicSharedKB;  // How much dynamic shared memory the exec kernel wants.\n};\n\n//------------------------------------------------------------------------\n// MUSA kernel selection.\n\ntemplate <class T, class index_t, bool signWrite, bool signRead>\nfiltered_lrelu_kernel_spec choose_filtered_lrelu_kernel(\n    const filtered_lrelu_kernel_params &p, int sharedKB);\ntemplate <class T, bool signWrite, bool signRead>\nvoid *choose_filtered_lrelu_act_kernel(void);\n\n//------------------------------------------------------------------------\n// Helpers.\n\nenum              // Filter modes.\n{ MODE_SUSD = 0,  // Separable upsampling, separable downsampling.\n  MODE_FUSD = 1,  // Full upsampling, separable downsampling.\n  MODE_SUFD = 2,  // Separable upsampling, full downsampling.\n  MODE_FUFD = 3,  // Full upsampling, full downsampling.\n};\n\ntemplate <class T>\nstruct InternalType;\ntemplate <>\nstruct InternalType<double> {\n  typedef double scalar_t;\n  typedef double2 vec2_t;\n  typedef double4 vec4_t;\n  __device__ __forceinline__ static vec2_t zero_vec2(void) {\n    return make_double2(0, 0);\n  }\n  __device__ __forceinline__ static vec4_t zero_vec4(void) {\n    return make_double4(0, 0, 0, 0);\n  }\n  __device__ __forceinline__ static double clamp(double x, double c) {\n    return fmin(fmax(x, -c), c);\n  }\n};\ntemplate <>\nstruct 
InternalType<float> {\n  typedef float scalar_t;\n  typedef float2 vec2_t;\n  typedef float4 vec4_t;\n  __device__ __forceinline__ static vec2_t zero_vec2(void) {\n    return make_float2(0, 0);\n  }\n  __device__ __forceinline__ static vec4_t zero_vec4(void) {\n    return make_float4(0, 0, 0, 0);\n  }\n  __device__ __forceinline__ static float clamp(float x, float c) {\n    return fminf(fmaxf(x, -c), c);\n  }\n};\ntemplate <>\nstruct InternalType<c10::Half> {\n  typedef float scalar_t;\n  typedef float2 vec2_t;\n  typedef float4 vec4_t;\n  __device__ __forceinline__ static vec2_t zero_vec2(void) {\n    return make_float2(0, 0);\n  }\n  __device__ __forceinline__ static vec4_t zero_vec4(void) {\n    return make_float4(0, 0, 0, 0);\n  }\n  __device__ __forceinline__ static float clamp(float x, float c) {\n    return fminf(fmaxf(x, -c), c);\n  }\n};\n\n#define MIN(A, B) ((A) < (B) ? (A) : (B))\n#define MAX(A, B) ((A) > (B) ? (A) : (B))\n#define CEIL_DIV(A, B)                                   \\\n  (((B) == 1)                                            \\\n       ? (A)                                             \\\n       : ((B) == 2) ? ((int)((A) + 1) >> 1)              \\\n                    : ((B) == 4) ? ((int)((A) + 3) >> 2) \\\n                                 : (((A) + ((A) > 0 ? 
(B)-1 : 0)) / (B)))\n\n// This works only up to blocks of size 256 x 256 and for all N that are powers\n// of two.\ntemplate <int N>\n__device__ __forceinline__ void fast_div_mod(int &x, int &y, unsigned int i) {\n  if ((N & (N - 1)) && N <= 256)\n    y = (i * ((1 << 24) / N + 1)) >> 24;  // Assumes N <= 256, i < N*256.\n  else\n    y = i / N;\n\n  x = i - y * N;\n}\n\n// Type cast stride before reading it.\ntemplate <class T>\n__device__ __forceinline__ T get_stride(const int64_t &x) {\n  return *reinterpret_cast<const T *>(&x);\n}\n\n//------------------------------------------------------------------------\n// Filters, setup kernel, copying function.\n\n#define MAX_FILTER_SIZE 32\n\n// Combined up/down filter buffers so that transfer can be done with one copy.\n__device__ float\n    g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE];  // Filters in global memory,\n                                                    // written by setup kernel.\n__device__ __constant__ float\n    c_fbuf[2 * MAX_FILTER_SIZE *\n           MAX_FILTER_SIZE];  // Filters in constant memory, read by main\n                              // kernel.\n\n// Accessors to combined buffers to index up/down filters individually.\n#define c_fu (c_fbuf)\n#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)\n#define g_fu (g_fbuf)\n#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)\n\n// Set up filters into global memory buffer.\nstatic __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p) {\n  for (int idx = threadIdx.x; idx < MAX_FILTER_SIZE * MAX_FILTER_SIZE;\n       idx += blockDim.x) {\n    int x, y;\n    fast_div_mod<MAX_FILTER_SIZE>(x, y, idx);\n\n    int fu_x = p.flip ? x : (p.fuShape.x - 1 - x);\n    int fu_y = p.flip ? y : (p.fuShape.y - 1 - y);\n    if (p.fuShape.y > 0)\n      g_fu[idx] = (x >= p.fuShape.x || y >= p.fuShape.y)\n                      ? 
0.0f\n                      : p.fu[fu_x * p.fuStride.x + fu_y * p.fuStride.y];\n    else\n      g_fu[idx] =\n          (x >= p.fuShape.x || y > 0) ? 0.0f : p.fu[fu_x * p.fuStride.x];\n\n    int fd_x = p.flip ? x : (p.fdShape.x - 1 - x);\n    int fd_y = p.flip ? y : (p.fdShape.y - 1 - y);\n    if (p.fdShape.y > 0)\n      g_fd[idx] = (x >= p.fdShape.x || y >= p.fdShape.y)\n                      ? 0.0f\n                      : p.fd[fd_x * p.fdStride.x + fd_y * p.fdStride.y];\n    else\n      g_fd[idx] =\n          (x >= p.fdShape.x || y > 0) ? 0.0f : p.fd[fd_x * p.fdStride.x];\n  }\n}\n\n// Host function to copy filters written by setup kernel into constant buffer\n// for main kernel.\nstatic musaError_t copy_filters(musaStream_t stream) {\n  void *src = 0;\n  musaError_t err = musaGetSymbolAddress(&src, g_fbuf);\n  if (err) return err;\n  return musaMemcpyToSymbolAsync(\n      c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0,\n      musaMemcpyDeviceToDevice, stream);\n}\n\n//------------------------------------------------------------------------\n// Coordinate spaces:\n// - Relative to input tensor:      inX, inY, tileInX, tileInY\n// - Relative to input tile:        relInX, relInY, tileInW, tileInH\n// - Relative to upsampled tile:    relUpX, relUpY, tileUpW, tileUpH\n// - Relative to output tile:       relOutX, relOutY, tileOutW, tileOutH\n// - Relative to output tensor:     outX, outY, tileOutX, tileOutY\n//\n// Relationships between coordinate spaces:\n// - inX = tileInX + relInX\n// - inY = tileInY + relInY\n// - relUpX = relInX * up + phaseInX\n// - relUpY = relInY * up + phaseInY\n// - relUpX = relOutX * down\n// - relUpY = relOutY * down\n// - outX = tileOutX + relOutX\n// - outY = tileOutY + relOutY\n\nextern __shared__ char\n    s_buf_raw[];  // When sharedKB <= 48, allocate shared memory statically\n                  // inside the kernel, otherwise use the externally allocated\n                  // shared memory buffer.\n\ntemplate 
<class T, class index_t, int sharedKB, bool signWrite, bool signRead,\n          int filterMode, int up, int fuSize, int down, int fdSize,\n          int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep,\n          bool enableWriteSkip>\nstatic __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p) {\n  // Check that we don't try to support non-existing filter modes.\n  static_assert(up == 1 || up == 2 || up == 4,\n                \"only up=1, up=2, up=4 scales supported\");\n  static_assert(down == 1 || down == 2 || down == 4,\n                \"only down=1, down=2, down=4 scales supported\");\n  static_assert(fuSize >= up,\n                \"upsampling filter size must be at least upsampling factor\");\n  static_assert(\n      fdSize >= down,\n      \"downsampling filter size must be at least downsampling factor\");\n  static_assert(\n      fuSize % up == 0,\n      \"upsampling filter size must be divisible with upsampling factor\");\n  static_assert(\n      fdSize % down == 0,\n      \"downsampling filter size must be divisible with downsampling factor\");\n  static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE,\n                \"filter size greater than MAX_FILTER_SIZE\");\n  static_assert(up != 1 || (fuSize == 1 && (filterMode == MODE_FUFD ||\n                                            filterMode == MODE_FUSD)),\n                \"up=1 supported only for 1x1 full filters\");\n  static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD ||\n                                              filterMode == MODE_SUFD)),\n                \"down=1 supported only for 1x1 full filters\");\n  static_assert(\n      !(up == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)),\n      \"full filters not supported for up=4\");\n  static_assert(\n      !(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)),\n      \"full filters not supported for down=4\");\n\n  // Static definitions.\n  typedef typename 
InternalType<T>::scalar_t scalar_t;\n  typedef typename InternalType<T>::vec2_t vec2_t;\n  typedef typename InternalType<T>::vec4_t vec4_t;\n  const int tileUpW = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) &\n                      ~3;  // Upsampled tile width, rounded up to multiple of 4.\n  const int tileUpH =\n      tileOutH * down + (fdSize - 1) - (down - 1);  // Upsampled tile height.\n  const int tileInW =\n      CEIL_DIV(tileUpW + (fuSize - 1), up);  // Input tile width.\n  const int tileInH =\n      CEIL_DIV(tileUpH + (fuSize - 1), up);  // Input tile height.\n  const int tileUpH_up =\n      CEIL_DIV(tileUpH, up) *\n      up;  // Upsampled tile height rounded up to a multiple of up.\n  const int tileInH_up =\n      CEIL_DIV(tileUpH_up + (fuSize - 1),\n               up);  // For allocations only, to avoid shared memory read\n                     // overruns with up=2 and up=4.\n\n  // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.\n  const bool downInline =\n      (down == 1) && ((up == 1 && filterMode == MODE_FUFD) ||\n                      (up == 2 && filterMode == MODE_SUFD));\n\n  // Sizes of logical buffers.\n  const int szIn = tileInH_up * tileInW;\n  const int szUpX = tileInH_up * tileUpW;\n  const int szUpXY = downInline ? 0 : (tileUpH * tileUpW);\n  const int szDownX = tileUpH * tileOutW;\n\n  // Sizes for shared memory arrays.\n  const int s_buf0_size_base =\n      (filterMode == MODE_SUSD)\n          ? MAX(szIn, szUpXY)\n          : (filterMode == MODE_FUSD)\n                ? MAX(szIn, szDownX)\n                : (filterMode == MODE_SUFD)\n                      ? MAX(szIn, szUpXY)\n                      : (filterMode == MODE_FUFD) ? szIn : -1;\n  const int s_buf1_size_base =\n      (filterMode == MODE_SUSD)\n          ? MAX(szUpX, szDownX)\n          : (filterMode == MODE_FUSD)\n                ? szUpXY\n                : (filterMode == MODE_SUFD)\n                      ? 
szUpX\n                      : (filterMode == MODE_FUFD) ? szUpXY : -1;\n\n  // Ensure U128 alignment.\n  const int s_buf0_size = (s_buf0_size_base + 3) & ~3;\n  const int s_buf1_size = (s_buf1_size_base + 3) & ~3;\n\n  // Check at compile time that we don't use too much shared memory.\n  static_assert(\n      (s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10),\n      \"shared memory overflow\");\n\n  // Declare shared memory arrays.\n  scalar_t *s_buf0;\n  scalar_t *s_buf1;\n  if (sharedKB <= 48) {\n    // Allocate shared memory arrays here.\n    __shared__ scalar_t\n        s_buf0_st[(sharedKB > 48)\n                      ? (1 << 24)\n                      : (s_buf0_size +\n                         s_buf1_size)];  // Prevent launching if this isn't\n                                         // optimized away when unused.\n    s_buf0 = s_buf0_st;\n    s_buf1 = s_buf0 + s_buf0_size;\n  } else {\n    // Use the dynamically allocated shared memory array.\n    s_buf0 = (scalar_t *)s_buf_raw;\n    s_buf1 = s_buf0 + s_buf0_size;\n  }\n\n  // Pointers to the buffers.\n  scalar_t *\n      s_tileIn;  // Input tile:                      [relInX * tileInH + relInY]\n  scalar_t *s_tileUpX;   // After horizontal upsampling:     [relInY * tileUpW +\n                         // relUpX]\n  scalar_t *s_tileUpXY;  // After upsampling:                [relUpY * tileUpW +\n                         // relUpX]\n  scalar_t *s_tileDownX;  // After horizontal downsampling:   [relUpY * tileOutW\n                          // + relOutX]\n  if (filterMode == MODE_SUSD) {\n    s_tileIn = s_buf0;\n    s_tileUpX = s_buf1;\n    s_tileUpXY = s_buf0;\n    s_tileDownX = s_buf1;\n  } else if (filterMode == MODE_FUSD) {\n    s_tileIn = s_buf0;\n    s_tileUpXY = s_buf1;\n    s_tileDownX = s_buf0;\n  } else if (filterMode == MODE_SUFD) {\n    s_tileIn = s_buf0;\n    s_tileUpX = s_buf1;\n    s_tileUpXY = s_buf0;\n  } else if (filterMode == MODE_FUFD) {\n    s_tileIn = s_buf0;\n    
s_tileUpXY = s_buf1;\n  }\n\n  // Allow large grids in z direction via per-launch offset.\n  int channelIdx = blockIdx.z + p.blockZofs;\n  int batchIdx = channelIdx / p.yShape.z;\n  channelIdx -= batchIdx * p.yShape.z;\n\n  // Offset to output feature map. In bytes.\n  index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) +\n                      batchIdx * get_stride<index_t>(p.yStride.w);\n\n  // Sign shift amount.\n  uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;\n\n// Inner tile loop.\n#pragma unroll 1\n  for (int tileIdx = 0;\n       !enableXrep ||\n       (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y));\n       tileIdx++) {\n    // Locate output tile.\n    int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;\n    int tileOutX = tileX * tileOutW;\n    int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;\n\n    // Locate input tile.\n    int tmpX = tileOutX * down - p.pad0.x;\n    int tmpY = tileOutY * down - p.pad0.y;\n    int tileInX = CEIL_DIV(tmpX, up);\n    int tileInY = CEIL_DIV(tmpY, up);\n    const int phaseInX = tileInX * up - tmpX;\n    const int phaseInY = tileInY * up - tmpY;\n\n    // Extra sync if input and output buffers are the same and we are not on\n    // first tile.\n    if (enableXrep && tileIdx > 0 &&\n        (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) ||\n         (filterMode == MODE_FUFD && downInline)))\n      __syncthreads();\n\n    // Load input tile & apply bias. 
Unrolled.\n    scalar_t b =\n        (scalar_t) * (const T *)((const char *)p.b +\n                                 (channelIdx * get_stride<index_t>(p.bStride)));\n    index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) +\n                       batchIdx * get_stride<index_t>(p.xStride.w);\n    int idx = threadIdx.x;\n    const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);\n#pragma unroll\n    for (int loop = 0; loop < loopCountIN; loop++) {\n      int relInX, relInY;\n      fast_div_mod<tileInW>(relInX, relInY, idx);\n      int inX = tileInX + relInX;\n      int inY = tileInY + relInY;\n      scalar_t v = 0;\n\n      if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)\n        v = (scalar_t) * ((const T *)((const char *)p.x +\n                                      (inX * get_stride<index_t>(p.xStride.x) +\n                                       inY * get_stride<index_t>(p.xStride.y) +\n                                       mapOfsIn))) +\n            b;\n\n      bool skip = (loop == loopCountIN - 1) && (idx >= tileInW * tileInH);\n      if (!skip) s_tileIn[idx] = v;\n\n      idx += threadsPerBlock;\n    }\n\n    if (filterMode == MODE_SUSD ||\n        filterMode == MODE_SUFD)  // Separable upsampling filter.\n    {\n      // Horizontal upsampling.\n      __syncthreads();\n      if (up == 4) {\n        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;\n             idx += blockDim.x * up) {\n          int relUpX0, relInY;\n          fast_div_mod<tileUpW>(relUpX0, relInY, idx);\n          int relInX0 = relUpX0 / up;\n          int src0 = relInX0 + tileInW * relInY;\n          int dst = relInY * tileUpW + relUpX0;\n          vec4_t v = InternalType<T>::zero_vec4();\n          scalar_t a = s_tileIn[src0];\n          if (phaseInX == 0) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n    
          v.y += a * (scalar_t)c_fu[step * up + 3];\n              v.z += a * (scalar_t)c_fu[step * up + 2];\n              v.w += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else if (phaseInX == 1) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n              v.z += a * (scalar_t)c_fu[step * up + 3];\n              v.w += a * (scalar_t)c_fu[step * up + 2];\n            }\n          } else if (phaseInX == 2) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 2];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n              v.z += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n              v.w += a * (scalar_t)c_fu[step * up + 3];\n            }\n          } else  // (phaseInX == 3)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 3];\n              v.y += a * (scalar_t)c_fu[step * up + 2];\n              v.z += a * (scalar_t)c_fu[step * up + 1];\n              v.w += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n            }\n          }\n          s_tileUpX[dst + 0] = v.x;\n          s_tileUpX[dst + 1] = v.y;\n          s_tileUpX[dst + 2] = v.z;\n          s_tileUpX[dst + 3] = v.w;\n        }\n      } else if (up == 2) {\n        bool p0 = (phaseInX == 0);\n        for (int idx = threadIdx.x * up; idx < tileUpW * tileInH;\n             idx += blockDim.x * up) {\n          int relUpX0, relInY;\n          fast_div_mod<tileUpW>(relUpX0, relInY, idx);\n          int relInX0 = relUpX0 / up;\n          int src0 = relInX0 + tileInW * relInY;\n          int dst = relInY * tileUpW + relUpX0;\n          vec2_t 
v = InternalType<T>::zero_vec2();\n          scalar_t a = s_tileIn[src0];\n          if (p0)  // (phaseInX == 0)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else  // (phaseInX == 1)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileIn[src0 + step + 1];\n            }\n          }\n          s_tileUpX[dst + 0] = v.x;\n          s_tileUpX[dst + 1] = v.y;\n        }\n      }\n\n      // Vertical upsampling & nonlinearity.\n\n      __syncthreads();\n      int groupMask = 15 << ((threadIdx.x & 31) & ~3);\n      int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH\n                          : 0;  // Skip already written signs.\n      int sShapeMaxY =\n          MIN(p.sShape.y,\n              tileOutY * down + tileUpH);  // Avoid out-of-tile sign writes.\n      if (up == 4) {\n        minY -= 3;  // Adjust according to block height.\n        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;\n             idx += blockDim.x) {\n          int relUpX, relInY0;\n          fast_div_mod<tileUpW>(relUpX, relInY0, idx);\n          int relUpY0 = relInY0 * up;\n          int src0 = relInY0 * tileUpW + relUpX;\n          int dst = relUpY0 * tileUpW + relUpX;\n          vec4_t v = InternalType<T>::zero_vec4();\n\n          scalar_t a = s_tileUpX[src0];\n          if (phaseInY == 0) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.y += a * (scalar_t)c_fu[step * up + 3];\n              v.z += a * 
(scalar_t)c_fu[step * up + 2];\n              v.w += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else if (phaseInY == 1) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.z += a * (scalar_t)c_fu[step * up + 3];\n              v.w += a * (scalar_t)c_fu[step * up + 2];\n            }\n          } else if (phaseInY == 2) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 2];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n              v.z += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.w += a * (scalar_t)c_fu[step * up + 3];\n            }\n          } else  // (phaseInY == 3)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 3];\n              v.y += a * (scalar_t)c_fu[step * up + 2];\n              v.z += a * (scalar_t)c_fu[step * up + 1];\n              v.w += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n            }\n          }\n\n          int x = tileOutX * down + relUpX;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n          int signXb = signX >> 2;\n          index_t si0 =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n          index_t si1 = si0 + p.sShape.x;\n          index_t si2 = si0 + p.sShape.x * 2;\n          index_t si3 = si0 + p.sShape.x * 3;\n\n          v.x *= (scalar_t)((float)up * (float)up * p.gain);\n          v.y *= (scalar_t)((float)up * (float)up * 
p.gain);\n          v.z *= (scalar_t)((float)up * (float)up * p.gain);\n          v.w *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write signs.\n              int sx = __float_as_uint(v.x) >> 31 << 0;\n              int sy = __float_as_uint(v.y) >> 31 << 8;\n              int sz = __float_as_uint(v.z) >> 31 << 16;\n              int sw = __float_as_uint(v.w) >> 31 << 24;\n              if (sx) v.x *= p.slope;\n              if (sy) v.y *= p.slope;\n              if (sz) v.z *= p.slope;\n              if (sw) v.w *= p.slope;\n              if (fabsf(v.x) > p.clamp) {\n                sx = 2 << 0;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n              }\n              if (fabsf(v.y) > p.clamp) {\n                sy = 2 << 8;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n              if (fabsf(v.z) > p.clamp) {\n                sz = 2 << 16;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n              }\n              if (fabsf(v.w) > p.clamp) {\n                sw = 2 << 24;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                // Combine signs.\n                uint32_t s = sx + sy + sw + sz;\n                s <<= (signX & 3) << 1;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n                if ((uint32_t)(signY 
+ 2) < sShapeMaxY) {\n                  p.s[si2] = (unsigned char)(s >> 16);\n                }\n                if ((uint32_t)(signY + 3) < sShapeMaxY) {\n                  p.s[si3] = (unsigned char)(s >> 24);\n                }\n              }\n            } else {\n              // Determine and write signs.\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                int sx = __float_as_uint(v.x) >> 31 << 0;\n                int sy = __float_as_uint(v.y) >> 31 << 8;\n                int sz = __float_as_uint(v.z) >> 31 << 16;\n                int sw = __float_as_uint(v.w) >> 31 << 24;\n                if (sx) v.x *= p.slope;\n                if (sy) v.y *= p.slope;\n                if (sz) v.z *= p.slope;\n                if (sw) v.w *= p.slope;\n                if (fabsf(v.x) > p.clamp) {\n                  sx = 2 << 0;\n                  v.x = InternalType<T>::clamp(v.x, p.clamp);\n                }\n                if (fabsf(v.y) > p.clamp) {\n                  sy = 2 << 8;\n                  v.y = InternalType<T>::clamp(v.y, p.clamp);\n                }\n                if (fabsf(v.z) > p.clamp) {\n                  sz = 2 << 16;\n                  v.z = InternalType<T>::clamp(v.z, p.clamp);\n                }\n                if (fabsf(v.w) > p.clamp) {\n                  sw = 2 << 24;\n                  v.w = InternalType<T>::clamp(v.w, p.clamp);\n                }\n\n                // Combine signs.\n                uint32_t s = sx + sy + sw + sz;\n                s <<= (signX & 3) << 1;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < 
sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n                if ((uint32_t)(signY + 2) < sShapeMaxY) {\n                  p.s[si2] = (unsigned char)(s >> 16);\n                }\n                if ((uint32_t)(signY + 3) < sShapeMaxY) {\n                  p.s[si3] = (unsigned char)(s >> 24);\n                }\n              } else {\n                // Just compute the values.\n                if (v.x < 0.f) v.x *= p.slope;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n                if (v.y < 0.f) v.y *= p.slope;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n                if (v.z < 0.f) v.z *= p.slope;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n                if (v.w < 0.f) v.w *= p.slope;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n            }\n          } else if (signRead)  // Read signs and apply.\n          {\n            if ((uint32_t)signXb < p.swLimit) {\n              int ss = (signX & 3) << 1;\n              if ((uint32_t)(signY + 0) < p.sShape.y) {\n                int s = p.s[si0] >> ss;\n                if (s & 1) v.x *= p.slope;\n                if (s & 2) v.x = 0.f;\n              }\n              if ((uint32_t)(signY + 1) < p.sShape.y) {\n                int s = p.s[si1] >> ss;\n                if (s & 1) v.y *= p.slope;\n                if (s & 2) v.y = 0.f;\n              }\n              if ((uint32_t)(signY + 2) < p.sShape.y) {\n                int s = p.s[si2] >> ss;\n                if (s & 1) v.z *= p.slope;\n                if (s & 2) v.z = 0.f;\n              }\n              if ((uint32_t)(signY + 3) < p.sShape.y) {\n                int s = p.s[si3] >> ss;\n                if (s & 1) v.w *= p.slope;\n                if (s & 2) v.w = 0.f;\n              }\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v.x < 0.f) v.x *= p.slope;\n            v.x = 
InternalType<T>::clamp(v.x, p.clamp);\n            if (v.y < 0.f) v.y *= p.slope;\n            v.y = InternalType<T>::clamp(v.y, p.clamp);\n            if (v.z < 0.f) v.z *= p.slope;\n            v.z = InternalType<T>::clamp(v.z, p.clamp);\n            if (v.w < 0.f) v.w *= p.slope;\n            v.w = InternalType<T>::clamp(v.w, p.clamp);\n          }\n\n          s_tileUpXY[dst + 0 * tileUpW] = v.x;\n          if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;\n          if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;\n          if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;\n        }\n      } else if (up == 2) {\n        minY -= 1;  // Adjust according to block height.\n        for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up;\n             idx += blockDim.x) {\n          int relUpX, relInY0;\n          fast_div_mod<tileUpW>(relUpX, relInY0, idx);\n          int relUpY0 = relInY0 * up;\n          int src0 = relInY0 * tileUpW + relUpX;\n          int dst = relUpY0 * tileUpW + relUpX;\n          vec2_t v = InternalType<T>::zero_vec2();\n\n          scalar_t a = s_tileUpX[src0];\n          if (phaseInY == 0) {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n              v.y += a * (scalar_t)c_fu[step * up + 1];\n            }\n          } else  // (phaseInY == 1)\n          {\n#pragma unroll\n            for (int step = 0; step < fuSize / up; step++) {\n              v.x += a * (scalar_t)c_fu[step * up + 1];\n              v.y += a * (scalar_t)c_fu[step * up + 0];\n              a = s_tileUpX[src0 + (step + 1) * tileUpW];\n            }\n          }\n\n          int x = tileOutX * down + relUpX;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n  
        int signXb = signX >> 2;\n          index_t si0 =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n          index_t si1 = si0 + p.sShape.x;\n\n          v.x *= (scalar_t)((float)up * (float)up * p.gain);\n          v.y *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write signs.\n              int sx = __float_as_uint(v.x) >> 31 << 0;\n              int sy = __float_as_uint(v.y) >> 31 << 8;\n              if (sx) v.x *= p.slope;\n              if (sy) v.y *= p.slope;\n              if (fabsf(v.x) > p.clamp) {\n                sx = 2 << 0;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n              }\n              if (fabsf(v.y) > p.clamp) {\n                sy = 2 << 8;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                // Combine signs.\n                int s = sx + sy;\n                s <<= signXo;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n              }\n            } else {\n              // Determine and write signs.\n              if ((uint32_t)signXb < p.swLimit && signY >= minY) {\n                int sx = __float_as_uint(v.x) >> 31 << 0;\n                int sy = __float_as_uint(v.y) >> 31 << 8;\n                if (sx) v.x *= p.slope;\n                if (sy) v.y *= p.slope;\n                if (fabsf(v.x) > 
p.clamp) {\n                  sx = 2 << 0;\n                  v.x = InternalType<T>::clamp(v.x, p.clamp);\n                }\n                if (fabsf(v.y) > p.clamp) {\n                  sy = 2 << 8;\n                  v.y = InternalType<T>::clamp(v.y, p.clamp);\n                }\n\n                // Combine signs.\n                int s = sx + sy;\n                s <<= signXo;\n#ifdef MMCV_WITH_HIP\n                s |= __shfl_xor(s, 1);\n                s |= __shfl_xor(s, 2);\n#else\n                s |= __shfl_xor_sync(groupMask, s, 1);\n                s |= __shfl_xor_sync(groupMask, s, 2);\n#endif\n\n                // Write signs.\n                if ((uint32_t)(signY + 0) < sShapeMaxY) {\n                  p.s[si0] = (unsigned char)(s >> 0);\n                }\n                if ((uint32_t)(signY + 1) < sShapeMaxY) {\n                  p.s[si1] = (unsigned char)(s >> 8);\n                }\n              } else {\n                // Just compute the values.\n                if (v.x < 0.f) v.x *= p.slope;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n                if (v.y < 0.f) v.y *= p.slope;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n            }\n          } else if (signRead)  // Read signs and apply.\n          {\n            if ((uint32_t)signXb < p.swLimit) {\n              if ((uint32_t)(signY + 0) < p.sShape.y) {\n                int s = p.s[si0] >> signXo;\n                if (s & 1) v.x *= p.slope;\n                if (s & 2) v.x = 0.f;\n              }\n              if ((uint32_t)(signY + 1) < p.sShape.y) {\n                int s = p.s[si1] >> signXo;\n                if (s & 1) v.y *= p.slope;\n                if (s & 2) v.y = 0.f;\n              }\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v.x < 0.f) v.x *= p.slope;\n            v.x = InternalType<T>::clamp(v.x, p.clamp);\n            if (v.y < 0.f) v.y *= p.slope;\n            
v.y = InternalType<T>::clamp(v.y, p.clamp);\n          }\n\n          if (!downInline) {\n            // Write into temporary buffer.\n            s_tileUpXY[dst] = v.x;\n            if (relUpY0 < tileUpH - 1) s_tileUpXY[dst + tileUpW] = v.y;\n          } else {\n            // Write directly into output buffer.\n            if ((uint32_t)x < p.yShape.x) {\n              int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);\n              index_t ofs = x * get_stride<index_t>(p.yStride.x) +\n                            y * get_stride<index_t>(p.yStride.y) + mapOfsOut;\n              if ((uint32_t)y + 0 < p.yShape.y)\n                *((T *)((char *)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);\n              if ((uint32_t)y + 1 < ymax)\n                *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.y))) =\n                    (T)(v.y * (scalar_t)c_fd[0]);\n            }\n          }\n        }\n      }\n    } else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD) {\n      // Full upsampling filter.\n\n      if (up == 2) {\n        // 2 x 2-wide.\n        __syncthreads();\n        int minY = tileOutY ? 
(tileOutY - tileOutH) * down + tileUpH + p.sOfs.y\n                            : 0;  // Skip already written signs.\n        for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH;\n             idx += blockDim.x * 4) {\n          int relUpX0, relUpY0;\n          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);\n          int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);\n          int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);\n          int src0 = relInX0 + tileInW * relInY0;\n          int tap0y = (relInY0 * up + phaseInY - relUpY0);\n\n#define X_LOOP(TAPY, PX)                                             \\\n  for (int sx = 0; sx < fuSize / up; sx++) {                         \\\n    v.x += a * (scalar_t)c_fu[(sx * up + (((PX)-0) & (up - 1))) +    \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    v.z += b * (scalar_t)c_fu[(sx * up + (((PX)-0) & (up - 1))) +    \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    if ((PX) == 0) {                                                 \\\n      a = b;                                                         \\\n      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \\\n    }                                                                \\\n    v.y += a * (scalar_t)c_fu[(sx * up + (((PX)-1) & (up - 1))) +    \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    v.w += b * (scalar_t)c_fu[(sx * up + (((PX)-1) & (up - 1))) +    \\\n                              (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \\\n    if ((PX) == 1) {                                                 \\\n      a = b;                                                         \\\n      b = s_tileIn[src0 + 2 + sx + sy * tileInW];                    \\\n    }                                                                \\\n  }\n\n          vec4_t v = InternalType<T>::zero_vec4();\n          if (tap0y == 0 && phaseInX == 0)\n#pragma unroll\n            for (int sy 
= 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(0, 0)\n            }\n          if (tap0y == 0 && phaseInX == 1)\n#pragma unroll\n            for (int sy = 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(0, 1)\n            }\n          if (tap0y == 1 && phaseInX == 0)\n#pragma unroll\n            for (int sy = 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(1, 0)\n            }\n          if (tap0y == 1 && phaseInX == 1)\n#pragma unroll\n            for (int sy = 0; sy < fuSize / up; sy++) {\n              scalar_t a = s_tileIn[src0 + sy * tileInW];\n              scalar_t b = s_tileIn[src0 + sy * tileInW + 1];\n#pragma unroll\n              X_LOOP(1, 1)\n            }\n\n#undef X_LOOP\n\n          int x = tileOutX * down + relUpX0;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n          int signXb = signX >> 2;\n          index_t si =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n\n          v.x *= (scalar_t)((float)up * (float)up * p.gain);\n          v.y *= (scalar_t)((float)up * (float)up * p.gain);\n          v.z *= (scalar_t)((float)up * (float)up * p.gain);\n          v.w *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write signs.\n              int sx = __float_as_uint(v.x) >> 31;\n              int sy = __float_as_uint(v.y) >> 31;\n              int sz = __float_as_uint(v.z) >> 31;\n        
      int sw = __float_as_uint(v.w) >> 31;\n              if (sx) v.x *= p.slope;\n              if (fabsf(v.x) > p.clamp) {\n                sx = 2;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n              }\n              if (sy) v.y *= p.slope;\n              if (fabsf(v.y) > p.clamp) {\n                sy = 2;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n              }\n              if (sz) v.z *= p.slope;\n              if (fabsf(v.z) > p.clamp) {\n                sz = 2;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n              }\n              if (sw) v.w *= p.slope;\n              if (fabsf(v.w) > p.clamp) {\n                sw = 2;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);\n              }\n            } else {\n              // Determine and write signs.\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n                int sx = __float_as_uint(v.x) >> 31;\n                int sy = __float_as_uint(v.y) >> 31;\n                int sz = __float_as_uint(v.z) >> 31;\n                int sw = __float_as_uint(v.w) >> 31;\n                if (sx) v.x *= p.slope;\n                if (fabsf(v.x) > p.clamp) {\n                  sx = 2;\n                  v.x = InternalType<T>::clamp(v.x, p.clamp);\n                }\n                if (sy) v.y *= p.slope;\n                if (fabsf(v.y) > p.clamp) {\n                  sy = 2;\n                  v.y = InternalType<T>::clamp(v.y, p.clamp);\n                }\n                if (sz) v.z *= p.slope;\n                if (fabsf(v.z) > p.clamp) {\n                  sz = 2;\n                  v.z = InternalType<T>::clamp(v.z, p.clamp);\n                }\n                if (sw) 
v.w *= p.slope;\n                if (fabsf(v.w) > p.clamp) {\n                  sw = 2;\n                  v.w = InternalType<T>::clamp(v.w, p.clamp);\n                }\n\n                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);\n              } else {\n                // Just compute the values.\n                if (v.x < 0.f) v.x *= p.slope;\n                v.x = InternalType<T>::clamp(v.x, p.clamp);\n                if (v.y < 0.f) v.y *= p.slope;\n                v.y = InternalType<T>::clamp(v.y, p.clamp);\n                if (v.z < 0.f) v.z *= p.slope;\n                v.z = InternalType<T>::clamp(v.z, p.clamp);\n                if (v.w < 0.f) v.w *= p.slope;\n                v.w = InternalType<T>::clamp(v.w, p.clamp);\n              }\n            }\n          } else if (signRead)  // Read sign and apply.\n          {\n            if ((uint32_t)signY < p.sShape.y) {\n              int s = 0;\n              if ((uint32_t)signXb < p.swLimit) s = p.s[si];\n              if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;\n              s >>= (signX & 3) << 1;\n              if (s & 0x01) v.x *= p.slope;\n              if (s & 0x02) v.x = 0.f;\n              if (s & 0x04) v.y *= p.slope;\n              if (s & 0x08) v.y = 0.f;\n              if (s & 0x10) v.z *= p.slope;\n              if (s & 0x20) v.z = 0.f;\n              if (s & 0x40) v.w *= p.slope;\n              if (s & 0x80) v.w = 0.f;\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v.x < 0.f) v.x *= p.slope;\n            v.x = InternalType<T>::clamp(v.x, p.clamp);\n            if (v.y < 0.f) v.y *= p.slope;\n            v.y = InternalType<T>::clamp(v.y, p.clamp);\n            if (v.z < 0.f) v.z *= p.slope;\n            v.z = InternalType<T>::clamp(v.z, p.clamp);\n            if (v.w < 0.f) v.w *= p.slope;\n            v.w = InternalType<T>::clamp(v.w, p.clamp);\n          }\n\n          s_tileUpXY[idx + 0] = v.x;\n          
s_tileUpXY[idx + 1] = v.y;\n          s_tileUpXY[idx + 2] = v.z;\n          s_tileUpXY[idx + 3] = v.w;\n        }\n      } else if (up == 1) {\n        __syncthreads();\n        uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);\n        int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH\n                            : 0;  // Skip already written signs.\n        for (int idx = threadIdx.x; idx < tileUpW * tileUpH;\n             idx += blockDim.x) {\n          int relUpX0, relUpY0;\n          fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);\n          scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0];  // 1x1 filter.\n\n          int x = tileOutX * down + relUpX0;\n          int y = tileOutY * down + relUpY0;\n          int signX = x + p.sOfs.x;\n          int signY = y + p.sOfs.y;\n          int signZ = blockIdx.z + p.blockZofs;\n          int signXb = signX >> 2;\n          index_t si =\n              signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);\n          v *= (scalar_t)((float)up * (float)up * p.gain);\n\n          if (signWrite) {\n            if (!enableWriteSkip) {\n              // Determine and write sign.\n              uint32_t s = 0;\n              uint32_t signXbit = (1u << signXo);\n              if (v < 0.f) {\n                s = signXbit;\n                v *= p.slope;\n              }\n              if (fabsf(v) > p.clamp) {\n                s = signXbit * 2;\n                v = InternalType<T>::clamp(v, p.clamp);\n              }\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n#ifdef MMCV_WITH_HIP\n                s += __shfl_xor(s, 1);  // Coalesce.\n                s += __shfl_xor(s, 2);  // Coalesce.\n#else\n                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.\n                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.\n#endif\n                p.s[si] = s;  // Write.\n              }\n            } else {\n              // 
Determine and write sign.\n              if ((uint32_t)signXb < p.swLimit &&\n                  (uint32_t)signY < p.sShape.y && signY >= minY) {\n                uint32_t s = 0;\n                uint32_t signXbit = (1u << signXo);\n                if (v < 0.f) {\n                  s = signXbit;\n                  v *= p.slope;\n                }\n                if (fabsf(v) > p.clamp) {\n                  s = signXbit * 2;\n                  v = InternalType<T>::clamp(v, p.clamp);\n                }\n#ifdef MMCV_WITH_HIP\n                s += __shfl_xor(s, 1);  // Coalesce.\n                s += __shfl_xor(s, 2);  // Coalesce.\n#else\n                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.\n                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.\n#endif\n                p.s[si] = s;  // Write.\n              } else {\n                // Just compute the value.\n                if (v < 0.f) v *= p.slope;\n                v = InternalType<T>::clamp(v, p.clamp);\n              }\n            }\n          } else if (signRead) {\n            // Read sign and apply if within sign tensor bounds.\n            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y) {\n              int s = p.s[si];\n              s >>= signXo;\n              if (s & 1) v *= p.slope;\n              if (s & 2) v = 0.f;\n            }\n          } else  // Forward pass with no sign write.\n          {\n            if (v < 0.f) v *= p.slope;\n            v = InternalType<T>::clamp(v, p.clamp);\n          }\n\n          if (!downInline)  // Write into temporary buffer.\n            s_tileUpXY[idx] = v;\n          else if ((uint32_t)x < p.yShape.x &&\n                   (uint32_t)y <\n                       p.yShape.y)  // Write directly into output buffer\n            *((T *)((char *)p.y + (x * get_stride<index_t>(p.yStride.x) +\n                                   y * get_stride<index_t>(p.yStride.y) +\n                                   mapOfsOut))) = (T)(v 
* (scalar_t)c_fd[0]);\n        }\n      }\n    }\n\n    // Downsampling.\n    if (filterMode == MODE_SUSD || filterMode == MODE_FUSD) {\n      // Horizontal downsampling.\n      __syncthreads();\n      if (down == 4 && tileOutW % 4 == 0) {\n        // Calculate 4 pixels at a time.\n        for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH;\n             idx += blockDim.x * 4) {\n          int relOutX0, relUpY;\n          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);\n          int relUpX0 = relOutX0 * down;\n          int src0 = relUpY * tileUpW + relUpX0;\n          vec4_t v = InternalType<T>::zero_vec4();\n#pragma unroll\n          for (int step = 0; step < fdSize; step++) {\n            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];\n            v.y += s_tileUpXY[src0 + 4 + step] * (scalar_t)c_fd[step];\n            v.z += s_tileUpXY[src0 + 8 + step] * (scalar_t)c_fd[step];\n            v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];\n          }\n          s_tileDownX[idx + 0] = v.x;\n          s_tileDownX[idx + 1] = v.y;\n          s_tileDownX[idx + 2] = v.z;\n          s_tileDownX[idx + 3] = v.w;\n        }\n      } else if ((down == 2 || down == 4) && (tileOutW % 2 == 0)) {\n        // Calculate 2 pixels at a time.\n        for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH;\n             idx += blockDim.x * 2) {\n          int relOutX0, relUpY;\n          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);\n          int relUpX0 = relOutX0 * down;\n          int src0 = relUpY * tileUpW + relUpX0;\n          vec2_t v = InternalType<T>::zero_vec2();\n#pragma unroll\n          for (int step = 0; step < fdSize; step++) {\n            v.x += s_tileUpXY[src0 + 0 + step] * (scalar_t)c_fd[step];\n            v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];\n          }\n          s_tileDownX[idx + 0] = v.x;\n          s_tileDownX[idx + 1] = v.y;\n        }\n      } else {\n        // Calculate 1 pixel at a time.\n       
 for (int idx = threadIdx.x; idx < tileOutW * tileUpH;\n             idx += blockDim.x) {\n          int relOutX0, relUpY;\n          fast_div_mod<tileOutW>(relOutX0, relUpY, idx);\n          int relUpX0 = relOutX0 * down;\n          int src = relUpY * tileUpW + relUpX0;\n          scalar_t v = 0.f;\n#pragma unroll\n          for (int step = 0; step < fdSize; step++)\n            v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];\n          s_tileDownX[idx] = v;\n        }\n      }\n\n      // Vertical downsampling & store output tile.\n      __syncthreads();\n      for (int idx = threadIdx.x; idx < tileOutW * tileOutH;\n           idx += blockDim.x) {\n        int relOutX, relOutY0;\n        fast_div_mod<tileOutW>(relOutX, relOutY0, idx);\n        int relUpY0 = relOutY0 * down;\n        int src0 = relUpY0 * tileOutW + relOutX;\n        scalar_t v = 0;\n#pragma unroll\n        for (int step = 0; step < fdSize; step++)\n          v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];\n\n        int outX = tileOutX + relOutX;\n        int outY = tileOutY + relOutY0;\n\n        if (outX < p.yShape.x & outY < p.yShape.y)\n          *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +\n                                 outY * get_stride<index_t>(p.yStride.y) +\n                                 mapOfsOut))) = (T)v;\n      }\n    } else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD) {\n      // Full downsampling filter.\n      if (down == 2) {\n        // 2-wide.\n        __syncthreads();\n        for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH;\n             idx += blockDim.x * 2) {\n          int relOutX0, relOutY0;\n          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);\n          int relUpX0 = relOutX0 * down;\n          int relUpY0 = relOutY0 * down;\n          int src0 = relUpY0 * tileUpW + relUpX0;\n          vec2_t v = InternalType<T>::zero_vec2();\n#pragma unroll\n          for (int sy = 0; sy < fdSize; sy++)\n#pragma 
unroll\n            for (int sx = 0; sx < fdSize; sx++) {\n              v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] *\n                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];\n              v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] *\n                     (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];\n            }\n\n          int outX = tileOutX + relOutX0;\n          int outY = tileOutY + relOutY0;\n          if ((uint32_t)outY < p.yShape.y) {\n            index_t ofs = outX * get_stride<index_t>(p.yStride.x) +\n                          outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;\n            if (outX + 0 < p.yShape.x) *((T *)((char *)p.y + ofs)) = (T)v.x;\n            if (outX + 1 < p.yShape.x)\n              *((T *)((char *)p.y + ofs + get_stride<index_t>(p.yStride.x))) =\n                  (T)v.y;\n          }\n        }\n      } else if (down == 1 && !downInline) {\n        // Thread per pixel.\n        __syncthreads();\n        for (int idx = threadIdx.x; idx < tileOutW * tileOutH;\n             idx += blockDim.x) {\n          int relOutX0, relOutY0;\n          fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);\n          scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0];  // 1x1 filter.\n\n          int outX = tileOutX + relOutX0;\n          int outY = tileOutY + relOutY0;\n          if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)\n            *((T *)((char *)p.y + (outX * get_stride<index_t>(p.yStride.x) +\n                                   outY * get_stride<index_t>(p.yStride.y) +\n                                   mapOfsOut))) = (T)v;\n        }\n      }\n    }\n\n    if (!enableXrep) break;\n  }\n}\n\n//------------------------------------------------------------------------\n// Compute activation function and signs for upsampled data tensor, modifying\n// data tensor in-place. Used for accelerating the generic variant. 
Sign tensor\n// is known to be contiguous, and p.x and p.s have the same z, w dimensions.\n// 64-bit indexing is always used.\n\ntemplate <class T, bool signWrite, bool signRead>\nstatic __global__ void filtered_lrelu_act_kernel(\n    filtered_lrelu_act_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n\n  // Indexing.\n  int32_t x = threadIdx.x + blockIdx.x * blockDim.x;\n  int32_t ymax = signWrite ? p.sShape.y : p.xShape.y;\n  int32_t qmax =\n      p.xShape.z * p.xShape.w;  // Combined minibatch*channel maximum index.\n\n  // Loop to accommodate oversized tensors.\n  for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z)\n    for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y) {\n      // Extract z and w (channel, minibatch index).\n      int32_t w = q / p.xShape.z;\n      int32_t z = q - w * p.xShape.z;\n\n      // Choose behavior based on sign read/write mode.\n      if (signWrite) {\n        // Process value if in p.x.\n        uint32_t s = 0;\n        if (x < p.xShape.x && y < p.xShape.y) {\n          int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z +\n                       w * p.xStride.w;\n          T *pv = ((T *)p.x) + ix;\n          scalar_t v = (scalar_t)(*pv);\n\n          // Gain, LReLU, clamp.\n          v *= p.gain;\n          if (v < 0.f) {\n            v *= p.slope;\n            s = 1;  // Sign.\n          }\n          if (fabsf(v) > p.clamp) {\n            v = InternalType<T>::clamp(v, p.clamp);\n            s = 2;  // Clamp.\n          }\n\n          *pv = (T)v;  // Write value.\n        }\n\n        // Coalesce into threads 0 and 16 of warp.\n        uint32_t m = (threadIdx.x & 16) ? 
0xffff0000u : 0x0000ffffu;\n        s <<= ((threadIdx.x & 15) << 1);  // Shift into place.\n#ifdef MMCV_WITH_HIP\n        s |= __shfl_xor(s, 1);  // Distribute.\n        s |= __shfl_xor(s, 2);\n        s |= __shfl_xor(s, 4);\n        s |= __shfl_xor(s, 8);\n#else\n        s |= __shfl_xor_sync(m, s, 1);                  // Distribute.\n        s |= __shfl_xor_sync(m, s, 2);\n        s |= __shfl_xor_sync(m, s, 4);\n        s |= __shfl_xor_sync(m, s, 8);\n#endif\n\n        // Write signs if leader and in p.s.\n        if (!(threadIdx.x & 15) && x < p.sShape.x)  // y is always in.\n        {\n          uint64_t is =\n              x + p.sShape.x * (y + (int64_t)p.sShape.y * q);  // Contiguous.\n          ((uint32_t *)p.s)[is >> 4] = s;\n        }\n      } else if (signRead) {\n        // Process value if in p.x.\n        if (x < p.xShape.x)  // y is always in.\n        {\n          int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z +\n                       w * p.xStride.w;\n          T *pv = ((T *)p.x) + ix;\n          scalar_t v = (scalar_t)(*pv);\n          v *= p.gain;\n\n          // Apply sign buffer offset.\n          uint32_t sx = x + p.sOfs.x;\n          uint32_t sy = y + p.sOfs.y;\n\n          // Read and apply signs if we land inside valid region of sign buffer.\n          if (sx < p.sShape.x && sy < p.sShape.y) {\n            uint64_t is =\n                (sx >> 2) + (p.sShape.x >> 2) *\n                                (sy + (uint64_t)p.sShape.y * q);  // Contiguous.\n            unsigned char s = p.s[is];\n            s >>= (sx & 3) << 1;  // Shift into place.\n            if (s & 1)            // Sign?\n              v *= p.slope;\n            if (s & 2)  // Clamp?\n              v = 0.f;\n          }\n\n          *pv = (T)v;  // Write value.\n        }\n      } else {\n        // Forward pass with no sign write. 
Process value if in p.x.\n        if (x < p.xShape.x)  // y is always in.\n        {\n          int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z +\n                       w * p.xStride.w;\n          T *pv = ((T *)p.x) + ix;\n          scalar_t v = (scalar_t)(*pv);\n          v *= p.gain;\n          if (v < 0.f) v *= p.slope;\n          if (fabsf(v) > p.clamp) v = InternalType<T>::clamp(v, p.clamp);\n          *pv = (T)v;  // Write value.\n        }\n      }\n    }\n}\n\ntemplate <class T, bool signWrite, bool signRead>\nvoid *choose_filtered_lrelu_act_kernel(void) {\n  return (void *)filtered_lrelu_act_kernel<T, signWrite, signRead>;\n}\n\n//------------------------------------------------------------------------\n// MUSA kernel selection.\n\ntemplate <class T, class index_t, bool signWrite, bool signRead>\nfiltered_lrelu_kernel_spec choose_filtered_lrelu_kernel(\n    const filtered_lrelu_kernel_params &p, int sharedKB) {\n  filtered_lrelu_kernel_spec s = {0};\n\n  // Return the first matching kernel.\n#define CASE(SH, U, FU, D, FD, MODE, TW, TH, W, XR, WS)                        \\\n  if (sharedKB >= SH)                                                          \\\n    if ((p.fuShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_SUFD)) ||      \\\n        (p.fuShape.y > 0 && (MODE == MODE_FUSD || MODE == MODE_FUFD)))         \\\n      if ((p.fdShape.y == 0 && (MODE == MODE_SUSD || MODE == MODE_FUSD)) ||    \\\n          (p.fdShape.y > 0 && (MODE == MODE_SUFD || MODE == MODE_FUFD)))       \\\n        if (p.up == U && p.fuShape.x <= FU && p.fuShape.y <= FU &&             \\\n            p.down == D && p.fdShape.x <= FD && p.fdShape.y <= FD) {           \\\n          static_assert((D * TW % 4) == 0,                                     \\\n                        \"down * tileWidth must be divisible by 4\");            \\\n          static_assert(                                                       \\\n              FU % U == 0,                         
                            \\\n              \"upscaling filter size must be multiple of upscaling factor\");   \\\n          static_assert(FD % D == 0,                                           \\\n                        \"downscaling filter size must be multiple of \"         \\\n                        \"downscaling factor\");                                 \\\n          s.setup = (void *)setup_filters_kernel;                              \\\n          s.exec = (void *)                                                    \\\n              filtered_lrelu_kernel<T, index_t, SH, signWrite, signRead, MODE, \\\n                                    U, FU, D, FD, TW, TH, W * 32, !!XR, !!WS>; \\\n          s.tileOut = make_int2(TW, TH);                                       \\\n          s.numWarps = W;                                                      \\\n          s.xrep = XR;                                                         \\\n          s.dynamicSharedKB = (SH == 48) ? 0 : SH;                             \\\n          return s;                                                            \\\n        }\n\n  // Launch parameters for various kernel specializations.\n  // Small filters must be listed before large filters, otherwise the kernel for\n  // larger filter will always match first. 
Kernels that use more shared memory\n  // must be listed before those that use less, for the same reason.\n\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 1, 1, /*mode*/ MODE_FUFD,\n       /*tw,th,warps,xrep,wskip*/ 64, 178, 32, 0, 0)  // 1t-upf1-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 152, 95, 16, 0, 0)  // 4t-ups2-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 22, 16, 0, 0)  // 4t-upf1-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 29, 16, 11, 0)  // 4t-ups2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 60, 28, 16, 0, 0)  // 4t-upf2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 56, 28, 16, 0, 0)  // 4t-ups2-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 31, 16, 11, 0)  // 4t-ups4-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 16, /*down,fd*/ 2, 8, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 56, 36, 16, 0, 0)  // 4t-ups4-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 16, 22, 16, 12, 0)  // 4t-ups2-downs4\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 8, /*down,fd*/ 4, 16, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 29, 15, 16, 0, 0)  // 4t-upf2-downs4\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 96, 150, 28, 0, 0)  // 6t-ups2-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 35, 24, 0, 0)  // 6t-upf1-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, 
/*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 46, 16, 10, 0)  // 6t-ups2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 58, 28, 24, 8, 0)  // 6t-upf2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 52, 28, 16, 0, 0)  // 6t-ups2-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 51, 16, 5, 0)  // 6t-ups4-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 24, /*down,fd*/ 2, 12, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 32, 56, 16, 6, 0)  // 6t-ups4-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 16, 18, 16, 12, 0)  // 6t-ups2-downs4\n  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 27, 31, 32, 6, 0)  // 6t-upf2-downs4 96kB\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 12, /*down,fd*/ 4, 24, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 27, 13, 24, 0, 0)  // 6t-upf2-downs4\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 1, 1, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 148, 89, 24, 0, 0)  // 8t-ups2-downf1\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 1, 1, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 31, 16, 5, 0)  // 8t-upf1-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 41, 16, 9, 0)  // 8t-ups2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 56, 26, 24, 0, 0)  // 8t-upf2-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 32, 40, 16, 0, 0)  // 8t-ups2-downf2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, 
/*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 32, 46, 24, 5, 0)  // 8t-ups4-downs2\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 4, 32, /*down,fd*/ 2, 16, /*mode*/ MODE_SUFD,\n       /*tw,th,warps,xrep,wskip*/ 32, 50, 16, 0, 0)  // 8t-ups4-downf2\n  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 24, 24, 32, 12, 1)  // 8t-ups2-downs4 96kB\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_SUSD,\n       /*tw,th,warps,xrep,wskip*/ 16, 13, 16, 10, 1)  // 8t-ups2-downs4\n  CASE(/*sharedKB*/ 96, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 25, 28, 28, 4, 0)  // 8t-upf2-downs4 96kB\n  CASE(/*sharedKB*/ 48, /*up,fu*/ 2, 16, /*down,fd*/ 4, 32, /*mode*/ MODE_FUSD,\n       /*tw,th,warps,xrep,wskip*/ 25, 10, 24, 0, 0)  // 8t-upf2-downs4\n\n#undef CASE\n  return s;  // No kernel found.\n}\n\n//------------------------------------------------------------------------\n\n#define BUILD_FILTERED_LRELU_OP 1\n\n#ifndef MMCV_WITH_HIP\n#ifdef __GNUC__\n#if __GNUC__ < 6\n#undef BUILD_FILTERED_LRELU_OP\n#define BUILD_FILTERED_LRELU_OP 0\n#endif\n#endif\n\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns) {\n  // Set MUSA device.\n  TORCH_CHECK(x.is_privateuseone(), \"x must reside on MUSA device\");\n  const at::musa::OptionalMUSAGuard device_guard(device_of(x));\n\n  // Validate arguments.\n  TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() &&\n                  b.device() == x.device(),\n              \"all input tensors must reside on the same device\");\n  TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat,\n              \"fu and fd must be 
float32\");\n  TORCH_CHECK(b.dtype() == x.dtype(), \"x and b must have the same dtype\");\n  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat,\n              \"x and b must be float16 or float32\");\n  TORCH_CHECK(x.dim() == 4, \"x must be rank 4\");\n  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&\n                  x.size(3) <= INT_MAX,\n              \"x is too large\");\n  TORCH_CHECK(x.numel() > 0, \"x is empty\");\n  TORCH_CHECK(\n      (fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2),\n      \"fu and fd must be rank 1 or 2\");\n  TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX,\n              \"fu is too large\");\n  TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX,\n              \"fd is too large\");\n  TORCH_CHECK(fu.numel() > 0, \"fu is empty\");\n  TORCH_CHECK(fd.numel() > 0, \"fd is empty\");\n  TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1),\n              \"b must be a vector with the same number of channels as x\");\n  TORCH_CHECK(up >= 1 && down >= 1, \"up and down must be at least 1\");\n\n  // Figure out how much shared memory is available on the device.\n  int maxSharedBytes = 0;\n#ifdef MMCV_WITH_HIP\n  musaDeviceGetAttribute(&maxSharedBytes,\n                         hipDeviceAttributeSharedMemPerBlockOptin,\n                         x.device().index());\n#else\n  AT_MUSA_CHECK(musaDeviceGetAttribute(&maxSharedBytes,\n                                       musaDevAttrMaxSharedMemoryPerBlockOptin,\n                                       x.device().index()));\n#endif\n  int sharedKB = maxSharedBytes >> 10;\n\n  // Populate enough launch parameters to check if a MUSA kernel exists.\n  filtered_lrelu_kernel_params p;\n  p.up = up;\n  p.down = down;\n  p.fuShape =\n      make_int2((int)fu.size(-1),\n                fu.dim() == 2 ? 
(int)fu.size(0)\n                              : 0);  // shape [n, 0] indicates separable filter.\n  p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);\n  filtered_lrelu_kernel_spec test_spec =\n      choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);\n  if (!test_spec.exec) {\n    // No kernel found - return empty tensors and indicate missing kernel with\n    // return code of -1.\n    return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);\n  }\n\n  // Input/output element size.\n  int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;\n\n  // Input sizes.\n  int64_t xw = (int)x.size(3);\n  int64_t xh = (int)x.size(2);\n  int64_t fut_w = (int)fu.size(-1) - 1;\n  int64_t fut_h = (int)fu.size(0) - 1;\n  int64_t fdt_w = (int)fd.size(-1) - 1;\n  int64_t fdt_h = (int)fd.size(0) - 1;\n\n  // Logical size of upsampled buffer.\n  int64_t cw = xw * up + (px0 + px1) - fut_w;\n  int64_t ch = xh * up + (py0 + py1) - fut_h;\n  TORCH_CHECK(\n      cw > fdt_w && ch > fdt_h,\n      \"upsampled buffer must be at least the size of downsampling filter\");\n  TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, \"upsampled buffer is too large\");\n\n  // Compute output size and allocate.\n  int64_t yw = (cw - fdt_w + (down - 1)) / down;\n  int64_t yh = (ch - fdt_h + (down - 1)) / down;\n  TORCH_CHECK(yw > 0 && yh > 0, \"output must be at least 1x1\");\n  TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, \"output is too large\");\n  torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(),\n                                 x.suggest_memory_format());\n\n  // Allocate sign tensor.\n  torch::Tensor so;\n  torch::Tensor s = si;\n  bool readSigns = !!s.numel();\n  int64_t sw_active = 0;  // Active width of sign tensor.\n  if (writeSigns) {\n    sw_active = yw * down - (down - 1) + fdt_w;   // Active width in elements.\n    int64_t sh = yh * down - (down - 1) + fdt_h;  // Height = active height.\n    int64_t sw = (sw_active + 15) & ~15; 
 // Width  = active width in elements,\n                                          // rounded up to multiple of 16.\n    TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, \"signs is too large\");\n    s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2},\n                          x.options().dtype(torch::kUInt8),\n                          at::MemoryFormat::Contiguous);\n  } else if (readSigns)\n    sw_active = s.size(3) << 2;\n\n  // Validate sign tensor if in use.\n  if (readSigns || writeSigns) {\n    TORCH_CHECK(s.is_contiguous(), \"signs must be contiguous\");\n    TORCH_CHECK(s.dtype() == torch::kUInt8, \"signs must be uint8\");\n    TORCH_CHECK(s.device() == x.device(),\n                \"signs must reside on the same device as x\");\n    TORCH_CHECK(s.dim() == 4, \"signs must be rank 4\");\n    TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1),\n                \"signs must have same batch & channels as x\");\n    TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX,\n                \"signs is too large\");\n  }\n\n  // Populate rest of MUSA kernel parameters.\n  p.x = x.data_ptr();\n  p.y = y.data_ptr();\n  p.b = b.data_ptr();\n  p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;\n  p.fu = fu.data_ptr<float>();\n  p.fd = fd.data_ptr<float>();\n  p.pad0 = make_int2(px0, py0);\n  p.gain = gain;\n  p.slope = slope;\n  p.clamp = clamp;\n  p.flip = (flip_filters) ? 1 : 0;\n  p.xShape =\n      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));\n  p.yShape =\n      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));\n  p.sShape = (readSigns || writeSigns)\n                 ? make_int2((int)s.size(3), (int)s.size(2))\n                 : make_int2(0, 0);  // Width is in bytes. 
Contiguous.\n  p.sOfs = make_int2(sx, sy);\n  p.swLimit = (sw_active + 3) >> 2;  // Rounded up to bytes.\n\n  // x, y, b strides are in bytes.\n  p.xStride = make_longlong4(sz * x.stride(3), sz * x.stride(2),\n                             sz * x.stride(1), sz * x.stride(0));\n  p.yStride = make_longlong4(sz * y.stride(3), sz * y.stride(2),\n                             sz * y.stride(1), sz * y.stride(0));\n  p.bStride = sz * b.stride(0);\n\n  // fu, fd strides are in elements.\n  p.fuStride =\n      make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);\n  p.fdStride =\n      make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);\n\n  // Determine if indices don't fit in int32. Support negative strides although\n  // Torch currently never produces those.\n  bool index64b = false;\n  if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;\n  if (std::min(x.size(0) * p.xStride.w, 0ll) +\n          std::min(x.size(1) * p.xStride.z, 0ll) +\n          std::min(x.size(2) * p.xStride.y, 0ll) +\n          std::min(x.size(3) * p.xStride.x, 0ll) <\n      -INT_MAX)\n    index64b = true;\n  if (std::max(x.size(0) * p.xStride.w, 0ll) +\n          std::max(x.size(1) * p.xStride.z, 0ll) +\n          std::max(x.size(2) * p.xStride.y, 0ll) +\n          std::max(x.size(3) * p.xStride.x, 0ll) >\n      INT_MAX)\n    index64b = true;\n  if (std::min(y.size(0) * p.yStride.w, 0ll) +\n          std::min(y.size(1) * p.yStride.z, 0ll) +\n          std::min(y.size(2) * p.yStride.y, 0ll) +\n          std::min(y.size(3) * p.yStride.x, 0ll) <\n      -INT_MAX)\n    index64b = true;\n  if (std::max(y.size(0) * p.yStride.w, 0ll) +\n          std::max(y.size(1) * p.yStride.z, 0ll) +\n          std::max(y.size(2) * p.yStride.y, 0ll) +\n          std::max(y.size(3) * p.yStride.x, 0ll) >\n      INT_MAX)\n    index64b = true;\n  if (s.numel() > INT_MAX) index64b = true;\n\n  // Choose MUSA kernel.\n  filtered_lrelu_kernel_spec spec = {0};\n  
AT_DISPATCH_FLOATING_TYPES(\n      x.scalar_type(), \"filtered_lrelu_musa\", [&] {\n        if constexpr (sizeof(scalar_t) <=\n                      4)  // Exclude doubles. constexpr\n                          // prevents template instantiation.\n        {\n          // Choose kernel based on index type, datatype and sign read/write\n          // modes.\n          if (!index64b && writeSigns && !readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true, false>(\n                p, sharedKB);\n          else if (!index64b && !writeSigns && readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true>(\n                p, sharedKB);\n          else if (!index64b && !writeSigns && !readSigns)\n            spec =\n                choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(\n                    p, sharedKB);\n          else if (index64b && writeSigns && !readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true, false>(\n                p, sharedKB);\n          else if (index64b && !writeSigns && readSigns)\n            spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true>(\n                p, sharedKB);\n          else if (index64b && !writeSigns && !readSigns)\n            spec =\n                choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(\n                    p, sharedKB);\n        }\n      });\n  TORCH_CHECK(\n      spec.exec,\n      \"internal error - MUSA kernel not found\")  // This should not happen\n                                                 // because we tested earlier\n                                                 // that kernel exists.\n\n  // Launch MUSA kernel.\n  void *args[] = {&p};\n  int bx = spec.numWarps * 32;\n  int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;\n  int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;\n  int gz = p.yShape.z * p.yShape.w;\n\n  // Repeat multiple horizontal tiles in a CTA?\n  if 
(spec.xrep) {\n    p.tilesXrep = spec.xrep;\n    p.tilesXdim = gx;\n\n    gx = (gx + p.tilesXrep - 1) / p.tilesXrep;\n    std::swap(gx, gy);\n  } else {\n    p.tilesXrep = 0;\n    p.tilesXdim = 0;\n  }\n#ifdef MMCV_WITH_HIP\n  AT_MUSA_CHECK(hipLaunchKernel(spec.setup, 1, 1024, args, 0,\n                                c10::musa::getCurrentMUSAStream()));\n#else\n  // Launch filter setup kernel.\n  AT_MUSA_CHECK(musaLaunchKernel(spec.setup, 1, 1024, args, 0,\n                                 c10::musa::getCurrentMUSAStream()));\n#endif\n\n  // Copy kernels to constant memory.\n  if (writeSigns && !readSigns)\n    AT_MUSA_CHECK((copy_filters(c10::musa::getCurrentMUSAStream())));\n  else if (!writeSigns && readSigns)\n    AT_MUSA_CHECK((copy_filters(c10::musa::getCurrentMUSAStream())));\n  else if (!writeSigns && !readSigns)\n    AT_MUSA_CHECK((copy_filters(c10::musa::getCurrentMUSAStream())));\n\n  // Set cache and shared memory configurations for main kernel.\n  // FIXME:TODO FIX BUG\n  AT_MUSA_CHECK(musaFuncSetCacheConfig(spec.exec, musaFuncCachePreferShared));\n  if (spec.dynamicSharedKB)  // Need dynamically allocated shared memory?\n#ifdef MMCV_WITH_HIP\n    AT_MUSA_CHECK(hipFuncSetAttribute(\n        spec.exec, hipFuncAttributeMaxDynamicSharedMemorySize,\n        spec.dynamicSharedKB << 10));\n#else\n    AT_MUSA_CHECK(musaFuncSetAttribute(\n        spec.exec, musaFuncAttributeMaxDynamicSharedMemorySize,\n        spec.dynamicSharedKB << 10));\n#endif\n  // FIXME:TODO FIX BUG\n  AT_MUSA_CHECK(\n      musaFuncSetSharedMemConfig(spec.exec, musaSharedMemBankSizeFourByte));\n\n  // Launch main kernel.\n  const int maxSubGz = 65535;  // MUSA maximum for block z dimension.\n  for (int zofs = 0; zofs < gz;\n       zofs += maxSubGz)  // Do multiple launches if gz is too big.\n  {\n    p.blockZofs = zofs;\n    int subGz = std::min(maxSubGz, gz - zofs);\n// FIXME:TODO FIX BUG\n#ifdef MMCV_WITH_HIP\n    AT_MUSA_CHECK(hipLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, 
args,\n                                  spec.dynamicSharedKB << 10,\n                                  c10::musa::getCurrentMUSAStream()));\n#else\n    AT_MUSA_CHECK(musaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args,\n                                   spec.dynamicSharedKB << 10,\n                                   c10::musa::getCurrentMUSAStream()));\n#endif\n  }\n\n  // Done.\n  return std::make_tuple(y, so, 0);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op_impl(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns);\n\nREGISTER_DEVICE_IMPL(filtered_lrelu_op_impl, MUSA, filtered_lrelu_op);\n\n#else\n\n#pragma message(                           \\\n    \"filtered_lrelu_op is not available. \" \\\n    \"Please update your compiler and musa version.\")\n\n#endif\n#undef BUILD_FILTERED_LRELU_OP\n\n//------------------------------------------------------------------------\n\ntorch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,\n                                    int sy, float gain, float slope,\n                                    float clamp, bool writeSigns) {\n  // Set MUSA device.\n  TORCH_CHECK(x.is_privateuseone(), \"x must reside on MUSA device\");\n  const at::musa::OptionalMUSAGuard device_guard(device_of(x));\n\n  // Validate arguments.\n  TORCH_CHECK(x.dim() == 4, \"x must be rank 4\");\n  TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX &&\n                  x.size(3) <= INT_MAX,\n              \"x is too large\");\n  TORCH_CHECK(x.numel() > 0, \"x is empty\");\n  TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat ||\n                  x.dtype() == torch::kDouble,\n              \"x must be float16, float32 or float64\");\n\n  // Output signs if we don't have sign 
input.\n  torch::Tensor so;\n  torch::Tensor s = si;\n  bool readSigns = !!s.numel();\n  if (writeSigns) {\n    int64_t sw = x.size(3);\n    sw = (sw + 15) & ~15;  // Round to a multiple of 16 for coalescing.\n    s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2},\n                          x.options().dtype(torch::kUInt8),\n                          at::MemoryFormat::Contiguous);\n  }\n\n  // Validate sign tensor if in use.\n  if (readSigns || writeSigns) {\n    TORCH_CHECK(s.is_contiguous(), \"signs must be contiguous\");\n    TORCH_CHECK(s.dtype() == torch::kUInt8, \"signs must be uint8\");\n    TORCH_CHECK(s.device() == x.device(),\n                \"signs must reside on the same device as x\");\n    TORCH_CHECK(s.dim() == 4, \"signs must be rank 4\");\n    TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1),\n                \"signs must have same batch & channels as x\");\n    TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX,\n                \"signs tensor is too large\");\n  }\n\n  // Initialize MUSA kernel parameters.\n  filtered_lrelu_act_kernel_params p;\n  p.x = x.data_ptr();\n  p.s = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;\n  p.gain = gain;\n  p.slope = slope;\n  p.clamp = clamp;\n  p.xShape =\n      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));\n  p.xStride =\n      make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));\n  p.sShape = (readSigns || writeSigns)\n                 ? make_int2((int)s.size(3) << 2, (int)s.size(2))\n                 : make_int2(0, 0);  // Width is in elements. 
Contiguous.\n  p.sOfs = make_int2(sx, sy);\n\n  // Choose MUSA kernel.\n  void *func = 0;\n  AT_DISPATCH_FLOATING_TYPES(\n      x.scalar_type(), \"filtered_lrelu_act_musa\", [&] {\n        if (writeSigns)\n          func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();\n        else if (readSigns)\n          func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();\n        else\n          func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();\n      });\n  TORCH_CHECK(func, \"internal error - MUSA kernel not found\");\n\n  // Launch MUSA kernel.\n  void *args[] = {&p};\n  int bx = 128;  // 4 warps per block.\n\n  // Logical size of launch = writeSigns ? p.s : p.x\n  uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;\n  uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;\n  uint32_t gz =\n      p.xShape.z * p.xShape.w;  // Same as in p.sShape if signs are in use.\n  gx = (gx - 1) / bx + 1;\n\n  // Make sure grid y and z dimensions are within MUSA launch limits. Kernel\n  // loops internally to do the rest.\n  const uint32_t gmax = 65535;\n  gy = std::min(gy, gmax);\n  gz = std::min(gz, gmax);\n\n  // Launch.\n#ifdef MMCV_WITH_HIP\n  AT_MUSA_CHECK(hipLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,\n                                c10::musa::getCurrentMUSAStream()));\n#else\n  AT_MUSA_CHECK(musaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0,\n                                 c10::musa::getCurrentMUSAStream()));\n#endif\n\n  return so;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/focal_loss_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"sigmoid_focal_loss_musa_kernel.muh\"\n#include \"softmax_focal_loss_musa_kernel.muh\"\n\nvoid SigmoidFocalLossForwardMUSAKernelLauncher(Tensor input, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha) {\n  int output_size = output.numel();\n  int num_classes = input.size(1);\n  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,\n             \"target label should smaller or equal than num classes\");\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sigmoid_focal_loss_forward_musa_kernel\", [&] {\n        sigmoid_focal_loss_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                output.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid SigmoidFocalLossBackwardMUSAKernelLauncher(Tensor input, Tensor target,\n                                                Tensor weight,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha) {\n  int output_size = grad_input.numel();\n  int num_classes = input.size(1);\n\n  c10::musa::MUSAGuard device_guard(grad_input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"sigmoid_focal_loss_backward_musa_kernel\", [&] 
{\n        sigmoid_focal_loss_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid SoftmaxFocalLossForwardMUSAKernelLauncher(Tensor softmax, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha) {\n  int output_size = output.numel();\n  int num_classes = softmax.size(1);\n\n  AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes,\n             \"target label should smaller or equal than num classes\");\n  c10::musa::MUSAGuard device_guard(softmax.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      softmax.scalar_type(), \"softmax_focal_loss_forward_musa_kernel\", [&] {\n        softmax_focal_loss_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, softmax.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                output.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid SoftmaxFocalLossBackwardMUSAKernelLauncher(Tensor softmax, Tensor target,\n                                                Tensor weight, Tensor buff,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha) {\n  int num_classes = softmax.size(1);\n\n  int output_size = buff.numel();\n  c10::musa::MUSAGuard 
device_guard(grad_input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_input.scalar_type(),\n      \"softmax_focal_loss_backward_musa1_\"\n      \"kernel\",\n      [&] {\n        softmax_focal_loss_backward_musa1_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, softmax.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(),\n                buff.data_ptr<scalar_t>(), gamma, alpha, num_classes);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  output_size = grad_input.numel();\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_input.scalar_type(),\n      \"softmax_focal_loss_backward_musa2_\"\n      \"kernel\",\n      [&] {\n        softmax_focal_loss_backward_musa2_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, softmax.data_ptr<scalar_t>(),\n                target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(),\n                grad_input.data_ptr<scalar_t>(), num_classes);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/furthest_point_sample_musa.mu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling_gpu.cu\n\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"furthest_point_sample_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\ninline int opt_n_threads(int work_size) {\n  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);\n\n  return max(min(1 << pow_2, 1024), 1);\n}\n\nvoid FurthestPointSamplingForwardMUSAKernelLauncher(int b, int n, int m,\n                                                    const float* dataset,\n                                                    float* temp, int* idxs) {\n  // dataset: (B, N, 3)\n  // tmp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  unsigned int n_threads = opt_n_threads(n);\n\n  switch (n_threads) {\n    case 1024:\n      furthest_point_sampling_forward_musa_kernel<1024>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 512:\n      furthest_point_sampling_forward_musa_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 256:\n      furthest_point_sampling_forward_musa_kernel<256>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 128:\n      furthest_point_sampling_forward_musa_kernel<128>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 64:\n      furthest_point_sampling_forward_musa_kernel<64>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 32:\n      furthest_point_sampling_forward_musa_kernel<32>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 16:\n      furthest_point_sampling_forward_musa_kernel<16>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 8:\n      
furthest_point_sampling_forward_musa_kernel<8>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 4:\n      furthest_point_sampling_forward_musa_kernel<4>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 2:\n      furthest_point_sampling_forward_musa_kernel<2>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 1:\n      furthest_point_sampling_forward_musa_kernel<1>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    default:\n      furthest_point_sampling_forward_musa_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n  }\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid FurthestPointSamplingWithDistForwardMUSAKernelLauncher(\n    int b, int n, int m, const float* dataset, float* temp, int* idxs) {\n  // dataset: (B, N, N)\n  // temp: (B, N)\n  // output:\n  //      idx: (B, M)\n\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  unsigned int n_threads = opt_n_threads(n);\n\n  switch (n_threads) {\n    case 1024:\n      furthest_point_sampling_with_dist_forward_musa_kernel<1024>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 512:\n      furthest_point_sampling_with_dist_forward_musa_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 256:\n      furthest_point_sampling_with_dist_forward_musa_kernel<256>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 128:\n      furthest_point_sampling_with_dist_forward_musa_kernel<128>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 64:\n      furthest_point_sampling_with_dist_forward_musa_kernel<64>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 32:\n      
furthest_point_sampling_with_dist_forward_musa_kernel<32>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 16:\n      furthest_point_sampling_with_dist_forward_musa_kernel<16>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 8:\n      furthest_point_sampling_with_dist_forward_musa_kernel<8>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 4:\n      furthest_point_sampling_with_dist_forward_musa_kernel<4>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 2:\n      furthest_point_sampling_with_dist_forward_musa_kernel<2>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    case 1:\n      furthest_point_sampling_with_dist_forward_musa_kernel<1>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n      break;\n    default:\n      furthest_point_sampling_with_dist_forward_musa_kernel<512>\n          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);\n  }\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/fused_bias_leakyrelu_musa.mu",
    "content": "// Modified from\n// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act_kernel.cu\n// Copyright (c) 2019, NVIDIA Corporation. All rights reserved.\n//\n// This work is made available under the Nvidia Source Code License-NC.\n// To view a copy of this license, visit\n// https://nvlabs.github.io/stylegan2/license.html\n\n#include <ATen/ATen.h>\n#include <ATen/AccumulateType.h>\n#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"\n#include <musa.h>\n#include <musa_runtime.h>\n#include <torch/types.h>\n\n#include <ATen/musa/MUSA_PORT_ApplyUtils.muh>\n\ntemplate <typename scalar_t>\nstatic __global__ void fused_bias_act_kernel(\n    scalar_t* out, const scalar_t* p_x, const scalar_t* p_b,\n    const scalar_t* p_ref, int act, int grad, scalar_t alpha, scalar_t scale,\n    int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {\n  int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;\n\n  scalar_t zero = 0.0;\n\n  for (int loop_idx = 0; loop_idx < loop_x && xi < size_x;\n       loop_idx++, xi += blockDim.x) {\n    scalar_t x = p_x[xi];\n\n    if (use_bias) {\n      x += p_b[(xi / step_b) % size_b];\n    }\n\n    scalar_t ref = use_ref ? p_ref[xi] : zero;\n\n    scalar_t y;\n\n    // act = 1: linear layer\n    // act = 3: leaky relu layer\n    // grad = 0: direct forward path\n    // grad = 1: first order deviation\n    // grad = 2: second order deviation\n    switch (act * 10 + grad) {\n      default:\n      case 10:\n        y = x;\n        break;\n      case 11:\n        y = x;\n        break;\n      case 12:\n        y = 0.0;\n        break;\n\n      case 30:\n        y = (x > 0.0) ? x : x * alpha;\n        break;\n      case 31:\n        y = (ref > 0.0) ? 
x : x * alpha;\n        break;\n      case 32:\n        y = 0.0;\n        break;\n    }\n\n    out[xi] = y * scale;\n  }\n}\n\ntorch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,\n                                      const torch::Tensor& bias,\n                                      const torch::Tensor& refer, int act,\n                                      int grad, float alpha, float scale) {\n  int curDevice = -1;\n  musaGetDevice(&curDevice);\n  musaStream_t stream = at::musa::getCurrentMUSAStream(curDevice);\n\n  auto x = input.contiguous();\n  auto b = bias.contiguous();\n  auto ref = refer.contiguous();\n\n  int use_bias = b.numel() ? 1 : 0;\n  int use_ref = ref.numel() ? 1 : 0;\n\n  int size_x = x.numel();\n  int size_b = b.numel();\n  int step_b = 1;\n\n  for (int i = 1 + 1; i < x.dim(); i++) {\n    step_b *= x.size(i);\n  }\n\n  int loop_x = 4;\n  int block_size = 4 * 32;\n  int grid_size = (size_x - 1) / (loop_x * block_size) + 1;\n\n  auto y = torch::empty_like(x);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      x.scalar_type(), \"fused_bias_act_kernel\", [&] {\n        fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(\n            y.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(),\n            b.data_ptr<scalar_t>(), ref.data_ptr<scalar_t>(), act, grad, alpha,\n            scale, loop_x, size_x, step_b, size_b, use_bias, use_ref);\n      });\n\n  return y;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/fused_spconv_ops_musa.mu",
    "content": "#include <musa_runtime_api.h>\n#include <torch/script.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/indice.h>\n#include <utils/spconv/spconv/reordering.h>\n\n#include \"pytorch_musa_helper.hpp\"\n\ntorch::Tensor FusedIndiceConvBatchnormMUSAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM) {\n  c10::musa::MUSAGuard device_guard(features.device());\n  bool subM = _subM != 0;\n  bool inverse = _inverse != 0;\n  auto device = features.device().type();\n  auto ndim = filters.dim() - 2;\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto numOutPlanes = filters.size(ndim + 1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto indicePairMaxSizeIter =\n      std::max_element(indicePairNumCpu.data_ptr<int>(),\n                       indicePairNumCpu.data_ptr<int>() + kernelVolume);\n  int indicePairMaxOffset =\n      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();\n  int indicePairMaxSize = *indicePairMaxSizeIter;\n\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n\n  torch::Tensor output =\n      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);\n  torch::Tensor inputBuffer =\n      torch::zeros({indicePairMaxSize, numInPlanes}, options);\n  torch::Tensor outputBuffer =\n      torch::zeros({indicePairMaxSize, numOutPlanes}, options);\n  filters = filters.view({-1, numInPlanes, numOutPlanes});\n  if (subM) {  // the center index of subm conv don't need gather and scatter\n               // add.\n    torch::mm_out(output, features, filters[indicePairMaxOffset]);\n  }\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0 
|| (subM && i == indicePairMaxOffset)) {\n      continue;\n    }\n\n    AT_DISPATCH_FLOATING_TYPES(\n        features.scalar_type(), \"FusedIndiceConvBatchnormKernel\", [&] {\n          auto outputBufferBlob = torch::from_blob(\n              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);\n          auto inputBufferBlob = torch::from_blob(\n              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);\n\n          if (device == torch::kCPU) {\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;\n            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n          } else {\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n                gatherFtor;\n            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            TV_CHECK_MUSA_ERR();\n            /* slower than SparseGatherFunctor, may due to int->long conversion\n            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);\n            auto indicePairBlob =\n            torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},\n            indicePairOptions); torch::index_select_out(inputBufferBlob,\n            features, 0, indicePairBlob);*/\n          }\n          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);\n\n          if (device == torch::kCPU) {\n            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::CPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n          } else {\n            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n            TV_CHECK_MUSA_ERR();\n          }\n        });\n  }\n\n  return output;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/gather_points_musa.mu",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n\n#include \"gather_points_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid GatherPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                           const Tensor points,\n                                           const Tensor idx, Tensor out) {\n  // points: (B, C, N)\n  // idx: (B, npoints)\n  // output:\n  //      out: (B, C, npoints)\n\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"gather_points_forward_musa_kernel\", [&] {\n        gather_points_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, npoints, points.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), out.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid GatherPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                            const Tensor grad_out,\n                                            const Tensor idx,\n                                            Tensor grad_points) {\n  // grad_out: (B, C, npoints)\n  // idx: (B, npoints)\n  // output:\n  //      grad_points: (B, C, N)\n\n  c10::musa::MUSAGuard device_guard(grad_out.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"gather_points_backward_musa_kernel\", [&] {\n        gather_points_backward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, 
npoints, grad_out.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/group_points_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"group_points_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid GroupPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                          int nsample, const Tensor points,\n                                          const Tensor idx, Tensor out) {\n  // points: (B, C, N)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      out: (B, C, npoints, nsample)\n\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"group_points_forward_musa_kernel\", [&] {\n        group_points_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, npoints, nsample, points.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), out.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid GroupPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                           int nsample, const Tensor grad_out,\n                                           const Tensor idx,\n                                           Tensor grad_points) {\n  // grad_out: (B, C, npoints, nsample)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      grad_points: (B, C, N)\n\n  c10::musa::MUSAGuard device_guard(grad_out.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);\n  dim3 
threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"group_points_backward_musa_kernel\", [&] {\n        group_points_backward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, npoints, nsample, grad_out.data_ptr<scalar_t>(),\n                idx.data_ptr<int>(), grad_points.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/iou3d_musa.mu",
    "content": "// Modified from\n// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu\n\n/*\n3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)\nWritten by Shaoshuai Shi\nAll Rights Reserved 2019-2020.\n*/\n\n#include <stdio.h>\n\n#include \"iou3d_musa_kernel.muh\"\n#include \"nms_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid IoU3DBoxesOverlapBevForwardMUSAKernelLauncher(const int num_a,\n                                                   const Tensor boxes_a,\n                                                   const int num_b,\n                                                   const Tensor boxes_b,\n                                                   Tensor ans_overlap) {\n  c10::musa::MUSAGuard device_guard(boxes_a.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),\n              GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));\n  dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);\n\n  iou3d_boxes_overlap_bev_forward_musa_kernel<<<blocks, threads, 0, stream>>>(\n      num_a, boxes_a.data_ptr<float>(), num_b, boxes_b.data_ptr<float>(),\n      ans_overlap.data_ptr<float>());\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid IoU3DNMS3DForwardMUSAKernelLauncher(const Tensor boxes, Tensor& keep,\n                                         Tensor& keep_num,\n                                         float nms_overlap_thresh) {\n  using namespace at::indexing;\n  c10::musa::MUSAGuard device_guard(boxes.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  int boxes_num = boxes.size(0);\n\n  const int col_blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  Tensor mask =\n      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));\n\n  dim3 blocks(GET_BLOCKS(boxes_num, 
THREADS_PER_BLOCK_NMS),\n              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));\n  dim3 threads(THREADS_PER_BLOCK_NMS);\n\n  iou3d_nms3d_forward_musa_kernel<<<blocks, threads, 0, stream>>>(\n      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),\n      (unsigned long long*)mask.data_ptr<int64_t>());\n\n  at::Tensor keep_t = at::zeros(\n      {boxes_num}, boxes.options().dtype(at::kBool).device(::at::musa::kMUSA));\n  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),\n                          col_blocks * sizeof(unsigned long long), stream>>>(\n      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),\n      boxes_num);\n\n  auto keep_data = keep_t.nonzero().index({Slice(), 0});\n  keep_num.fill_(at::Scalar(keep_data.size(0)));\n  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid IoU3DNMS3DNormalForwardMUSAKernelLauncher(const Tensor boxes, Tensor& keep,\n                                               Tensor& keep_num,\n                                               float nms_overlap_thresh) {\n  using namespace at::indexing;\n  c10::musa::MUSAGuard device_guard(boxes.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  int boxes_num = boxes.size(0);\n\n  const int col_blocks =\n      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;\n  Tensor mask =\n      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));\n\n  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),\n              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));\n  dim3 threads(THREADS_PER_BLOCK_NMS);\n\n  iou3d_nms3d_normal_forward_musa_kernel<<<blocks, threads, 0, stream>>>(\n      boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(),\n      (unsigned long long*)mask.data_ptr<int64_t>());\n\n  at::Tensor keep_t = at::zeros(\n      {boxes_num}, boxes.options().dtype(at::kBool).device(::at::musa::kMUSA));\n  gather_keep_from_mask<<<1, 
min(col_blocks, THREADS_PER_BLOCK),\n                          col_blocks * sizeof(unsigned long long), stream>>>(\n      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),\n      boxes_num);\n\n  auto keep_data = keep_t.nonzero().index({Slice(), 0});\n  keep_num.fill_(at::Scalar(keep_data.size(0)));\n  keep.index_put_({Slice(0, keep_data.size(0))}, keep_data);\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/knn_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap\n\n#include <cmath>\n#include <cstdio>\n\n#include \"knn_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid KNNForwardMUSAKernelLauncher(int b, int n, int m, int nsample,\n                                  const Tensor xyz, const Tensor new_xyz,\n                                  Tensor idx, Tensor dist2) {\n  // param new_xyz: (B, m, 3)\n  // param xyz: (B, n, 3)\n  // param idx: (B, m, nsample)\n\n  c10::musa::MUSAGuard device_guard(new_xyz.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      new_xyz.scalar_type(), \"knn_forward_musa_kernel\", [&] {\n        knn_forward_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            b, n, m, nsample, xyz.data_ptr<scalar_t>(),\n            new_xyz.data_ptr<scalar_t>(), idx.data_ptr<int>(),\n            dist2.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/masked_conv2d_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"masked_conv2d_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid MaskedIm2colForwardMUSAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int kernel_h,\n                                           const int kernel_w, const int pad_h,\n                                           const int pad_w) {\n  int channels = bottom_data.size(1);\n  int height = bottom_data.size(2);\n  int width = bottom_data.size(3);\n  int mask_cnt = mask_h_idx.size(0);\n  int output_size = mask_cnt * channels;\n\n  c10::musa::MUSAGuard device_guard(bottom_data.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      bottom_data.scalar_type(), \"MaskedIm2colLaucherForward\", ([&] {\n        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();\n        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();\n        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();\n        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();\n        MaskedIm2colForward<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, bottom_data_, height, width, kernel_h, kernel_w,\n                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid MaskedCol2imForwardMUSAKernelLauncher(\n    const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx,\n    Tensor top_data, const int height, const int width, const int channels) {\n  int mask_cnt = mask_h_idx.size(0);\n  int output_size = mask_cnt * channels;\n\n  c10::musa::MUSAGuard device_guard(bottom_data.device());\n  musaStream_t stream = 
c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      bottom_data.scalar_type(), \"MaskedCol2imLaucherForward\", ([&] {\n        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();\n        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();\n        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();\n        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();\n\n        MaskedCol2imForward<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, bottom_data_, height, width, channels, mask_h_idx_,\n                mask_w_idx_, mask_cnt, top_data_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/min_area_polygons.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// modified from\n// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/minareabbox/src/minareabbox_kernel.cu\n#include \"min_area_polygons_musa.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid MinAreaPolygonsMUSAKernelLauncher(const Tensor pointsets,\n                                       Tensor polygons) {\n  int num_pointsets = pointsets.size(0);\n  const int output_size = polygons.numel();\n  c10::musa::MUSAGuard device_guard(pointsets.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      pointsets.scalar_type(), \"min_area_polygons_musa_kernel\", ([&] {\n        min_area_polygons_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                num_pointsets, pointsets.data_ptr<scalar_t>(),\n                polygons.data_ptr<scalar_t>());\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/modulated_deform_conv_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"modulated_deform_conv_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid modulated_deformable_im2col_musa(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col) {\n  // num_axes should be smaller than block size\n  const int channel_per_deformable_group = channels / deformable_group;\n  const int num_kernels = channels * batch_size * height_col * width_col;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_im.scalar_type(), \"modulated_deformable_im2col_gpu\", ([&] {\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n\n        modulated_deformable_im2col_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            c10::musa::getCurrentMUSAStream()>>>(\n            num_kernels, data_im_, data_offset_, data_mask_, height_im,\n            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,\n            dilation_h, dilation_w, channel_per_deformable_group, batch_size,\n            channels, deformable_group, height_col, width_col, data_col_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid modulated_deformable_col2im_musa(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im) {\n  const int channel_per_deformable_group = channels / deformable_group;\n  const int num_kernels =\n      channels * kernel_h * kernel_w * batch_size * height_col * width_col;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"modulated_deformable_col2im_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();\n\n        modulated_deformable_col2im_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            c10::musa::getCurrentMUSAStream()>>>(\n            num_kernels, data_col_, data_offset_, data_mask_, channels,\n            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,\n            stride_w, dilation_h, dilation_w, channel_per_deformable_group,\n            batch_size, deformable_group, height_col, width_col, grad_im_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid modulated_deformable_col2im_coord_musa(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask) {\n  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *\n                          kernel_w * deformable_group;\n  const int 
channel_per_deformable_group =\n      channels * kernel_h * kernel_w / deformable_group;\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      data_col.scalar_type(), \"modulated_deformable_col2im_coord_gpu\", ([&] {\n        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();\n        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();\n        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();\n        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();\n        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();\n        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();\n\n        modulated_deformable_col2im_coord_gpu_kernel<<<\n            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,\n            c10::musa::getCurrentMUSAStream()>>>(\n            num_kernels, data_col_, data_im_, data_offset_, data_mask_,\n            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,\n            stride_h, stride_w, dilation_h, dilation_w,\n            channel_per_deformable_group, batch_size,\n            2 * kernel_h * kernel_w * deformable_group, deformable_group,\n            height_col, width_col, grad_offset_, grad_mask_);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/ms_deform_attn_musa.mu",
    "content": "/*!\n**************************************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 SenseTime. All Rights Reserved.\n* Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n**************************************************************************************************\n* Modified from\n*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n**************************************************************************************************\n*/\n\n#include <ATen/ATen.h>\n#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"\n#include <musa.h>\n#include <musa_runtime.h>\n\n#include <THC/THCAtomics.muh>\n#include <vector>\n\n#include \"ms_deform_attn_musa_kernel.muh\"\n\ntemplate <typename scalar_t>\nvoid ms_deformable_im2col_musa(musaStream_t stream, const scalar_t *data_value,\n                               const int64_t *data_spatial_shapes,\n                               const int64_t *data_level_start_index,\n                               const scalar_t *data_sampling_loc,\n                               const scalar_t *data_attn_weight,\n                               const int batch_size, const int spatial_size,\n                               const int num_heads, const int channels,\n                               const int num_levels, const int num_query,\n                               const int num_point, scalar_t *data_col) {\n  const int num_kernels = batch_size * num_query * num_heads * channels;\n  const int num_actual_kernels = batch_size * num_query * num_heads * channels;\n  const int num_threads = THREADS_PER_BLOCK;\n  ms_deformable_im2col_gpu_kernel<scalar_t>\n      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(\n          num_kernels, data_value, data_spatial_shapes, data_level_start_index,\n          data_sampling_loc, data_attn_weight, batch_size, spatial_size,\n          num_heads, 
channels, num_levels, num_query, num_point, data_col);\n\n  musaError_t err = musaGetLastError();\n  if (err != musaSuccess) {\n    printf(\"error in ms_deformable_im2col_musa: %s\\n\", musaGetErrorString(err));\n  }\n}\n\ntemplate <typename scalar_t>\nvoid ms_deformable_col2im_musa(\n    musaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value,\n    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,\n    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,\n    const int batch_size, const int spatial_size, const int num_heads,\n    const int channels, const int num_levels, const int num_query,\n    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,\n    scalar_t *grad_attn_weight) {\n  const int num_threads =\n      (channels > THREADS_PER_BLOCK) ? THREADS_PER_BLOCK : channels;\n  const int num_kernels = batch_size * num_query * num_heads * channels;\n  const int num_actual_kernels = batch_size * num_query * num_heads * channels;\n  if (channels > THREADS_PER_BLOCK) {\n    if ((channels & THREADS_PER_BLOCK - 1) == 0) {\n      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n             num_threads * 3 * sizeof(scalar_t), stream>>>(\n              num_kernels, grad_col, data_value, data_spatial_shapes,\n              data_level_start_index, data_sampling_loc, data_attn_weight,\n              batch_size, spatial_size, num_heads, channels, num_levels,\n              num_query, num_point, grad_value, grad_sampling_loc,\n              grad_attn_weight);\n    } else {\n      ms_deformable_col2im_gpu_kernel_gm<scalar_t>\n          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n             stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                       data_level_start_index, data_sampling_loc,\n                       data_attn_weight, batch_size, 
spatial_size, num_heads,\n                       channels, num_levels, num_query, num_point, grad_value,\n                       grad_sampling_loc, grad_attn_weight);\n    }\n  } else {\n    switch (channels) {\n      case 1:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      1>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 2:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      2>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 4:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      4>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         
channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 8:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      8>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 16:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      16>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 32:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t,\n                                                                      32>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                
         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 64:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      64>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 128:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      128>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      case 256:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      256>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n   
   case 512:\n        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t,\n                                                                      512>\n            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,\n               stream>>>(num_kernels, grad_col, data_value, data_spatial_shapes,\n                         data_level_start_index, data_sampling_loc,\n                         data_attn_weight, batch_size, spatial_size, num_heads,\n                         channels, num_levels, num_query, num_point, grad_value,\n                         grad_sampling_loc, grad_attn_weight);\n        break;\n      default:\n        if (channels < 64) {\n          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>\n              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n                 num_threads * 3 * sizeof(scalar_t), stream>>>(\n                  num_kernels, grad_col, data_value, data_spatial_shapes,\n                  data_level_start_index, data_sampling_loc, data_attn_weight,\n                  batch_size, spatial_size, num_heads, channels, num_levels,\n                  num_query, num_point, grad_value, grad_sampling_loc,\n                  grad_attn_weight);\n        } else {\n          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>\n              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,\n                 num_threads * 3 * sizeof(scalar_t), stream>>>(\n                  num_kernels, grad_col, data_value, data_spatial_shapes,\n                  data_level_start_index, data_sampling_loc, data_attn_weight,\n                  batch_size, spatial_size, num_heads, channels, num_levels,\n                  num_query, num_point, grad_value, grad_sampling_loc,\n                  grad_attn_weight);\n        }\n    }\n  }\n  musaError_t err = musaGetLastError();\n  if (err != musaSuccess) {\n    printf(\"error in ms_deformable_col2im_musa: %s\\n\", musaGetErrorString(err));\n  
}\n}\n\nat::Tensor ms_deform_attn_musa_forward(const at::Tensor &value,\n                                       const at::Tensor &spatial_shapes,\n                                       const at::Tensor &level_start_index,\n                                       const at::Tensor &sampling_loc,\n                                       const at::Tensor &attn_weight,\n                                       const int im2col_step) {\n  AT_ASSERTM(value.is_contiguous(), \"value tensor has to be contiguous\");\n  AT_ASSERTM(spatial_shapes.is_contiguous(),\n             \"spatial_shapes tensor has to be contiguous\");\n  AT_ASSERTM(level_start_index.is_contiguous(),\n             \"level_start_index tensor has to be contiguous\");\n  AT_ASSERTM(sampling_loc.is_contiguous(),\n             \"sampling_loc tensor has to be contiguous\");\n  AT_ASSERTM(attn_weight.is_contiguous(),\n             \"attn_weight tensor has to be contiguous\");\n\n  AT_ASSERTM(value.is_privateuseone(), \"value must be a MUSA tensor\");\n  AT_ASSERTM(spatial_shapes.is_privateuseone(), \"spatial_shapes must be a MUSA tensor\");\n  AT_ASSERTM(level_start_index.is_privateuseone(),\n             \"level_start_index must be a MUSA tensor\");\n  AT_ASSERTM(sampling_loc.is_privateuseone(), \"sampling_loc must be a MUSA tensor\");\n  AT_ASSERTM(attn_weight.is_privateuseone(), \"attn_weight must be a MUSA tensor\");\n\n  const int batch = value.size(0);\n  const int spatial_size = value.size(1);\n  const int num_heads = value.size(2);\n  const int channels = value.size(3);\n\n  const int num_levels = spatial_shapes.size(0);\n\n  const int num_query = sampling_loc.size(1);\n  const int num_point = sampling_loc.size(4);\n\n  const int im2col_step_ = std::min(batch, im2col_step);\n\n  AT_ASSERTM(batch % im2col_step_ == 0, \"batch(%d) must divide im2col_step(%d)\",\n             batch, im2col_step_);\n\n  auto output =\n      at::zeros({batch, num_query, num_heads, channels}, value.options());\n\n  const int 
batch_n = im2col_step_;\n  auto output_n = output.view(\n      {batch / im2col_step_, batch_n, num_query, num_heads, channels});\n  auto per_value_size = spatial_size * num_heads * channels;\n  auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;\n  auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;\n  for (int n = 0; n < batch / im2col_step_; ++n) {\n    auto columns = output_n.select(0, n);\n    AT_DISPATCH_FLOATING_TYPES(\n        value.scalar_type(), \"ms_deform_attn_forward_musa\", ([&] {\n          ms_deformable_im2col_musa(\n              c10::musa::getCurrentMUSAStream(),\n              value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,\n              spatial_shapes.data_ptr<int64_t>(),\n              level_start_index.data_ptr<int64_t>(),\n              sampling_loc.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_sample_loc_size,\n              attn_weight.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_attn_weight_size,\n              batch_n, spatial_size, num_heads, channels, num_levels, num_query,\n              num_point, columns.data_ptr<scalar_t>());\n        }));\n  }\n\n  output = output.view({batch, num_query, num_heads * channels});\n\n  return output;\n}\n\nvoid ms_deform_attn_musa_backward(\n    const at::Tensor &value, const at::Tensor &spatial_shapes,\n    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,\n    const at::Tensor &attn_weight, const at::Tensor &grad_output,\n    at::Tensor &grad_value, at::Tensor &grad_sampling_loc,\n    at::Tensor &grad_attn_weight, const int im2col_step) {\n  AT_ASSERTM(value.is_contiguous(), \"value tensor has to be contiguous\");\n  AT_ASSERTM(spatial_shapes.is_contiguous(),\n             \"spatial_shapes tensor has to be contiguous\");\n  AT_ASSERTM(level_start_index.is_contiguous(),\n             \"level_start_index tensor has to be contiguous\");\n  
AT_ASSERTM(sampling_loc.is_contiguous(),\n             \"sampling_loc tensor has to be contiguous\");\n  AT_ASSERTM(attn_weight.is_contiguous(),\n             \"attn_weight tensor has to be contiguous\");\n  AT_ASSERTM(grad_output.is_contiguous(),\n             \"grad_output tensor has to be contiguous\");\n\n  AT_ASSERTM(value.is_privateuseone(), \"value must be a MUSA tensor\");\n  AT_ASSERTM(spatial_shapes.is_privateuseone(), \"spatial_shapes must be a MUSA tensor\");\n  AT_ASSERTM(level_start_index.is_privateuseone(),\n             \"level_start_index must be a MUSA tensor\");\n  AT_ASSERTM(sampling_loc.is_privateuseone(), \"sampling_loc must be a MUSA tensor\");\n  AT_ASSERTM(attn_weight.is_privateuseone(), \"attn_weight must be a MUSA tensor\");\n  AT_ASSERTM(grad_output.is_privateuseone(), \"grad_output must be a MUSA tensor\");\n\n  const int batch = value.size(0);\n  const int spatial_size = value.size(1);\n  const int num_heads = value.size(2);\n  const int channels = value.size(3);\n\n  const int num_levels = spatial_shapes.size(0);\n\n  const int num_query = sampling_loc.size(1);\n  const int num_point = sampling_loc.size(4);\n\n  const int im2col_step_ = std::min(batch, im2col_step);\n\n  AT_ASSERTM(batch % im2col_step_ == 0, \"batch(%d) must divide im2col_step(%d)\",\n             batch, im2col_step_);\n\n  const int batch_n = im2col_step_;\n  auto per_value_size = spatial_size * num_heads * channels;\n  auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;\n  auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;\n  auto grad_output_n = grad_output.view(\n      {batch / im2col_step_, batch_n, num_query, num_heads, channels});\n\n  for (int n = 0; n < batch / im2col_step_; ++n) {\n    auto grad_output_g = grad_output_n.select(0, n);\n    AT_DISPATCH_FLOATING_TYPES(\n        value.scalar_type(), \"ms_deform_attn_backward_musa\", ([&] {\n          ms_deformable_col2im_musa(\n              
c10::musa::getCurrentMUSAStream(),\n              grad_output_g.data_ptr<scalar_t>(),\n              value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,\n              spatial_shapes.data_ptr<int64_t>(),\n              level_start_index.data_ptr<int64_t>(),\n              sampling_loc.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_sample_loc_size,\n              attn_weight.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_attn_weight_size,\n              batch_n, spatial_size, num_heads, channels, num_levels, num_query,\n              num_point,\n              grad_value.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_value_size,\n              grad_sampling_loc.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_sample_loc_size,\n              grad_attn_weight.data_ptr<scalar_t>() +\n                  n * im2col_step_ * per_attn_weight_size);\n        }));\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/musabind.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\nvoid AssignScoreWithKForwardMUSAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &points, const Tensor &centers, const Tensor &scores,\n    const Tensor &knn_idx, Tensor &output);\n\nvoid AssignScoreWithKBackwardMUSAKernelLauncher(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &grad_out, const Tensor &points, const Tensor &centers,\n    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,\n    Tensor &grad_centers, Tensor &grad_scores);\n\nvoid assign_score_withk_forward_musa(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor &points,\n                                     const Tensor &centers,\n                                     const Tensor &scores,\n                                     const Tensor &knn_idx, Tensor &output) {\n  AssignScoreWithKForwardMUSAKernelLauncher(\n      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);\n};\n\nvoid assign_score_withk_backward_musa(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &grad_out, const Tensor &points, const Tensor &centers,\n    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,\n    Tensor &grad_centers, Tensor &grad_scores) {\n  AssignScoreWithKBackwardMUSAKernelLauncher(\n      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,\n      grad_points, grad_centers, grad_scores);\n};\n\nvoid assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor &points,\n                                     const Tensor &centers,\n                                     const Tensor &scores,\n                                     const Tensor &knn_idx, Tensor &output);\n\nvoid 
assign_score_withk_backward_impl(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor &grad_out, const Tensor &points, const Tensor &centers,\n    const Tensor &scores, const Tensor &knn_idx, Tensor &grad_points,\n    Tensor &grad_centers, Tensor &grad_scores);\n\nREGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, MUSA,\n                     assign_score_withk_forward_musa);\nREGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, MUSA,\n                     assign_score_withk_backward_musa);\n\nvoid BallQueryForwardMUSAKernelLauncher(int b, int n, int m, float min_radius,\n                                        float max_radius, int nsample,\n                                        const Tensor new_xyz, const Tensor xyz,\n                                        Tensor idx);\n\nvoid ball_query_forward_musa(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx) {\n  BallQueryForwardMUSAKernelLauncher(b, n, m, min_radius, max_radius, nsample,\n                                     new_xyz, xyz, idx);\n};\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx);\nREGISTER_DEVICE_IMPL(ball_query_forward_impl, MUSA, ball_query_forward_musa);\n\nvoid StackBallQueryForwardMUSAKernelLauncher(float max_radius, int nsample,\n                                             const Tensor new_xyz,\n                                             const Tensor new_xyz_batch_cnt,\n                                             const Tensor xyz,\n                                             const Tensor xyz_batch_cnt,\n                                             Tensor idx);\n\nvoid stack_ball_query_forward_musa(float 
max_radius, int nsample,\n                                   const Tensor new_xyz,\n                                   const Tensor new_xyz_batch_cnt,\n                                   const Tensor xyz, const Tensor xyz_batch_cnt,\n                                   Tensor idx) {\n  StackBallQueryForwardMUSAKernelLauncher(\n      max_radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);\n};\n\nvoid stack_ball_query_forward_impl(float max_radius, int nsample,\n                                   const Tensor new_xyz,\n                                   const Tensor new_xyz_batch_cnt,\n                                   const Tensor xyz, const Tensor xyz_batch_cnt,\n                                   Tensor idx);\nREGISTER_DEVICE_IMPL(stack_ball_query_forward_impl, MUSA,\n                     stack_ball_query_forward_musa);\n\nvoid BBoxOverlapsMUSAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,\n                                    Tensor ious, const int mode,\n                                    const bool aligned, const int offset);\n\nvoid bbox_overlaps_musa(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset) {\n  BBoxOverlapsMUSAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);\n}\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\nREGISTER_DEVICE_IMPL(bbox_overlaps_impl, MUSA, bbox_overlaps_musa);\n\nvoid BorderAlignForwardMUSAKernelLauncher(const Tensor &input,\n                                          const Tensor &boxes, Tensor output,\n                                          Tensor argmax_idx,\n                                          const int pool_size);\n\nvoid BorderAlignBackwardMUSAKernelLauncher(const Tensor &grad_output,\n                                           const Tensor &boxes,\n                             
              const Tensor &argmax_idx,\n                                           Tensor grad_input,\n                                           const int pool_size);\n\nvoid border_align_forward_musa(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size) {\n  BorderAlignForwardMUSAKernelLauncher(input, boxes, output, argmax_idx,\n                                       pool_size);\n}\n\nvoid border_align_backward_musa(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size) {\n  BorderAlignBackwardMUSAKernelLauncher(grad_output, boxes, argmax_idx,\n                                        grad_input, pool_size);\n}\n\nvoid border_align_forward_impl(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size);\n\nvoid border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,\n                                const Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size);\n\nREGISTER_DEVICE_IMPL(border_align_forward_impl, MUSA,\n                     border_align_forward_musa);\nREGISTER_DEVICE_IMPL(border_align_backward_impl, MUSA,\n                     border_align_backward_musa);\n\nvoid box_iou_rotated_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_rotated_impl, MUSA, box_iou_rotated_musa);\n\nvoid box_iou_quadri_musa(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool 
aligned);\n\nvoid box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned);\nREGISTER_DEVICE_IMPL(box_iou_quadri_impl, MUSA, box_iou_quadri_musa);\n\n#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))\n\nvoid CARAFEForwardMUSAKernelLauncher(const Tensor features, const Tensor masks,\n                                     Tensor rfeatures, Tensor routput,\n                                     Tensor rmasks, Tensor output,\n                                     const int kernel_size,\n                                     const int group_size,\n                                     const int scale_factor);\n\nvoid CARAFEBackwardMUSAKernelLauncher(\n    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,\n    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,\n    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,\n    const int kernel_size, const int group_size, const int scale_factor);\n\nvoid carafe_forward_musa(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor) {\n  CARAFEForwardMUSAKernelLauncher(features, masks, rfeatures, routput, rmasks,\n                                  output, kernel_size, group_size,\n                                  scale_factor);\n}\n\nvoid carafe_backward_musa(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor) {\n  CARAFEBackwardMUSAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,\n                                   rbottom_grad_hs, rbottom_grad, rmask_grad,\n                        
           bottom_grad, mask_grad, kernel_size,\n                                   group_size, scale_factor);\n}\n\nvoid carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,\n                         Tensor routput, Tensor rmasks, Tensor output,\n                         int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                          Tensor rtop_grad, Tensor rbottom_grad_hs,\n                          Tensor rbottom_grad, Tensor rmask_grad,\n                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,\n                          int group_size, int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_forward_impl, MUSA, carafe_forward_musa);\nREGISTER_DEVICE_IMPL(carafe_backward_impl, MUSA, carafe_backward_musa);\n#endif\n\nvoid CARAFENAIVEForwardMUSAKernelLauncher(const Tensor features,\n                                          const Tensor masks, Tensor output,\n                                          const int kernel_size,\n                                          const int group_size,\n                                          const int scale_factor);\n\nvoid CARAFENAIVEBackwardMUSAKernelLauncher(\n    const Tensor top_grad, const Tensor features, const Tensor masks,\n    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,\n    const int group_size, const int scale_factor);\n\nvoid carafe_naive_forward_musa(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor) {\n  CARAFENAIVEForwardMUSAKernelLauncher(features, masks, output, kernel_size,\n                                       group_size, scale_factor);\n}\n\nvoid carafe_naive_backward_musa(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int 
group_size,\n                                int scale_factor) {\n  CARAFENAIVEBackwardMUSAKernelLauncher(top_grad, features, masks, bottom_grad,\n                                        mask_grad, kernel_size, group_size,\n                                        scale_factor);\n}\nvoid carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,\n                               int kernel_size, int group_size,\n                               int scale_factor);\n\nvoid carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,\n                                Tensor bottom_grad, Tensor mask_grad,\n                                int kernel_size, int group_size,\n                                int scale_factor);\n\nREGISTER_DEVICE_IMPL(carafe_naive_forward_impl, MUSA,\n                     carafe_naive_forward_musa);\nREGISTER_DEVICE_IMPL(carafe_naive_backward_impl, MUSA,\n                     carafe_naive_backward_musa);\n\nvoid CorrelationForwardMUSAKernelLauncher(Tensor input1, Tensor input2,\n                                          Tensor output, int kH, int kW,\n                                          int patchH, int patchW, int padH,\n                                          int padW, int dilationH,\n                                          int dilationW, int dilation_patchH,\n                                          int dilation_patchW, int dH, int dW);\n\nvoid CorrelationBackwardMUSAKernelLauncher(Tensor grad_output, Tensor input1,\n                                           Tensor input2, Tensor grad_input1,\n                                           Tensor grad_input2, int kH, int kW,\n                                           int patchH, int patchW, int padH,\n                                           int padW, int dilationH,\n                                           int dilationW, int dilation_patchH,\n                                           int dilation_patchW, int dH, int dW);\n\nvoid 
correlation_forward_musa(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW) {\n  CorrelationForwardMUSAKernelLauncher(\n      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,\n      dilationW, dilation_patchH, dilation_patchW, dH, dW);\n}\n\nvoid correlation_backward_musa(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int dW) {\n  CorrelationBackwardMUSAKernelLauncher(\n      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,\n      patchW, padH, padW, dilationH, dilationW, dilation_patchH,\n      dilation_patchW, dH, dW);\n}\n\nvoid correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,\n                              int kH, int kW, int patchH, int patchW, int padH,\n                              int padW, int dilationH, int dilationW,\n                              int dilation_patchH, int dilation_patchW, int dH,\n                              int dW);\n\nvoid correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,\n                               Tensor grad_input1, Tensor grad_input2, int kH,\n                               int kW, int patchH, int patchW, int padH,\n                               int padW, int dilationH, int dilationW,\n                               int dilation_patchH, int dilation_patchW, int dH,\n                               int 
dW);\n\nREGISTER_DEVICE_IMPL(correlation_forward_impl, MUSA, correlation_forward_musa);\nREGISTER_DEVICE_IMPL(correlation_backward_impl, MUSA,\n                     correlation_backward_musa);\n\nvoid deformable_im2col_musa(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_musa(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_musa(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nvoid deformable_im2col_impl(Tensor data_im, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                    
        const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor data_col);\n\nvoid deformable_col2im_impl(Tensor data_col, Tensor data_offset,\n                            const int channels, const int height,\n                            const int width, const int ksize_h,\n                            const int ksize_w, const int pad_h, const int pad_w,\n                            const int stride_h, const int stride_w,\n                            const int dilation_h, const int dilation_w,\n                            const int parallel_imgs, const int deformable_group,\n                            Tensor grad_im);\n\nvoid deformable_col2im_coord_impl(\n    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,\n    const int height, const int width, const int ksize_h, const int ksize_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int parallel_imgs,\n    const int deformable_group, Tensor grad_offset);\n\nREGISTER_DEVICE_IMPL(deformable_im2col_impl, MUSA, deformable_im2col_musa);\nREGISTER_DEVICE_IMPL(deformable_col2im_impl, MUSA, deformable_col2im_musa);\nREGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, MUSA,\n                     deformable_col2im_coord_musa);\n\nvoid DeformRoIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,\n                                            Tensor offset, Tensor output,\n                                            int pooled_height, int pooled_width,\n                                            float spatial_scale,\n                                            int sampling_ratio, float gamma);\n\nvoid DeformRoIPoolBackwardMUSAKernelLauncher(\n    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,\n    Tensor grad_input, Tensor 
grad_offset, int pooled_height, int pooled_width,\n    float spatial_scale, int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_forward_musa(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  DeformRoIPoolForwardMUSAKernelLauncher(input, rois, offset, output,\n                                         pooled_height, pooled_width,\n                                         spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_backward_musa(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma) {\n  DeformRoIPoolBackwardMUSAKernelLauncher(\n      grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,\n      pooled_width, spatial_scale, sampling_ratio, gamma);\n}\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma);\n\nREGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, MUSA,\n         
            deform_roi_pool_forward_musa);\nREGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, MUSA,\n                     deform_roi_pool_backward_musa);\n\nvoid SigmoidFocalLossForwardMUSAKernelLauncher(Tensor input, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha);\n\nvoid SigmoidFocalLossBackwardMUSAKernelLauncher(Tensor input, Tensor target,\n                                                Tensor weight,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha);\n\nvoid SoftmaxFocalLossForwardMUSAKernelLauncher(Tensor softmax, Tensor target,\n                                               Tensor weight, Tensor output,\n                                               const float gamma,\n                                               const float alpha);\n\nvoid SoftmaxFocalLossBackwardMUSAKernelLauncher(Tensor softmax, Tensor target,\n                                                Tensor weight, Tensor buff,\n                                                Tensor grad_input,\n                                                const float gamma,\n                                                const float alpha);\n\nvoid sigmoid_focal_loss_forward_musa(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  SigmoidFocalLossForwardMUSAKernelLauncher(input, target, weight, output,\n                                            gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_backward_musa(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha) {\n  
SigmoidFocalLossBackwardMUSAKernelLauncher(input, target, weight, grad_input,\n                                             gamma, alpha);\n}\n\nvoid softmax_focal_loss_forward_musa(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha) {\n  SoftmaxFocalLossForwardMUSAKernelLauncher(input, target, weight, output,\n                                            gamma, alpha);\n}\n\nvoid softmax_focal_loss_backward_musa(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha) {\n  SoftmaxFocalLossBackwardMUSAKernelLauncher(input, target, weight, buff,\n                                             grad_input, gamma, alpha);\n}\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha);\n\nvoid softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid softmax_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha);\n\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MUSA,\n                     sigmoid_focal_loss_forward_musa);\nREGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MUSA,\n                     sigmoid_focal_loss_backward_musa);\nREGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, MUSA,\n                     
softmax_focal_loss_forward_musa);\nREGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, MUSA,\n                     softmax_focal_loss_backward_musa);\n\nvoid FurthestPointSamplingForwardMUSAKernelLauncher(int b, int n, int m,\n                                                    const float *dataset,\n                                                    float *temp, int *idxs);\n\nvoid FurthestPointSamplingWithDistForwardMUSAKernelLauncher(\n    int b, int n, int m, const float *dataset, float *temp, int *idxs);\n\nvoid furthest_point_sampling_forward_musa(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m) {\n  const float *dataset = points_tensor.data_ptr<float>();\n  float *temp = temp_tensor.data_ptr<float>();\n  int *idxs = idx_tensor.data_ptr<int>();\n  FurthestPointSamplingForwardMUSAKernelLauncher(b, n, m, dataset, temp, idxs);\n}\n\nvoid furthest_point_sampling_with_dist_forward_musa(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m) {\n  const float *dataset = points_tensor.data_ptr<float>();\n  float *temp = temp_tensor.data_ptr<float>();\n  int *idxs = idx_tensor.data_ptr<int>();\n  FurthestPointSamplingWithDistForwardMUSAKernelLauncher(b, n, m, dataset, temp,\n                                                         idxs);\n}\n\nvoid furthest_point_sampling_forward_impl(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m);\n\nvoid furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor 
idx_tensor, int b,\n                                                    int n, int m);\n\nREGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, MUSA,\n                     furthest_point_sampling_forward_musa);\nREGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, MUSA,\n                     furthest_point_sampling_with_dist_forward_musa);\n\ntorch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input,\n                                      const torch::Tensor &bias,\n                                      const torch::Tensor &refer, int act,\n                                      int grad, float alpha, float scale);\n\ntorch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor &input,\n                                           const torch::Tensor &bias,\n                                           const torch::Tensor &refer, int act,\n                                           int grad, float alpha, float scale);\nREGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, MUSA,\n                     fused_bias_leakyrelu_op);\n\ntorch::Tensor bias_act_op_impl(const torch::Tensor &input,\n                               const torch::Tensor &bias,\n                               const torch::Tensor &xref,\n                               const torch::Tensor &yref,\n                               const torch::Tensor &dy, int grad, int dim,\n                               int act, float alpha, float gain, float clamp);\n\ntorch::Tensor bias_act_op(const torch::Tensor &input, const torch::Tensor &bias,\n                          const torch::Tensor &xref, const torch::Tensor &yref,\n                          const torch::Tensor &dy, int grad, int dim, int act,\n                          float alpha, float gain, float clamp);\n\nREGISTER_DEVICE_IMPL(bias_act_op_impl, MUSA, bias_act_op);\n\ntorch::Tensor filtered_lrelu_act_op_impl(torch::Tensor x, torch::Tensor si,\n                                         int sx, int sy, float gain,\n                   
                      float slope, float clamp,\n                                         bool writeSigns);\n\ntorch::Tensor filtered_lrelu_act_op(torch::Tensor x, torch::Tensor si, int sx,\n                                    int sy, float gain, float slope,\n                                    float clamp, bool writeSigns);\n\nREGISTER_DEVICE_IMPL(filtered_lrelu_act_op_impl, MUSA, filtered_lrelu_act_op);\n\nvoid GatherPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                           const Tensor points,\n                                           const Tensor idx, Tensor out);\n\nvoid GatherPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                            const Tensor grad_out,\n                                            const Tensor idx,\n                                            Tensor grad_points);\n\nvoid gather_points_forward_musa(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out) {\n  GatherPointsForwardMUSAKernelLauncher(b, c, n, npoints, points, idx, out);\n};\n\nvoid gather_points_backward_musa(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points) {\n  GatherPointsBackwardMUSAKernelLauncher(b, c, n, npoints, grad_out, idx,\n                                         grad_points);\n};\n\nvoid gather_points_forward_impl(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out);\n\nvoid gather_points_backward_impl(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points);\n\nREGISTER_DEVICE_IMPL(gather_points_forward_impl, MUSA,\n                     
gather_points_forward_musa);\nREGISTER_DEVICE_IMPL(gather_points_backward_impl, MUSA,\n                     gather_points_backward_musa);\n\nvoid GroupPointsForwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                          int nsample, const Tensor points,\n                                          const Tensor idx, Tensor out);\n\nvoid GroupPointsBackwardMUSAKernelLauncher(int b, int c, int n, int npoints,\n                                           int nsample, const Tensor grad_out,\n                                           const Tensor idx,\n                                           Tensor grad_points);\n\nvoid group_points_forward_musa(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out) {\n  GroupPointsForwardMUSAKernelLauncher(b, c, n, npoints, nsample, points, idx,\n                                       out);\n};\n\nvoid group_points_backward_musa(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points) {\n  GroupPointsBackwardMUSAKernelLauncher(b, c, n, npoints, nsample, grad_out,\n                                        idx, grad_points);\n};\n\nvoid group_points_forward_impl(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out);\n\nvoid group_points_backward_impl(int b, int c, int n, int npoints, int nsample,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points);\n\nREGISTER_DEVICE_IMPL(group_points_forward_impl, MUSA,\n                     group_points_forward_musa);\nREGISTER_DEVICE_IMPL(group_points_backward_impl, MUSA,\n                     group_points_backward_musa);\n\nvoid 
StackGroupPointsForwardMUSAKernelLauncher(\n    int b, int c, int m, int nsample, const Tensor features_tensor,\n    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,\n    const Tensor idx_batch_cnt_tensor, Tensor out_tensor);\nvoid StackGroupPointsBackwardMUSAKernelLauncher(\n    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,\n    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,\n    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor);\n\nvoid stack_group_points_forward_musa(int b, int c, int m, int nsample,\n                                     const Tensor features_tensor,\n                                     const Tensor features_batch_cnt_tensor,\n                                     const Tensor idx_tensor,\n                                     const Tensor idx_batch_cnt_tensor,\n                                     Tensor out_tensor) {\n  StackGroupPointsForwardMUSAKernelLauncher(\n      b, c, m, nsample, features_tensor, features_batch_cnt_tensor, idx_tensor,\n      idx_batch_cnt_tensor, out_tensor);\n};\n\nvoid stack_group_points_backward_musa(int b, int c, int m, int n, int nsample,\n                                      const Tensor grad_out_tensor,\n                                      const Tensor idx_tensor,\n                                      const Tensor idx_batch_cnt_tensor,\n                                      const Tensor features_batch_cnt_tensor,\n                                      Tensor grad_features_tensor) {\n  StackGroupPointsBackwardMUSAKernelLauncher(\n      b, c, m, n, nsample, grad_out_tensor, idx_tensor, idx_batch_cnt_tensor,\n      features_batch_cnt_tensor, grad_features_tensor);\n};\n\nvoid stack_group_points_forward_impl(int b, int c, int m, int nsample,\n                                     const Tensor features_tensor,\n                                     const Tensor features_batch_cnt_tensor,\n                                     const Tensor 
idx_tensor,\n                                     const Tensor idx_batch_cnt_tensor,\n                                     Tensor out_tensor);\n\nvoid stack_group_points_backward_impl(int b, int c, int m, int n, int nsample,\n                                      const Tensor grad_out_tensor,\n                                      const Tensor idx_tensor,\n                                      const Tensor idx_batch_cnt_tensor,\n                                      const Tensor features_batch_cnt_tensor,\n                                      Tensor grad_features_tensor);\n\nREGISTER_DEVICE_IMPL(stack_group_points_forward_impl, MUSA,\n                     stack_group_points_forward_musa);\nREGISTER_DEVICE_IMPL(stack_group_points_backward_impl, MUSA,\n                     stack_group_points_backward_musa);\n\nvoid IoU3DBoxesOverlapBevForwardMUSAKernelLauncher(const int num_a,\n                                                   const Tensor boxes_a,\n                                                   const int num_b,\n                                                   const Tensor boxes_b,\n                                                   Tensor ans_overlap);\n\nvoid IoU3DNMS3DForwardMUSAKernelLauncher(const Tensor boxes, Tensor &keep,\n                                         Tensor &keep_num,\n                                         float nms_overlap_thresh);\n\nvoid IoU3DNMS3DNormalForwardMUSAKernelLauncher(const Tensor boxes, Tensor &keep,\n                                               Tensor &keep_num,\n                                               float nms_overlap_thresh);\n\nvoid iou3d_boxes_overlap_bev_forward_musa(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap) {\n  IoU3DBoxesOverlapBevForwardMUSAKernelLauncher(num_a, boxes_a, num_b, boxes_b,\n                                                ans_overlap);\n};\n\nvoid 
iou3d_nms3d_forward_musa(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh) {\n  IoU3DNMS3DForwardMUSAKernelLauncher(boxes, keep, keep_num,\n                                      nms_overlap_thresh);\n};\n\nvoid iou3d_nms3d_normal_forward_musa(const Tensor boxes, Tensor &keep,\n                                     Tensor &keep_num,\n                                     float nms_overlap_thresh) {\n  IoU3DNMS3DNormalForwardMUSAKernelLauncher(boxes, keep, keep_num,\n                                            nms_overlap_thresh);\n};\n\nvoid iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap);\n\nvoid iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,\n                              Tensor &keep_num, float nms_overlap_thresh);\n\nvoid iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,\n                                     Tensor &keep_num,\n                                     float nms_overlap_thresh);\n\nREGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, MUSA,\n                     iou3d_boxes_overlap_bev_forward_musa);\nREGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, MUSA, iou3d_nms3d_forward_musa);\nREGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, MUSA,\n                     iou3d_nms3d_normal_forward_musa);\n\nvoid KNNForwardMUSAKernelLauncher(int b, int n, int m, int nsample,\n                                  const Tensor xyz, const Tensor new_xyz,\n                                  Tensor idx, Tensor dist2);\n\nvoid knn_forward_musa(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2) {\n  KNNForwardMUSAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);\n}\n\nvoid knn_forward_impl(int b, int n, int m, int nsample, const 
Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2);\nREGISTER_DEVICE_IMPL(knn_forward_impl, MUSA, knn_forward_musa);\n\nvoid MaskedIm2colForwardMUSAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int kernel_h,\n                                           const int kernel_w, const int pad_h,\n                                           const int pad_w);\n\nvoid MaskedCol2imForwardMUSAKernelLauncher(const Tensor bottom_data,\n                                           const Tensor mask_h_idx,\n                                           const Tensor mask_w_idx,\n                                           Tensor top_data, const int height,\n                                           const int width, const int channels);\n\nvoid masked_im2col_forward_musa(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedIm2colForwardMUSAKernelLauncher(im, mask_h_idx, mask_w_idx, col,\n                                        kernel_h, kernel_w, pad_h, pad_w);\n}\n\nvoid masked_col2im_forward_musa(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels) {\n  // im: (n, ic, h, w), kernel size (kh, kw)\n  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)\n  MaskedCol2imForwardMUSAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,\n                                        width, channels);\n}\n\nvoid 
masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor col,\n                                const int kernel_h, const int kernel_w,\n                                const int pad_h, const int pad_w);\n\nvoid masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,\n                                const Tensor mask_w_idx, Tensor im, int height,\n                                int width, int channels);\n\nREGISTER_DEVICE_IMPL(masked_im2col_forward_impl, MUSA,\n                     masked_im2col_forward_musa);\nREGISTER_DEVICE_IMPL(masked_col2im_forward_impl, MUSA,\n                     masked_col2im_forward_musa);\n\nvoid modulated_deformable_im2col_musa(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col);\n\nvoid modulated_deformable_col2im_musa(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_musa(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    
const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nvoid modulated_deformable_im2col_impl(\n    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor data_col);\n\nvoid modulated_deformable_col2im_impl(\n    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,\n    const int batch_size, const int channels, const int height_im,\n    const int width_im, const int height_col, const int width_col,\n    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,\n    const int stride_h, const int stride_w, const int dilation_h,\n    const int dilation_w, const int deformable_group, Tensor grad_im);\n\nvoid modulated_deformable_col2im_coord_impl(\n    const Tensor data_col, const Tensor data_im, const Tensor data_offset,\n    const Tensor data_mask, const int batch_size, const int channels,\n    const int height_im, const int width_im, const int height_col,\n    const int width_col, const int kernel_h, const int kernel_w,\n    const int pad_h, const int pad_w, const int stride_h, const int stride_w,\n    const int dilation_h, const int dilation_w, const int deformable_group,\n    Tensor grad_offset, Tensor grad_mask);\n\nREGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, MUSA,\n                     modulated_deformable_im2col_musa);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, MUSA,\n                     modulated_deformable_col2im_musa);\nREGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, MUSA,\n                   
  modulated_deformable_col2im_coord_musa);\n\nTensor ms_deform_attn_musa_forward(const Tensor &value,\n                                   const Tensor &spatial_shapes,\n                                   const Tensor &level_start_index,\n                                   const Tensor &sampling_loc,\n                                   const Tensor &attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_musa_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);\n\nTensor ms_deform_attn_impl_forward(const Tensor &value,\n                                   const Tensor &spatial_shapes,\n                                   const Tensor &level_start_index,\n                                   const Tensor &sampling_loc,\n                                   const Tensor &attn_weight,\n                                   const int im2col_step);\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);\n\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, MUSA,\n                     ms_deform_attn_musa_forward);\nREGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, MUSA,\n                     ms_deform_attn_musa_backward);\n\nTensor NMSMUSAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,\n                             int offset);\n\nTensor nms_musa(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return NMSMUSAKernelLauncher(boxes, scores, iou_threshold, offset);\n}\n\nTensor nms_impl(Tensor boxes, Tensor scores, 
float iou_threshold, int offset);\nREGISTER_DEVICE_IMPL(nms_impl, MUSA, nms_musa);\n\nvoid PointsInBoxesPartForwardMUSAKernelLauncher(int batch_size, int boxes_num,\n                                                int pts_num, const Tensor boxes,\n                                                const Tensor pts,\n                                                Tensor box_idx_of_points);\n\nvoid PointsInBoxesAllForwardMUSAKernelLauncher(int batch_size, int boxes_num,\n                                               int pts_num, const Tensor boxes,\n                                               const Tensor pts,\n                                               Tensor box_idx_of_points);\n\nvoid points_in_boxes_part_forward_musa(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points) {\n  PointsInBoxesPartForwardMUSAKernelLauncher(batch_size, boxes_num, pts_num,\n                                             boxes, pts, box_idx_of_points);\n};\n\nvoid points_in_boxes_all_forward_musa(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points) {\n  PointsInBoxesAllForwardMUSAKernelLauncher(batch_size, boxes_num, pts_num,\n                                            boxes, pts, box_idx_of_points);\n};\n\nvoid points_in_boxes_part_forward_impl(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points);\n\nvoid points_in_boxes_all_forward_impl(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                 
     const Tensor pts,\n                                      Tensor box_idx_of_points);\nREGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, MUSA,\n                     points_in_boxes_part_forward_musa);\nREGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, MUSA,\n                     points_in_boxes_all_forward_musa);\n\nvoid PSAMaskForwardMUSAKernelLauncher(const int psa_type, const Tensor input,\n                                      Tensor output, const int num_,\n                                      const int h_feature, const int w_feature,\n                                      const int h_mask, const int w_mask,\n                                      const int half_h_mask,\n                                      const int half_w_mask);\n\nvoid PSAMaskBackwardMUSAKernelLauncher(\n    const int psa_type, const Tensor grad_output, Tensor grad_input,\n    const int num_, const int h_feature, const int w_feature, const int h_mask,\n    const int w_mask, const int half_h_mask, const int half_w_mask);\n\nvoid psamask_forward_musa(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask) {\n  PSAMaskForwardMUSAKernelLauncher(psa_type, input, output, num_, h_feature,\n                                   w_feature, h_mask, w_mask, half_h_mask,\n                                   half_w_mask);\n}\n\nvoid psamask_backward_musa(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask) {\n  PSAMaskBackwardMUSAKernelLauncher(psa_type, grad_output, grad_input, num_,\n      
                              h_feature, w_feature, h_mask, w_mask,\n                                    half_h_mask, half_w_mask);\n}\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask);\nREGISTER_DEVICE_IMPL(psamask_forward_impl, MUSA, psamask_forward_musa);\nREGISTER_DEVICE_IMPL(psamask_backward_impl, MUSA, psamask_backward_musa);\n\nvoid ROIAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned);\n\nvoid ROIAlignBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,\n                                        Tensor argmax_y, Tensor argmax_x,\n                                        Tensor grad_input, int aligned_height,\n                                        int aligned_width, float spatial_scale,\n                                        int sampling_ratio, int pool_mode,\n                                        bool aligned);\n\nvoid roi_align_forward_musa(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int 
aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  ROIAlignForwardMUSAKernelLauncher(\n      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,\n      spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_musa(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned) {\n  ROIAlignBackwardMUSAKernelLauncher(\n      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,\n      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n\nREGISTER_DEVICE_IMPL(roi_align_forward_impl, MUSA, roi_align_forward_musa);\nREGISTER_DEVICE_IMPL(roi_align_backward_impl, MUSA, roi_align_backward_musa);\n\nvoid ROIAlignRotatedForwardMUSAKernelLauncher(\n    const at::Tensor input, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int 
pooled_height, const int pooled_width, at::Tensor output);\n\nvoid ROIAlignRotatedBackwardMUSAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);\n\nvoid roi_align_rotated_forward_musa(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n\n  int num_channels = input.size(1);\n  int data_height = input.size(2);\n  int data_width = input.size(3);\n  ROIAlignRotatedForwardMUSAKernelLauncher(\n      input, rois, spatial_scale, sampling_ratio, aligned, clockwise,\n      num_channels, data_height, data_width, num_rois, aligned_height,\n      aligned_width, output);\n}\n\nvoid roi_align_rotated_backward_musa(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n\n  int num_channels = bottom_grad.size(1);\n  int data_height = bottom_grad.size(2);\n  int data_width = bottom_grad.size(3);\n  ROIAlignRotatedBackwardMUSAKernelLauncher(\n      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,\n      num_channels, data_height, 
data_width, num_rois, aligned_height,\n      aligned_width, bottom_grad);\n}\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise);\nREGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MUSA,\n                     roi_align_rotated_forward_musa);\nREGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MUSA,\n                     roi_align_rotated_backward_musa);\n\nvoid RiROIAlignRotatedForwardMUSAKernelLauncher(\n    const at::Tensor features, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor output);\n\nvoid RiROIAlignRotatedBackwardMUSAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor bottom_grad);\n\nvoid riroi_align_rotated_forward_musa(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int 
num_orientations,\n                                      bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  CHECK_CONTIGUOUS(features);\n  CHECK_CONTIGUOUS(rois);\n  int num_channels = features.size(1) / num_orientations;\n  int data_height = features.size(2);\n  int data_width = features.size(3);\n  RiROIAlignRotatedForwardMUSAKernelLauncher(\n      features, rois, spatial_scale, num_samples, clockwise, num_channels,\n      data_height, data_width, num_rois, pooled_height, pooled_width,\n      num_orientations, output);\n}\n\nvoid riroi_align_rotated_backward_musa(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise) {\n  // Number of ROIs\n  int num_rois = rois.size(0);\n  int size_rois = rois.size(1);\n  if (size_rois != 6) {\n    AT_ERROR(\"wrong roi size\");\n  }\n  CHECK_CONTIGUOUS(top_grad);\n  CHECK_CONTIGUOUS(rois);\n  int num_channels = bottom_grad.size(1) / num_orientations;\n  int data_height = bottom_grad.size(2);\n  int data_width = bottom_grad.size(3);\n  RiROIAlignRotatedBackwardMUSAKernelLauncher(\n      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,\n      data_height, data_width, num_rois, pooled_height, pooled_width,\n      num_orientations, bottom_grad);\n}\n\nvoid riroi_align_rotated_forward_impl(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int num_orientations,\n                                      bool clockwise);\n\nvoid 
riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise);\n\nREGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, MUSA,\n                     riroi_align_rotated_forward_musa);\nREGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, MUSA,\n                     riroi_align_rotated_backward_musa);\n\nvoid RoiawarePool3dForwardMUSAKernelLauncher(\n    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,\n    int out_y, int out_z, const Tensor rois, const Tensor pts,\n    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,\n    Tensor pooled_features, int pool_method);\n\nvoid RoiawarePool3dBackwardMUSAKernelLauncher(\n    int boxes_num, int out_x, int out_y, int out_z, int channels,\n    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,\n    const Tensor grad_out, Tensor grad_in, int pool_method);\n\nvoid roiaware_pool3d_forward_musa(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method) {\n  RoiawarePool3dForwardMUSAKernelLauncher(\n      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,\n      rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features,\n      pool_method);\n};\n\nvoid roiaware_pool3d_backward_musa(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n        
                           int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method) {\n  RoiawarePool3dBackwardMUSAKernelLauncher(\n      boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,\n      pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method);\n};\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method);\n\nvoid roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method);\n\nREGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, MUSA,\n                     roiaware_pool3d_forward_musa);\nREGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, MUSA,\n                     roiaware_pool3d_backward_musa);\n\nvoid RoIPointPool3dForwardMUSAKernelLauncher(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,\n    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);\n\nvoid roipoint_pool3d_forward_musa(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                        
          const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag) {\n  RoIPointPool3dForwardMUSAKernelLauncher(\n      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,\n      boxes3d, pts_feature, pooled_features, pooled_empty_flag);\n};\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag);\nREGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, MUSA,\n                     roipoint_pool3d_forward_musa);\n\nvoid ROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                      Tensor argmax, int pooled_height,\n                                      int pooled_width, float spatial_scale);\n\nvoid ROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,\n                                       Tensor argmax, Tensor grad_input,\n                                       int pooled_height, int pooled_width,\n                                       float spatial_scale);\n\nvoid roi_pool_forward_musa(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale) {\n  ROIPoolForwardMUSAKernelLauncher(input, rois, output, argmax, pooled_height,\n                                   pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_musa(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            
int pooled_width, float spatial_scale) {\n  ROIPoolBackwardMUSAKernelLauncher(grad_output, rois, argmax, grad_input,\n                                    pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale);\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale);\nREGISTER_DEVICE_IMPL(roi_pool_forward_impl, MUSA, roi_pool_forward_musa);\nREGISTER_DEVICE_IMPL(roi_pool_backward_impl, MUSA, roi_pool_backward_musa);\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\n\nstd::vector<at::Tensor> DynamicPointToVoxelForwardMUSAKernelLauncher(\n    const at::Tensor &feats, const at::Tensor &coors,\n    const reduce_t reduce_type);\n\nvoid DynamicPointToVoxelBackwardMUSAKernelLauncher(\n    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,\n    const at::Tensor &feats, const at::Tensor &reduced_feats,\n    const at::Tensor &coors_map, const at::Tensor &reduce_count,\n    const reduce_t reduce_type);\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_musa(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const reduce_t reduce_type) {\n  return DynamicPointToVoxelForwardMUSAKernelLauncher(feats, coors,\n                                                      reduce_type);\n};\n\nvoid dynamic_point_to_voxel_backward_musa(\n    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,\n    const torch::Tensor &feats, const torch::Tensor &reduced_feats,\n    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,\n    const reduce_t reduce_type) {\n  DynamicPointToVoxelBackwardMUSAKernelLauncher(grad_feats, grad_reduced_feats,\n                                             
   feats, reduced_feats, coors_idx,\n                                                reduce_count, reduce_type);\n};\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const reduce_t reduce_type);\n\nvoid dynamic_point_to_voxel_backward_impl(\n    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,\n    const torch::Tensor &feats, const torch::Tensor &reduced_feats,\n    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,\n    const reduce_t reduce_type);\n\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, MUSA,\n                     dynamic_point_to_voxel_forward_musa);\nREGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, MUSA,\n                     dynamic_point_to_voxel_backward_musa);\n\nvoid SyncBNForwardMeanMUSAKernelLauncher(const Tensor input, Tensor mean);\n\nvoid SyncBNForwardVarMUSAKernelLauncher(const Tensor input, const Tensor mean,\n                                        Tensor var);\n\nvoid SyncBNForwardOutputMUSAKernelLauncher(\n    const Tensor input, const Tensor mean, const Tensor var,\n    Tensor running_mean, Tensor running_var, const Tensor weight,\n    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,\n    float momentum, int group_size);\n\nvoid SyncBNBackwardParamMUSAKernelLauncher(const Tensor grad_output,\n                                           const Tensor norm,\n                                           Tensor grad_weight,\n                                           Tensor grad_bias);\n\nvoid SyncBNBackwardDataMUSAKernelLauncher(const Tensor grad_output,\n                                          const Tensor weight,\n                                          const Tensor grad_weight,\n                                          const Tensor grad_bias,\n                                          const Tensor norm, const Tensor std,\n                                          Tensor 
grad_input);\n\nvoid sync_bn_forward_mean_musa(const Tensor input, Tensor mean) {\n  SyncBNForwardMeanMUSAKernelLauncher(input, mean);\n}\n\nvoid sync_bn_forward_var_musa(const Tensor input, const Tensor mean,\n                              Tensor var) {\n  SyncBNForwardVarMUSAKernelLauncher(input, mean, var);\n}\n\nvoid sync_bn_forward_output_musa(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size) {\n  SyncBNForwardOutputMUSAKernelLauncher(input, mean, var, running_mean,\n                                        running_var, weight, bias, norm, std,\n                                        output, eps, momentum, group_size);\n}\n\nvoid sync_bn_backward_param_musa(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias) {\n  SyncBNBackwardParamMUSAKernelLauncher(grad_output, norm, grad_weight,\n                                        grad_bias);\n}\n\nvoid sync_bn_backward_data_musa(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input) {\n  SyncBNBackwardDataMUSAKernelLauncher(grad_output, weight, grad_weight,\n                                       grad_bias, norm, std, grad_input);\n}\n\nvoid sync_bn_forward_mean_impl(const Tensor input, Tensor mean);\n\nvoid sync_bn_forward_var_impl(const Tensor input, const Tensor mean,\n                              Tensor var);\n\nvoid sync_bn_forward_output_impl(const Tensor input, const Tensor mean,\n                                 const 
Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size);\n\nvoid sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias);\n\nvoid sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input);\n\nREGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, MUSA,\n                     sync_bn_forward_mean_musa);\nREGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, MUSA, sync_bn_forward_var_musa);\nREGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, MUSA,\n                     sync_bn_forward_output_musa);\nREGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, MUSA,\n                     sync_bn_backward_param_musa);\nREGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, MUSA,\n                     sync_bn_backward_data_musa);\n\nvoid ThreeInterpolateForwardMUSAKernelLauncher(int b, int c, int m, int n,\n                                               const Tensor points,\n                                               const Tensor idx,\n                                               const Tensor weight, Tensor out);\n\nvoid ThreeInterpolateBackwardMUSAKernelLauncher(int b, int c, int n, int m,\n                                                const Tensor grad_out,\n                                                const Tensor idx,\n                                                const Tensor weight,\n                                                Tensor grad_points);\n\nvoid three_interpolate_forward_musa(int b, int c, int m, int n,\n           
                         const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out) {\n  ThreeInterpolateForwardMUSAKernelLauncher(b, c, m, n, points, idx, weight,\n                                            out);\n};\n\nvoid three_interpolate_backward_musa(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points) {\n  ThreeInterpolateBackwardMUSAKernelLauncher(b, c, n, m, grad_out, idx, weight,\n                                             grad_points);\n};\n\nvoid three_interpolate_forward_impl(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out);\n\nvoid three_interpolate_backward_impl(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points);\nREGISTER_DEVICE_IMPL(three_interpolate_forward_impl, MUSA,\n                     three_interpolate_forward_musa);\nREGISTER_DEVICE_IMPL(three_interpolate_backward_impl, MUSA,\n                     three_interpolate_backward_musa);\n\nvoid ThreeNNForwardMUSAKernelLauncher(int b, int n, int m, const Tensor unknown,\n                                      const Tensor known, Tensor dist2,\n                                      Tensor idx);\n\nvoid three_nn_forward_musa(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx) {\n  ThreeNNForwardMUSAKernelLauncher(b, n, m, unknown, known, dist2, idx);\n};\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx);\nREGISTER_DEVICE_IMPL(three_nn_forward_impl, MUSA, three_nn_forward_musa);\n\nvoid 
TINShiftForwardMUSAKernelLauncher(Tensor input, Tensor shift,\n                                       Tensor output);\n\nvoid TINShiftBackwardMUSAKernelLauncher(Tensor grad_output, Tensor shift,\n                                        Tensor grad_input);\n\nvoid tin_shift_forward_musa(Tensor input, Tensor shift, Tensor output) {\n  TINShiftForwardMUSAKernelLauncher(input, shift, output);\n}\n\nvoid tin_shift_backward_musa(Tensor grad_output, Tensor shift,\n                             Tensor grad_input) {\n  TINShiftBackwardMUSAKernelLauncher(grad_output, shift, grad_input);\n}\n\nvoid tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);\nvoid tin_shift_backward_impl(Tensor grad_output, Tensor shift,\n                             Tensor grad_input);\nREGISTER_DEVICE_IMPL(tin_shift_forward_impl, MUSA, tin_shift_forward_musa);\nREGISTER_DEVICE_IMPL(tin_shift_backward_impl, MUSA, tin_shift_backward_musa);\n\n#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))\ntorch::Tensor upfirdn2d_op(torch::Tensor input, torch::Tensor filter, int upx,\n                           int upy, int downx, int downy, int padx0, int padx1,\n                           int pady0, int pady1, bool flip, float gain);\n\ntorch::Tensor upfirdn2d_op_impl(torch::Tensor input, torch::Tensor filter,\n                                int upx, int upy, int downx, int downy,\n                                int padx0, int padx1, int pady0, int pady1,\n                                bool flip, float gain);\nREGISTER_DEVICE_IMPL(upfirdn2d_op_impl, MUSA, upfirdn2d_op);\n#endif\n\nint HardVoxelizeForwardMUSAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3);\n\nint NondeterministicHardVoxelizeForwardMUSAKernelLauncher(\n    const at::Tensor &points, 
at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3);\n\nvoid DynamicVoxelizeForwardMUSAKernelLauncher(\n    const at::Tensor &points, at::Tensor &coors,\n    const std::vector<float> voxel_size, const std::vector<float> coors_range,\n    const int NDim = 3);\n\nint hard_voxelize_forward_musa(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim) {\n  return HardVoxelizeForwardMUSAKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n};\n\nint nondeterministic_hard_voxelize_forward_musa(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim) {\n  return NondeterministicHardVoxelizeForwardMUSAKernelLauncher(\n      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,\n      max_points, max_voxels, NDim);\n};\n\nvoid dynamic_voxelize_forward_musa(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim) {\n  DynamicVoxelizeForwardMUSAKernelLauncher(points, coors, voxel_size,\n                                           coors_range, NDim);\n};\n\nint 
hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim);\n\nint nondeterministic_hard_voxelize_forward_impl(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim);\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim);\n\nREGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, MUSA,\n                     hard_voxelize_forward_musa);\nREGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, MUSA,\n                     nondeterministic_hard_voxelize_forward_musa);\nREGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, MUSA,\n                     dynamic_voxelize_forward_musa);\n\nvoid RotatedFeatureAlignForwardMUSAKernelLauncher(const Tensor features,\n                                                  const Tensor best_bboxes,\n                                                  const float spatial_scale,\n                                                  const int points,\n                                                  Tensor output);\n\nvoid RotatedFeatureAlignBackwardMUSAKernelLauncher(const Tensor top_grad,\n                                                   const Tensor best_bboxes,\n                                                   const float 
spatial_scale,\n                                                   const int points,\n                                                   Tensor bottom_grad);\n\nvoid rotated_feature_align_forward_musa(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output) {\n  RotatedFeatureAlignForwardMUSAKernelLauncher(features, best_bboxes,\n                                               spatial_scale, points, output);\n};\n\nvoid rotated_feature_align_backward_musa(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad) {\n  RotatedFeatureAlignBackwardMUSAKernelLauncher(\n      top_grad, best_bboxes, spatial_scale, points, bottom_grad);\n};\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output);\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad);\n\nREGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, MUSA,\n                     rotated_feature_align_forward_musa);\nREGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, MUSA,\n                     rotated_feature_align_backward_musa);\n\nvoid PointsInPolygonsForwardMUSAKernelLauncher(const at::Tensor points,\n                                               const at::Tensor polygons,\n                                         
      const int rows, const int cols,\n                                               at::Tensor output);\n\nvoid points_in_polygons_forward_musa(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols) {\n  PointsInPolygonsForwardMUSAKernelLauncher(points, polygons, rows, cols,\n                                            output);\n};\n\nvoid points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols);\n\nREGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, MUSA,\n                     points_in_polygons_forward_musa);\n\ntorch::Tensor IndiceMaxpoolForwardMUSAKernelLauncher(torch::Tensor features,\n                                                     torch::Tensor indicePairs,\n                                                     torch::Tensor indiceNum,\n                                                     int64_t numAct);\n\ntorch::Tensor indice_maxpool_forward_musa(torch::Tensor features,\n                                          torch::Tensor indicePairs,\n                                          torch::Tensor indiceNum,\n                                          int64_t numAct) {\n  return IndiceMaxpoolForwardMUSAKernelLauncher(features, indicePairs,\n                                                indiceNum, numAct);\n};\n\ntorch::Tensor indice_maxpool_forward_impl(torch::Tensor features,\n                                          torch::Tensor indicePairs,\n                                          torch::Tensor indiceNum,\n                                          int64_t numAct);\nREGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, MUSA,\n                     indice_maxpool_forward_musa);\n\ntorch::Tensor IndiceMaxpoolBackwardMUSAKernelLauncher(torch::Tensor features,\n                                           
           torch::Tensor outFeatures,\n                                                      torch::Tensor outGrad,\n                                                      torch::Tensor indicePairs,\n                                                      torch::Tensor indiceNum);\n\ntorch::Tensor indice_maxpool_backward_musa(torch::Tensor features,\n                                           torch::Tensor outFeatures,\n                                           torch::Tensor outGrad,\n                                           torch::Tensor indicePairs,\n                                           torch::Tensor indiceNum) {\n  return IndiceMaxpoolBackwardMUSAKernelLauncher(features, outFeatures, outGrad,\n                                                 indicePairs, indiceNum);\n};\n\ntorch::Tensor indice_maxpool_backward_impl(torch::Tensor features,\n                                           torch::Tensor outFeatures,\n                                           torch::Tensor outGrad,\n                                           torch::Tensor indicePairs,\n                                           torch::Tensor indiceNum);\n\nREGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, MUSA,\n                     indice_maxpool_backward_musa)\n\ntorch::Tensor IndiceConvForwardMUSAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,\n    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,\n    int64_t _subM);\n\ntorch::Tensor indice_conv_forward_musa(torch::Tensor features,\n                                       torch::Tensor filters,\n                                       torch::Tensor indicePairs,\n                                       torch::Tensor indiceNum,\n                                       int64_t numActOut, int64_t _inverse,\n                                       int64_t _subM) {\n  return IndiceConvForwardMUSAKernelLauncher(\n      features, filters, indicePairs, indiceNum, numActOut, _inverse, 
_subM);\n};\n\ntorch::Tensor indice_conv_forward_impl(torch::Tensor features,\n                                       torch::Tensor filters,\n                                       torch::Tensor indicePairs,\n                                       torch::Tensor indiceNum,\n                                       int64_t numActOut, int64_t _inverse,\n                                       int64_t _subM);\n\nREGISTER_DEVICE_IMPL(indice_conv_forward_impl, MUSA, indice_conv_forward_musa);\n\nstd::vector<torch::Tensor> IndiceConvBackwardMUSAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM);\n\nstd::vector<torch::Tensor> indice_conv_backward_musa(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  return IndiceConvBackwardMUSAKernelLauncher(\n      features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);\n};\n\nstd::vector<torch::Tensor> indice_conv_backward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM);\n\nREGISTER_DEVICE_IMPL(indice_conv_backward_impl, MUSA,\n                     indice_conv_backward_musa);\n\ntorch::Tensor FusedIndiceConvBatchnormMUSAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM);\n\ntorch::Tensor fused_indice_conv_batchnorm_forward_musa(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM) {\n  return FusedIndiceConvBatchnormMUSAKernelLauncher(features, filters, bias,\n     
                                               indicePairs, indiceNum,\n                                                    numActOut, _inverse, _subM);\n};\n\ntorch::Tensor fused_indice_conv_batchnorm_forward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,\n    int64_t _inverse, int64_t _subM);\n\nREGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, MUSA,\n                     fused_indice_conv_batchnorm_forward_musa)\n\nvoid MinAreaPolygonsMUSAKernelLauncher(const Tensor pointsets, Tensor polygons);\n\nvoid min_area_polygons_musa(const Tensor pointsets, Tensor polygons) {\n  MinAreaPolygonsMUSAKernelLauncher(pointsets, polygons);\n}\n\nvoid min_area_polygons_impl(const Tensor pointsets, Tensor polygons);\n\nREGISTER_DEVICE_IMPL(min_area_polygons_impl, MUSA, min_area_polygons_musa);\n\nvoid ActiveRotatedFilterForwardMUSAKernelLauncher(const Tensor input,\n                                                  const Tensor indices,\n                                                  Tensor output);\n\nvoid ActiveRotatedFilterBackwardMUSAKernelLauncher(const Tensor grad_out,\n                                                   const Tensor indices,\n                                                   Tensor grad_in);\n\nvoid active_rotated_filter_forward_musa(const Tensor input,\n                                        const Tensor indices, Tensor output) {\n  ActiveRotatedFilterForwardMUSAKernelLauncher(input, indices, output);\n};\n\nvoid active_rotated_filter_backward_musa(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in) {\n  ActiveRotatedFilterBackwardMUSAKernelLauncher(grad_out, indices, grad_in);\n};\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output);\n\nvoid active_rotated_filter_backward_impl(const Tensor 
grad_out,\n                                         const Tensor indices, Tensor grad_in);\n\nREGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, MUSA,\n                     active_rotated_filter_forward_musa);\nREGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, MUSA,\n                     active_rotated_filter_backward_musa);\n\nvoid ConvexIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                 Tensor ious);\n\nvoid ConvexGIoUMUSAKernelLauncher(const Tensor pointsets, const Tensor polygons,\n                                  Tensor output);\n\nvoid convex_iou_musa(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious) {\n  ConvexIoUMUSAKernelLauncher(pointsets, polygons, ious);\n}\n\nvoid convex_giou_musa(const Tensor pointsets, const Tensor polygons,\n                      Tensor output) {\n  ConvexGIoUMUSAKernelLauncher(pointsets, polygons, output);\n}\n\nvoid convex_iou_impl(const Tensor pointsets, const Tensor polygons,\n                     Tensor ious);\n\nvoid convex_giou_impl(const Tensor pointsets, const Tensor polygons,\n                      Tensor output);\n\nREGISTER_DEVICE_IMPL(convex_iou_impl, MUSA, convex_iou_musa);\nREGISTER_DEVICE_IMPL(convex_giou_impl, MUSA, convex_giou_musa);\n\nTensor DiffIoURotatedSortVerticesMUSAKernelLauncher(Tensor vertices,\n                                                    Tensor mask,\n                                                    Tensor num_valid);\n\nTensor diff_iou_rotated_sort_vertices_forward_musa(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid) {\n  return DiffIoURotatedSortVerticesMUSAKernelLauncher(vertices, mask,\n                                                      num_valid);\n}\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor 
num_valid);\n\nREGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, MUSA,\n                     diff_iou_rotated_sort_vertices_forward_musa);\n\n#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))\nvoid ChamferDistanceForwardMUSAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,\n    const Tensor dist2, const Tensor idx1, const Tensor idx2);\n#endif\n\nvoid ChamferDistanceBackwardMUSAKernelLauncher(\n    const Tensor xyz1, const Tensor xyz2, Tensor idx1, Tensor idx2,\n    Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, Tensor grad_xyz2);\n\n#if ((!defined(MUSA_ARCH)) || (defined(MUSA_ARCH)) && (MUSA_ARCH > 21))\nvoid chamfer_distance_forward_musa(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2) {\n  ChamferDistanceForwardMUSAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,\n                                           idx2);\n};\n\nvoid chamfer_distance_backward_musa(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor gradxyz2) {\n  ChamferDistanceBackwardMUSAKernelLauncher(xyz1, xyz2, idx1, idx2, graddist1,\n                                            graddist2, gradxyz1, gradxyz2);\n};\n\nvoid chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,\n                                   const Tensor dist1, const Tensor dist2,\n                                   const Tensor idx1, const Tensor idx2);\n\nvoid chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,\n                                    Tensor idx1, Tensor idx2, Tensor graddist1,\n                                    Tensor graddist2, Tensor gradxyz1,\n                                    Tensor 
gradxyz2);\n\nREGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, MUSA,\n                     chamfer_distance_forward_musa);\nREGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, MUSA,\n                     chamfer_distance_backward_musa);\n#endif\n\nvoid PrROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,\n                                        Tensor output, int pooled_height,\n                                        int pooled_width, float spatial_scale);\n\nvoid PrROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,\n                                         Tensor grad_input, int pooled_height,\n                                         int pooled_width, float spatial_scale);\n\nvoid PrROIPoolCoorBackwardMUSAKernelLauncher(\n    Tensor output, Tensor grad_output, Tensor input, Tensor rois,\n    Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);\n\nvoid prroi_pool_forward_musa(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale) {\n  PrROIPoolForwardMUSAKernelLauncher(input, rois, output, pooled_height,\n                                     pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_backward_musa(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  PrROIPoolBackwardMUSAKernelLauncher(grad_output, rois, grad_input,\n                                      pooled_height, pooled_width,\n                                      spatial_scale);\n}\n\nvoid prroi_pool_coor_backward_musa(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale) {\n  PrROIPoolCoorBackwardMUSAKernelLauncher(output, 
grad_output, input, rois,\n                                          grad_rois, pooled_height,\n                                          pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale);\nvoid prroi_pool_backward_impl(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale);\nvoid prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale);\nREGISTER_DEVICE_IMPL(prroi_pool_forward_impl, MUSA, prroi_pool_forward_musa);\nREGISTER_DEVICE_IMPL(prroi_pool_backward_impl, MUSA, prroi_pool_backward_musa);\nREGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, MUSA,\n                     prroi_pool_coor_backward_musa);\n\nvoid BezierAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois,\n                                          Tensor output, int aligned_height,\n                                          int aligned_width,\n                                          float spatial_scale,\n                                          int sampling_ratio, bool aligned);\n\nvoid BezierAlignBackwardMUSAKernelLauncher(\n    Tensor grad_output, Tensor rois, Tensor grad_input, int aligned_height,\n    int aligned_width, float spatial_scale, int sampling_ratio, bool aligned);\n\nvoid bezier_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                               int aligned_height, int aligned_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned);\n\nvoid bezier_align_backward_impl(Tensor 
grad_output, Tensor rois,\n                                Tensor grad_input, int aligned_height,\n                                int aligned_width, float spatial_scale,\n                                int sampling_ratio, bool aligned);\n\nREGISTER_DEVICE_IMPL(bezier_align_forward_impl, MUSA,\n                     BezierAlignForwardMUSAKernelLauncher);\nREGISTER_DEVICE_IMPL(bezier_align_backward_impl, MUSA,\n                     BezierAlignBackwardMUSAKernelLauncher);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/nms_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"nms_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nTensor NMSMUSAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,\n                             int offset) {\n  c10::musa::MUSAGuard device_guard(boxes.device());\n\n  if (boxes.numel() == 0) {\n    return at::empty({0}, boxes.options().dtype(at::kLong));\n  }\n  auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));\n  auto boxes_sorted = boxes.index_select(0, order_t);\n\n  int boxes_num = boxes.size(0);\n  const int col_blocks = (boxes_num + threadsPerBlock - 1) / threadsPerBlock;\n  const int col_blocks_alloc = GET_BLOCKS(boxes_num, threadsPerBlock);\n  Tensor mask =\n      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));\n  dim3 blocks(col_blocks_alloc, col_blocks_alloc);\n  dim3 threads(threadsPerBlock);\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  nms_musa<<<blocks, threads, 0, stream>>>(\n      boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),\n      (unsigned long long*)mask.data_ptr<int64_t>());\n\n  // Filter the boxes which should be kept.\n  at::Tensor keep_t = at::zeros(\n      {boxes_num}, boxes.options().dtype(at::kBool).device(::at::musa::kMUSA));\n  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),\n                          col_blocks * sizeof(unsigned long long), stream>>>(\n      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),\n      boxes_num);\n  AT_MUSA_CHECK(musaGetLastError());\n  return order_t.masked_select(keep_t);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/nms_quadri_musa.mu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"nms_quadri_musa.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nTensor nms_quadri_musa(const Tensor dets, const Tensor scores,\n                       const Tensor order_t, const Tensor dets_sorted,\n                       float iou_threshold, const int multi_label) {\n  // using scalar_t = float;\n  AT_ASSERTM(dets.is_privateuseone(), \"dets must be a MUSA tensor\");\n  AT_ASSERTM(scores.is_privateuseone(), \"scores must be a MUSA tensor\");\n  c10::musa::MUSAGuard device_guard(dets.device());\n\n  int dets_num = dets.size(0);\n\n  const int col_blocks = at::musa::ATenCeilDiv(dets_num, threadsPerBlock);\n\n  Tensor mask =\n      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));\n\n  dim3 blocks(col_blocks, col_blocks);\n  dim3 threads(threadsPerBlock);\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  AT_DISPATCH_FLOATING_TYPES(\n      dets_sorted.scalar_type(), \"nms_quadri_kernel_musa\", [&] {\n        nms_quadri_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),\n            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);\n      });\n\n  Tensor mask_cpu = mask.to(at::kCPU);\n  unsigned long long* mask_host =\n      (unsigned long long*)mask_cpu.data_ptr<int64_t>();\n\n  std::vector<unsigned long long> remv(col_blocks);\n  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);\n\n  Tensor keep =\n      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));\n  int64_t* keep_out = keep.data_ptr<int64_t>();\n\n  int num_to_keep = 0;\n  for (int i = 0; i < dets_num; i++) {\n    int nblock = i / threadsPerBlock;\n    int inblock = i % threadsPerBlock;\n\n    if (!(remv[nblock] & (1ULL << inblock))) {\n      keep_out[num_to_keep++] = i;\n      unsigned long long* p = mask_host + i * col_blocks;\n      for (int j = nblock; j < 
col_blocks; j++) {\n        remv[j] |= p[j];\n      }\n    }\n  }\n\n  AT_MUSA_CHECK(musaGetLastError());\n  return order_t.index(\n      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)\n           .to(order_t.device(), keep.scalar_type())});\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/nms_rotated_musa.mu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_musa.cu\n#include \"nms_rotated_musa.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nTensor nms_rotated_musa(const Tensor dets, const Tensor scores,\n                        const Tensor order_t, const Tensor dets_sorted,\n                        float iou_threshold, const int multi_label) {\n  // using scalar_t = float;\n  AT_ASSERTM(dets.is_privateuseone(), \"dets must be a MUSA tensor\");\n  AT_ASSERTM(scores.is_privateuseone(), \"scores must be a MUSA tensor\");\n  c10::musa::MUSAGuard device_guard(dets.device());\n\n  int dets_num = dets.size(0);\n\n  const int col_blocks = at::musa::ATenCeilDiv(dets_num, threadsPerBlock);\n\n  Tensor mask =\n      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));\n\n  dim3 blocks(col_blocks, col_blocks);\n  dim3 threads(threadsPerBlock);\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  AT_DISPATCH_FLOATING_TYPES(\n      dets_sorted.scalar_type(), \"nms_rotated_kernel_musa\", [&] {\n        nms_rotated_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            dets_num, iou_threshold, dets_sorted.data_ptr<scalar_t>(),\n            (unsigned long long*)mask.data_ptr<int64_t>(), multi_label);\n      });\n\n  Tensor mask_cpu = mask.to(at::kCPU);\n  unsigned long long* mask_host =\n      (unsigned long long*)mask_cpu.data_ptr<int64_t>();\n\n  std::vector<unsigned long long> remv(col_blocks);\n  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);\n\n  Tensor keep =\n      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));\n  int64_t* keep_out = keep.data_ptr<int64_t>();\n\n  int num_to_keep = 0;\n  for (int i = 0; i < dets_num; i++) {\n    int nblock = i / threadsPerBlock;\n    int inblock = i % threadsPerBlock;\n\n    if (!(remv[nblock] & (1ULL 
<< inblock))) {\n      keep_out[num_to_keep++] = i;\n      unsigned long long* p = mask_host + i * col_blocks;\n      for (int j = nblock; j < col_blocks; j++) {\n        remv[j] |= p[j];\n      }\n    }\n  }\n\n  AT_MUSA_CHECK(musaGetLastError());\n  return order_t.index(\n      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)\n           .to(order_t.device(), keep.scalar_type())});\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/points_in_boxes_musa.mu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu\n// Written by Shaoshuai Shi\n// All Rights Reserved 2019.\n\n#include <stdio.h>\n\n#include \"points_in_boxes_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid PointsInBoxesPartForwardMUSAKernelLauncher(int batch_size, int boxes_num,\n                                                int pts_num, const Tensor boxes,\n                                                const Tensor pts,\n                                                Tensor box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is\n  // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x,\n  // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default\n  // -1\n\n  c10::musa::MUSAGuard device_guard(boxes.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      boxes.scalar_type(), \"points_in_boxes_part_forward_musa_kernel\", [&] {\n        points_in_boxes_part_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                batch_size, boxes_num, pts_num, boxes.data_ptr<scalar_t>(),\n                pts.data_ptr<scalar_t>(), box_idx_of_points.data_ptr<int>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid PointsInBoxesAllForwardMUSAKernelLauncher(int batch_size, int boxes_num,\n                                               int pts_num, const Tensor boxes,\n                                               const Tensor pts,\n                                               Tensor box_idx_of_points) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box params pts: (B, npoints, 
3)\n  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),\n  // default -1\n\n  c10::musa::MUSAGuard device_guard(boxes.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      boxes.scalar_type(), \"points_in_boxes_all_forward_musa_kernel\", [&] {\n        points_in_boxes_all_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                batch_size, boxes_num, pts_num, boxes.data_ptr<scalar_t>(),\n                pts.data_ptr<scalar_t>(), box_idx_of_points.data_ptr<int>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/points_in_polygons_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/ming71/MUSA/blob/master/point_justify/points_justify_kernel.cu\n\n#include <stdio.h>\n\n#include \"points_in_polygons_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid PointsInPolygonsForwardMUSAKernelLauncher(const at::Tensor points,\n                                               const at::Tensor polygons,\n                                               const int rows, const int cols,\n                                               at::Tensor output) {\n  const int output_size = rows * cols;\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      points.scalar_type(), \"points_in_polygons_forward_musa_kernel\", ([&] {\n        const scalar_t *vertex1 = points.data_ptr<scalar_t>();\n        const scalar_t *vertex2 = polygons.data_ptr<scalar_t>();\n        scalar_t *inside_flag = output.data_ptr<scalar_t>();\n\n        points_in_polygons_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, vertex1, vertex2, rows, cols, inside_flag);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/prroi_pool_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"prroi_pool_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid PrROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois,\n                                        Tensor output, int pooled_height,\n                                        int pooled_width, float spatial_scale) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  prroi_pool_forward_musa_kernel<float>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          output_size, input.data_ptr<float>(), rois.data_ptr<float>(),\n          output.data_ptr<float>(), pooled_height, pooled_width,\n          static_cast<float>(spatial_scale), channels, height, width);\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid PrROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,\n                                         Tensor grad_input, int pooled_height,\n                                         int pooled_width,\n                                         float spatial_scale) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  prroi_pool_backward_musa_kernel<float>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          output_size, grad_output.data_ptr<float>(), rois.data_ptr<float>(),\n          grad_input.data_ptr<float>(), pooled_height, pooled_width,\n          static_cast<float>(spatial_scale), channels, height, width);\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid PrROIPoolCoorBackwardMUSAKernelLauncher(Tensor output, Tensor grad_output,\n       
                                      Tensor input, Tensor rois,\n                                             Tensor grad_rois,\n                                             int pooled_height,\n                                             int pooled_width,\n                                             float spatial_scale) {\n  int output_size = grad_output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  prroi_pool_coor_backward_musa_kernel<float>\n      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n          output_size, output.data_ptr<float>(), grad_output.data_ptr<float>(),\n          input.data_ptr<float>(), rois.data_ptr<float>(),\n          grad_rois.data_ptr<float>(), pooled_height, pooled_width,\n          static_cast<float>(spatial_scale), channels, height, width);\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/psamask_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/hszhao/semseg/blob/master/lib/psa/src\n\n#include <torch/serialize/tensor.h>\n\n#include \"psamask_musa_kernel.muh\"\n#include \"pytorch_musa_helper.hpp\"\n\nvoid PSAMaskForwardMUSAKernelLauncher(const int psa_type, const Tensor input,\n                                      Tensor output, const int num_,\n                                      const int h_feature, const int w_feature,\n                                      const int h_mask, const int w_mask,\n                                      const int half_h_mask,\n                                      const int half_w_mask) {\n  int nthreads = num_ * h_feature * w_feature;\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  if (psa_type == 0)\n    AT_DISPATCH_FLOATING_TYPES(\n        input.scalar_type(), \"psamask_collect_forward_musa\", [&] {\n          psamask_collect_forward_musa<scalar_t><<<nthreads, 512, 0, stream>>>(\n              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n              half_w_mask, input.data_ptr<scalar_t>(),\n              output.data_ptr<scalar_t>());\n        });\n  else\n    AT_DISPATCH_FLOATING_TYPES(\n        input.scalar_type(), \"psamask_distribute_forward_musa\", [&] {\n          psamask_distribute_forward_musa<scalar_t>\n              <<<nthreads, 512, 0, stream>>>(\n                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                  half_w_mask, input.data_ptr<scalar_t>(),\n                  output.data_ptr<scalar_t>());\n        });\n}\n\nvoid PSAMaskBackwardMUSAKernelLauncher(\n    const int psa_type, const Tensor grad_output, Tensor grad_input,\n    const int num_, const int h_feature, const int w_feature, const int h_mask,\n    const int w_mask, const int half_h_mask, const int half_w_mask) {\n  int nthreads = num_ * h_feature * w_feature;\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  if 
(psa_type == 0)\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_input.scalar_type(), \"psamask_collect_backward_musa\", [&] {\n          psamask_collect_backward_musa<scalar_t><<<nthreads, 512, 0, stream>>>(\n              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n              half_w_mask, grad_output.data_ptr<scalar_t>(),\n              grad_input.data_ptr<scalar_t>());\n        });\n  else\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_input.scalar_type(), \"psamask_distribute_backward_musa\", [&] {\n          psamask_distribute_backward_musa<scalar_t>\n              <<<nthreads, 512, 0, stream>>>(\n                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                  half_w_mask, grad_output.data_ptr<scalar_t>(),\n                  grad_input.data_ptr<scalar_t>());\n        });\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/riroi_align_rotated_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"riroi_align_rotated_musa_kernel.muh\"\n\nvoid RiROIAlignRotatedForwardMUSAKernelLauncher(\n    const at::Tensor features, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor output) {\n  const int output_size =\n      num_rois * pooled_height * pooled_width * channels * num_orientations;\n  c10::musa::MUSAGuard device_guard(features.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      features.scalar_type(), \"riroi_align_rotated_forward_musa_kernel\", ([&] {\n        const scalar_t *bottom_data = features.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *top_data = output.data_ptr<scalar_t>();\n\n        riroi_align_rotated_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, bottom_data, rois_data, scalar_t(spatial_scale),\n                num_samples, clockwise, channels, height, width, pooled_height,\n                pooled_width, num_orientations, top_data);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid RiROIAlignRotatedBackwardMUSAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int num_samples, const bool clockwise, const int channels,\n    const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, const int num_orientations,\n    at::Tensor bottom_grad) {\n  const int output_size =\n      num_rois * pooled_height * pooled_width * channels * num_orientations;\n  c10::musa::MUSAGuard 
device_guard(top_grad.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      top_grad.scalar_type(), \"riroi_align_rotated_backward_musa_kernel\", ([&] {\n        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();\n        riroi_align_rotated_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, top_diff, rois_data, spatial_scale, num_samples,\n                clockwise, channels, height, width, pooled_height, pooled_width,\n                num_orientations, bottom_diff);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/roi_align_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"roi_align_musa_kernel.muh\"\n\nvoid ROIAlignForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                       Tensor argmax_y, Tensor argmax_x,\n                                       int aligned_height, int aligned_width,\n                                       float spatial_scale, int sampling_ratio,\n                                       int pool_mode, bool aligned) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"roi_align_forward_musa_kernel\", [&] {\n        roi_align_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                argmax_y.data_ptr<scalar_t>(), argmax_x.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,\n                aligned, channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ROIAlignBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,\n                                        Tensor argmax_y, Tensor argmax_x,\n                                        Tensor grad_input, int aligned_height,\n                                        int aligned_width, float spatial_scale,\n                                        int sampling_ratio, int pool_mode,\n                                        bool aligned) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = 
grad_input.size(2);\n  int width = grad_input.size(3);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"roi_align_backward_musa_kernel\", [&] {\n        roi_align_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),\n                argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),\n                aligned_height, aligned_width,\n                static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,\n                aligned, channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/roi_align_rotated_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"roi_align_rotated_musa_kernel.muh\"\n\nvoid ROIAlignRotatedForwardMUSAKernelLauncher(\n    const at::Tensor input, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor output) {\n  const int output_size = num_rois * pooled_height * pooled_width * channels;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"ROIAlignRotatedLaucherForward\", ([&] {\n        const scalar_t *bottom_data = input.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *top_data = output.data_ptr<scalar_t>();\n\n        roi_align_rotated_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(\n                output_size, bottom_data, rois_data, scalar_t(spatial_scale),\n                sampling_ratio, aligned, clockwise, channels, height, width,\n                pooled_height, pooled_width, top_data);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ROIAlignRotatedBackwardMUSAKernelLauncher(\n    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,\n    const int sampling_ratio, const bool aligned, const bool clockwise,\n    const int channels, const int height, const int width, const int num_rois,\n    const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {\n  const int output_size = num_rois * pooled_height * pooled_width * channels;\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      top_grad.scalar_type(), \"ROIAlignLaucherBackward\", ([&] {\n        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t *rois_data = rois.data_ptr<scalar_t>();\n        scalar_t *bottom_diff = 
bottom_grad.data_ptr<scalar_t>();\n        roi_align_rotated_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(\n                output_size, top_diff, rois_data, spatial_scale, sampling_ratio,\n                aligned, clockwise, channels, height, width, pooled_height,\n                pooled_width, bottom_diff);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/roi_pool_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"roi_pool_musa_kernel.muh\"\n\nvoid ROIPoolForwardMUSAKernelLauncher(Tensor input, Tensor rois, Tensor output,\n                                      Tensor argmax, int pooled_height,\n                                      int pooled_width, float spatial_scale) {\n  int output_size = output.numel();\n  int channels = input.size(1);\n  int height = input.size(2);\n  int width = input.size(3);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"roi_pool_forward_musa_kernel\", [&] {\n        roi_pool_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),\n                argmax.data_ptr<int>(), pooled_height, pooled_width,\n                static_cast<scalar_t>(spatial_scale), channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ROIPoolBackwardMUSAKernelLauncher(Tensor grad_output, Tensor rois,\n                                       Tensor argmax, Tensor grad_input,\n                                       int pooled_height, int pooled_width,\n                                       float spatial_scale) {\n  int output_size = grad_output.numel();\n  int channels = grad_input.size(1);\n  int height = grad_input.size(2);\n  int width = grad_input.size(3);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"roi_pool_backward_musa_kernel\", [&] {\n        roi_pool_backward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n         
       output_size, grad_output.data_ptr<scalar_t>(),\n                rois.data_ptr<scalar_t>(), argmax.data_ptr<int>(),\n                grad_input.data_ptr<scalar_t>(), pooled_height, pooled_width,\n                channels, height, width);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/roiaware_pool3d_musa.mu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu\n// Written by Shaoshuai Shi\n// All Rights Reserved 2019.\n\n#include <stdio.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"roiaware_pool3d_musa_kernel.muh\"\n\nvoid RoiawarePool3dForwardMUSAKernelLauncher(\n    int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,\n    int out_y, int out_z, const Tensor rois, const Tensor pts,\n    const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,\n    Tensor pooled_features, int pool_method) {\n  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate params pts: (npoints, 3) [x, y, z] in LiDAR coordinate params\n  // pts_feature: (npoints, C) params argmax: (N, out_x, out_y, out_z, C) params\n  // pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) params\n  // pooled_features: (N, out_x, out_y, out_z, C) params pool_method: 0:\n  // max_pool 1: avg_pool\n\n  c10::musa::MUSAGuard device_guard(pts_feature.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  Tensor pts_mask =\n      -at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));\n\n  dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      rois.scalar_type(), \"generate_pts_mask_for_box3d\", [&] {\n        generate_pts_mask_for_box3d<scalar_t>\n            <<<blocks_mask, threads, 0, stream>>>(\n                boxes_num, pts_num, out_x, out_y, out_z,\n                rois.data_ptr<scalar_t>(), pts.data_ptr<scalar_t>(),\n                pts_mask.data_ptr<int>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  // TODO: Merge the collect and pool functions, SS\n\n  dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));\n\n  AT_DISPATCH_INTEGRAL_TYPES(\n      pts_idx_of_voxels.scalar_type(), 
\"collect_inside_pts_for_box3d\", [&] {\n        collect_inside_pts_for_box3d<scalar_t>\n            <<<blocks_collect, threads, 0, stream>>>(\n                boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z,\n                pts_mask.data_ptr<int>(),\n                pts_idx_of_voxels.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),\n                   channels, boxes_num);\n  if (pool_method == 0) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        pts_feature.scalar_type(), \"roiaware_maxpool3d\", [&] {\n          roiaware_maxpool3d<scalar_t><<<blocks_pool, threads, 0, stream>>>(\n              boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,\n              out_z, pts_feature.data_ptr<scalar_t>(),\n              pts_idx_of_voxels.data_ptr<int>(),\n              pooled_features.data_ptr<scalar_t>(), argmax.data_ptr<int>());\n        });\n  } else if (pool_method == 1) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        pts_feature.scalar_type(), \"roiaware_avgpool3d\", [&] {\n          roiaware_avgpool3d<scalar_t><<<blocks_pool, threads, 0, stream>>>(\n              boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y,\n              out_z, pts_feature.data_ptr<scalar_t>(),\n              pts_idx_of_voxels.data_ptr<int>(),\n              pooled_features.data_ptr<scalar_t>());\n        });\n  }\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid RoiawarePool3dBackwardMUSAKernelLauncher(\n    int boxes_num, int out_x, int out_y, int out_z, int channels,\n    int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,\n    const Tensor grad_out, Tensor grad_in, int pool_method) {\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n  // 
params pool_method: 0: max_pool, 1: avg_pool\n\n  c10::musa::MUSAGuard device_guard(grad_out.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,\n              boxes_num);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  if (pool_method == 0) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        grad_in.scalar_type(), \"roiaware_maxpool3d_backward\", [&] {\n          roiaware_maxpool3d_backward<scalar_t><<<blocks, threads, 0, stream>>>(\n              boxes_num, channels, out_x, out_y, out_z, argmax.data_ptr<int>(),\n              grad_out.data_ptr<scalar_t>(), grad_in.data_ptr<scalar_t>());\n        });\n  } else if (pool_method == 1) {\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n        grad_in.scalar_type(), \"roiaware_avgpool3d_backward\", [&] {\n          roiaware_avgpool3d_backward<scalar_t><<<blocks, threads, 0, stream>>>(\n              boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel,\n              pts_idx_of_voxels.data_ptr<int>(), grad_out.data_ptr<scalar_t>(),\n              grad_in.data_ptr<scalar_t>());\n        });\n  }\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/roipoint_pool3d_musa.mu",
    "content": "/*\nModified from\nhttps://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu\nPoint cloud feature pooling\nWritten by Shaoshuai Shi\nAll Rights Reserved 2018.\n*/\n\n#include <math.h>\n#include <stdio.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"roipoint_pool3d_musa_kernel.muh\"\n\nvoid RoIPointPool3dForwardMUSAKernelLauncher(\n    int batch_size, int pts_num, int boxes_num, int feature_in_len,\n    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,\n    const Tensor pts_feature, Tensor pooled_features,\n    Tensor pooled_empty_flag) {\n  Tensor pts_assign = at::empty({batch_size, pts_num, boxes_num},\n                                boxes3d.options().dtype(at::kInt));\n\n  c10::musa::MUSAGuard device_guard(xyz.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      xyz.scalar_type(), \"assign_pts_to_box3d\", [&] {\n        assign_pts_to_box3d<scalar_t><<<blocks, threads, 0, stream>>>(\n            batch_size, pts_num, boxes_num, xyz.data_ptr<scalar_t>(),\n            boxes3d.data_ptr<scalar_t>(), pts_assign.data_ptr<int>());\n      });\n\n  Tensor pts_idx = at::empty({batch_size, boxes_num, sampled_pts_num},\n                             boxes3d.options().dtype(at::kInt));\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);\n\n  get_pooled_idx<<<blocks2, threads, 0, stream>>>(\n      batch_size, pts_num, boxes_num, sampled_pts_num,\n      pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),\n      pooled_empty_flag.data_ptr<int>());\n\n  dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,\n                   batch_size);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      
xyz.scalar_type(), \"roipoint_pool3d_forward\", [&] {\n        roipoint_pool3d_forward<scalar_t><<<blocks_pool, threads, 0, stream>>>(\n            batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,\n            xyz.data_ptr<scalar_t>(), pts_idx.data_ptr<int>(),\n            pts_feature.data_ptr<scalar_t>(),\n            pooled_features.data_ptr<scalar_t>(),\n            pooled_empty_flag.data_ptr<int>());\n      });\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/rotated_feature_align_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu\n#include \"pytorch_musa_helper.hpp\"\n#include \"rotated_feature_align_musa_kernel.muh\"\n\nvoid RotatedFeatureAlignForwardMUSAKernelLauncher(const Tensor features,\n                                                  const Tensor best_bboxes,\n                                                  const float spatial_scale,\n                                                  const int points,\n                                                  Tensor output) {\n  c10::musa::MUSAGuard device_guard(features.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  const int output_size = features.numel();\n  AT_DISPATCH_FLOATING_TYPES(\n      features.scalar_type(), \"rotated_feature_align_forward_musa_kernel\",\n      ([&] {\n        const scalar_t* bottom_data = features.data_ptr<scalar_t>();\n        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();\n        scalar_t* top_data = output.data_ptr<scalar_t>();\n\n        rotated_feature_align_forward_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, points, bottom_data, bboxes_data,\n                scalar_t(spatial_scale), features.size(1), features.size(2),\n                features.size(3), top_data);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid RotatedFeatureAlignBackwardMUSAKernelLauncher(const Tensor top_grad,\n                                                   const Tensor best_bboxes,\n                                                   const float spatial_scale,\n                                                   const int points,\n                                                   Tensor bottom_grad) {\n  c10::musa::MUSAGuard device_guard(top_grad.device());\n  musaStream_t stream = 
c10::musa::getCurrentMUSAStream();\n  const int output_size = top_grad.numel();\n  AT_DISPATCH_FLOATING_TYPES(\n      top_grad.scalar_type(), \"rotated_feature_align_backward_musa_kernel\",\n      ([&] {\n        const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();\n        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();\n        scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();\n\n        rotated_feature_align_backward_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, points, top_diff, bboxes_data,\n                scalar_t(spatial_scale), top_grad.size(1), top_grad.size(2),\n                top_grad.size(3), bottom_diff);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/scatter_points_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include <stdio.h>\n#include <stdlib.h>\n#include <torch/types.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"scatter_points_musa_kernel.muh\"\n\nstd::vector<at::Tensor> DynamicPointToVoxelForwardMUSAKernelLauncher(\n    const at::Tensor &feats, const at::Tensor &coors,\n    const reduce_t reduce_type) {\n  const int num_input = feats.size(0);\n  const int num_feats = feats.size(1);\n\n  if (num_input == 0)\n    return {feats.clone().detach(), coors.clone().detach(),\n            coors.new_empty({0}, torch::kInt32),\n            coors.new_empty({0}, torch::kInt32)};\n\n  at::Tensor out_coors;\n  at::Tensor coors_map;\n  at::Tensor reduce_count;\n\n  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);\n\n  std::tie(out_coors, coors_map, reduce_count) =\n      at::unique_dim(coors_clean, 0, true, true, true);\n\n  if (out_coors[0][0].lt(0).item<bool>()) {\n    // the first element of out_coors (-1,-1,-1) and should be removed\n    out_coors = out_coors.slice(0, 1);\n    reduce_count = reduce_count.slice(0, 1);\n    coors_map = coors_map - 1;\n  }\n\n  coors_map = coors_map.to(torch::kInt32);\n  reduce_count = reduce_count.to(torch::kInt32);\n\n  auto reduced_feats =\n      at::empty({out_coors.size(0), num_feats}, feats.options());\n\n  c10::musa::MUSAGuard device_guard(feats.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  AT_DISPATCH_FLOATING_TYPES(\n      feats.scalar_type(), \"feats_reduce_kernel\", ([&] {\n        if (reduce_type == reduce_t::MAX)\n          reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());\n        else\n          reduced_feats.fill_(static_cast<scalar_t>(0));\n\n        dim3 blocks(std::min(\n            at::musa::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));\n        dim3 threads(THREADS_PER_BLOCK);\n        feats_reduce_kernel<<<blocks, threads, 0, stream>>>(\n            feats.data_ptr<scalar_t>(), 
coors_map.data_ptr<int32_t>(),\n            reduced_feats.data_ptr<scalar_t>(), num_input, num_feats,\n            reduce_type);\n        if (reduce_type == reduce_t::MEAN)\n          reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  return {reduced_feats, out_coors, coors_map, reduce_count};\n}\n\nvoid DynamicPointToVoxelBackwardMUSAKernelLauncher(\n    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,\n    const at::Tensor &feats, const at::Tensor &reduced_feats,\n    const at::Tensor &coors_map, const at::Tensor &reduce_count,\n    const reduce_t reduce_type) {\n  const int num_input = feats.size(0);\n  const int num_reduced = reduced_feats.size(0);\n  const int num_feats = feats.size(1);\n\n  grad_feats.fill_(0);\n  // copy voxel grad to points\n\n  if (num_input == 0 || num_reduced == 0) return;\n  c10::musa::MUSAGuard device_guard(feats.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_reduced_feats.scalar_type(), \"add_reduce_traceback_grad_kernel\",\n        ([&] {\n          dim3 blocks(std::min(\n              at::musa::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));\n          dim3 threads(THREADS_PER_BLOCK);\n          add_reduce_traceback_grad_kernel<<<blocks, threads, 0, stream>>>(\n              grad_feats.data_ptr<scalar_t>(),\n              grad_reduced_feats.data_ptr<scalar_t>(),\n              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),\n              num_input, num_feats, reduce_type);\n        }));\n\n    AT_MUSA_CHECK(musaGetLastError());\n  } else {\n    auto reduce_from = at::full({num_reduced, num_feats}, num_input,\n                                coors_map.options().dtype(torch::kInt32));\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_reduced_feats.scalar_type(),\n        
\"max_reduce_traceback_scatter_idx_kernel\", ([&] {\n          dim3 blocks(std::min(\n              at::musa::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim));\n          dim3 threads(THREADS_PER_BLOCK);\n          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads, 0,\n                                                    stream>>>(\n              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),\n              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),\n              num_input, num_feats);\n        }));\n\n    AT_MUSA_CHECK(musaGetLastError());\n\n    AT_DISPATCH_FLOATING_TYPES(\n        grad_reduced_feats.scalar_type(),\n        \"max_reduce_traceback_scatter_idx_kernel\", ([&] {\n          dim3 blocks(\n              std::min(at::musa::ATenCeilDiv(num_reduced, THREADS_PER_BLOCK),\n                       maxGridDim));\n          dim3 threads(THREADS_PER_BLOCK);\n          max_reduce_scatter_grad_kernel<<<blocks, threads, 0, stream>>>(\n              grad_feats.data_ptr<scalar_t>(),\n              grad_reduced_feats.data_ptr<scalar_t>(),\n              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);\n        }));\n\n    AT_MUSA_CHECK(musaGetLastError());\n  }\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/sparse_indice.mu",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <ATen/ATen.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/indice.h>\n#include <utils/spconv/spconv/mp_helper.h>\n#include <utils/spconv/tensorview/helper_launch.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <chrono>\n#include <limits>\n#include <spconv/indice.muh>\n#include <type_traits>\n\n#include \"pytorch_musa_helper.hpp\"\n\nnamespace functor {\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   tv::TensorView<Index> indicePairUnique,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose) {\n    Index batchSize = 
gridsOut.dim(0);\n    auto numActIn = indicesIn.dim(0);\n    if (numActIn == 0) return 0;\n    if (transpose)\n      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>\n          <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,\n             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,\n                              indiceNum, indicePairUnique, kernelSize, stride,\n                              padding, dilation, outSpatialShape);\n    else\n      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>\n          <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,\n             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,\n                              indiceNum, indicePairUnique, kernelSize, stride,\n                              padding, dilation, outSpatialShape);\n    TV_CHECK_MUSA_ERR();\n    return 1;\n  }\n};\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<Index> indicesOut,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   tv::TensorView<Index> indicePairUnique,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid) {\n    Index batchSize = gridsOut.dim(0);\n    auto kernelVolume = indicePairs.dim(0);\n    auto numActIn = indicesIn.dim(0);\n    if (numActIn == 0) return 0;\n    Index numAct = indicePairUnique.dim(0) - 1;\n    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>\n        <<<tv::launch::getBlocks(numAct), tv::launch::MUSA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,\n                            indicePairUnique, 
outSpatialShape, batchSize);\n    TV_CHECK_MUSA_ERR();\n    assignIndicePairsKernel<Index, IndexGrid, NDim>\n        <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,\n                            indicePairUnique, outSpatialShape);\n    TV_CHECK_MUSA_ERR();\n\n    if (resetGrid) {\n      resetGridKernel<Index, IndexGrid, NDim>\n          <<<tv::launch::getBlocks(numAct), tv::launch::MUSA_NUM_THREADS, 0,\n             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);\n      TV_CHECK_MUSA_ERR();\n    }\n    return numAct;\n  }\n};\n\ntemplate <typename Index, typename IndexGrid, unsigned NDim>\nstruct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {\n  Index operator()(const tv::TorchGPU &d, tv::TensorView<const Index> indicesIn,\n                   tv::TensorView<IndexGrid> gridsOut,\n                   tv::TensorView<Index> indicePairs,\n                   tv::TensorView<Index> indiceNum,\n                   const tv::SimpleVector<Index, NDim> kernelSize,\n                   const tv::SimpleVector<Index, NDim> stride,\n                   const tv::SimpleVector<Index, NDim> padding,\n                   const tv::SimpleVector<Index, NDim> dilation,\n                   const tv::SimpleVector<Index, NDim> outSpatialShape,\n                   bool transpose, bool resetGrid) {\n    auto numActIn = indicesIn.dim(0);\n    if (numActIn == 0) return 0;\n    prepareSubMGridKernel<Index, IndexGrid, NDim>\n        <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);\n    TV_CHECK_MUSA_ERR();\n    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>\n        <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,\n           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,\n                            kernelSize, stride, padding, dilation,\n        
                    outSpatialShape);\n    TV_CHECK_MUSA_ERR();\n\n    if (resetGrid) {\n      resetGridSubMKernel<Index, IndexGrid, NDim>\n          <<<tv::launch::getBlocks(numActIn), tv::launch::MUSA_NUM_THREADS, 0,\n             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,\n                              numActIn);\n      TV_CHECK_MUSA_ERR();\n    }\n    return numActIn;\n  }\n};\n}  // namespace functor\n\n#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                             \\\n  template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index,   \\\n                                                       int, NDIM>;            \\\n  template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \\\n                                                         int, NDIM>;          \\\n  template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \\\n                                                         int, NDIM>;          \\\n  template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index,   \\\n                                                       int, NDIM>;\n\n#define DECLARE_GPU_INDEX(Index)          \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \\\n  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);\n\nDECLARE_GPU_INDEX(int);\n\n#undef DECLARE_GPU_INDEX\n#undef DECLARE_GPU_SPECS_INDEX_NDIM\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/sparse_maxpool.mu",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <ATen/ATen.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/maxpool.h>\n#include <utils/spconv/spconv/mp_helper.h>\n#include <utils/spconv/tensorview/helper_launch.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <chrono>\n#include <limits>\n#include <type_traits>\n#include <utils/spconv/tensorview/helper_kernel.muh>\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,\n                                      const scalar_t *inFeatures,\n                                      const Index *indicesIn,\n                                      const Index *indicesOut, int numHot,\n                                      int numPlanes) {\n  scalar_t in, out;\n  int ILPStrideY[NumILP];\n  Index idxo, idxi;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x; ix < numHot;\n       ix += blockDim.x * gridDim.x) {\n    {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + 
threadIdx.x;\n        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n        in = inFeatures[idxi];\n        out = outFeatures[idxo];\n        if (in > out) {\n          outFeatures[idxo] = in;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,\n                                             const scalar_t *inFeatures,\n                                             const Index *indicesIn,\n                                             const Index *indicesOut,\n                                             int numHot, int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        in = inFeatures[RI[ilp] + iy];\n        out = outFeatures[RO[ilp] + iy];\n        if (in > out) {\n          outFeatures[RO[ilp] + iy] = in;\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType>\n__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,\n                                         const scalar_t *inFeatures,\n                                         const Index *indicesIn,\n                                         const Index *indicesOut, int numHot,\n                                         int numPlanes) {\n  int ILPStrideY[NumILP];\n  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);\n  scalar_t 
bufi[vecloadFactor];\n  scalar_t bufo[vecloadFactor];\n  Index idxi, idxo;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;\n       ix += blockDim.x * gridDim.x * vecloadFactor) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      reinterpret_cast<VecType *>(bufo)[0] =\n          reinterpret_cast<VecType *>(outFeatures)[idxo];\n      reinterpret_cast<VecType *>(bufi)[0] =\n          reinterpret_cast<const VecType *>(inFeatures)[idxi];\n#pragma unroll\n      for (int i = 0; i < vecloadFactor; i++) {\n        if (bufi[i] > bufo[i]) {\n          bufo[i] = bufi[i];\n        }\n      }\n      reinterpret_cast<VecType *>(outFeatures)[idxo] =\n          reinterpret_cast<VecType *>(bufo)[0];\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,\n                                        const scalar_t *inFeatures,\n                                        const Index *indicesIn,\n                                        const Index *indicesOut, int numHot,\n                                        int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < numHot) {\n        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n      }\n    
}\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < numHot) {\n          in = inFeatures[RI[ilp] + iy];\n          out = outFeatures[RO[ilp] + iy];\n          if (in > out) {\n            outFeatures[RO[ilp] + iy] = in;\n          }\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,\n                                      const scalar_t *inFeatures,\n                                      const scalar_t *fout, scalar_t *fin,\n                                      const Index *indicesIn,\n                                      const Index *indicesOut, int numHot,\n                                      int numPlanes) {\n  scalar_t in, out;\n  Index idxo, idxi;\n  int ILPStrideY[NumILP];\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  fout += blockIdx.y * NumTLP;\n  fin += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x; ix < numHot;\n       ix += blockDim.x * gridDim.x) {\n    {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n        in = inFeatures[idxi];\n        out = outFeatures[idxo];\n        if (in == out) {\n          fin[idxi] += fout[idxo];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolBwdGenericBlockKernel(\n    const scalar_t *outFeatures, const scalar_t *inFeatures,\n    const scalar_t *fout, scalar_t *fin, const Index *indicesIn,\n    const Index *indicesOut, int numHot, int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index 
RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        in = inFeatures[RI[ilp] + iy];\n        out = outFeatures[RO[ilp] + iy];\n        if (in == out) {\n          fin[RI[ilp] + iy] += fout[RO[ilp] + iy];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP,\n          typename VecType>\n__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,\n                                         const scalar_t *inFeatures,\n                                         const scalar_t *fout, scalar_t *fin,\n                                         const Index *indicesIn,\n                                         const Index *indicesOut, int numHot,\n                                         int numPlanes) {\n  int ILPStrideY[NumILP];\n  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);\n  scalar_t bufi[vecloadFactor];\n  scalar_t bufo[vecloadFactor];\n  scalar_t bufdi[vecloadFactor];\n  scalar_t bufdo[vecloadFactor];\n  Index idxi, idxo;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;\n  outFeatures += blockIdx.y * NumTLP;\n  inFeatures += blockIdx.y * NumTLP;\n  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;\n       ix += blockDim.x * gridDim.x * vecloadFactor) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ++ilp) {\n      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      idxo = indicesOut[ix + 
ILPStrideY[ilp]] * numPlanes + threadIdx.x;\n      reinterpret_cast<VecType *>(bufo)[0] =\n          reinterpret_cast<const VecType *>(outFeatures)[idxo];\n      reinterpret_cast<VecType *>(bufi)[0] =\n          reinterpret_cast<const VecType *>(inFeatures)[idxi];\n      reinterpret_cast<VecType *>(bufdo)[0] =\n          reinterpret_cast<const VecType *>(fout)[idxo];\n      reinterpret_cast<VecType *>(bufdi)[0] =\n          reinterpret_cast<VecType *>(fin)[idxi];\n\n#pragma unroll\n      for (int i = 0; i < vecloadFactor; i++) {\n        if (bufi[i] == bufo[i]) {\n          bufdi[i] += bufdo[i];\n        }\n      }\n      reinterpret_cast<VecType *>(fin)[idxi] =\n          reinterpret_cast<VecType *>(bufdi)[0];\n    }\n  }\n}\n\ntemplate <typename scalar_t, typename Index, int NumTLP, int NumILP>\n__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,\n                                        const scalar_t *inFeatures,\n                                        const scalar_t *fout, scalar_t *fin,\n                                        const Index *indicesIn,\n                                        const Index *indicesOut, int numHot,\n                                        int numPlanes) {\n  int ILPStrideX[NumILP];\n  Index RI[NumILP];\n  Index RO[NumILP];\n  scalar_t in, out;\n#pragma unroll\n  for (int ilp = 0; ilp < NumILP; ilp++)\n    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;\n  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {\n#pragma unroll\n    for (int ilp = 0; ilp < NumILP; ilp++) {\n      if (ix + ILPStrideX[ilp] < numHot) {\n        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;\n        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;\n      }\n    }\n    for (int iy : tv::KernelLoopY<int>(numPlanes)) {\n#pragma unroll\n      for (int ilp = 0; ilp < NumILP; ++ilp) {\n        if (ix + ILPStrideX[ilp] < numHot) {\n          in = inFeatures[RI[ilp] + iy];\n          out = outFeatures[RO[ilp] + iy];\n          if 
(in == out) {\n            fin[RI[ilp] + iy] += fout[RO[ilp] + iy];\n          }\n        }\n      }\n    }\n  }\n}\n\nnamespace functor {\ntemplate <typename scalar_t, typename Index>\nstruct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 64, 32, 16>;\n  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const Index> indices, int size) {\n    if (size <= 0) return;\n    int numPlanes = inFeatures.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);\n    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,\n                                 &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (numHotBlock >= NumTLP) {\n            maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                     vecload_type_t>\n                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),\n                                    indices.subview(0).data(),\n                                    indices.subview(1).data(), numHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_MUSA_ERR();\n          }\n\n          if (size > numHotBlock) {\n            maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>\n                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),\n                   0, d.getStream()>>>(outFeatures.data(), 
inFeatures.data(),\n                                       indices.subview(0).data() + numHotBlock,\n                                       indices.subview(1).data() + numHotBlock,\n                                       size - numHotBlock, numPlanes);\n            TV_CHECK_MUSA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n\n    if (notFound) {\n      constexpr int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (numHotBlock >= NumTLP) {\n        maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(),\n                indices.subview(0).data(), indices.subview(1).data(),\n                numHotBlock, numPlanes);\n        TV_CHECK_MUSA_ERR();\n      }\n\n      if (size > numHotBlock) {\n        maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(),\n                indices.subview(0).data() + numHotBlock,\n                indices.subview(1).data() + numHotBlock, size - numHotBlock,\n                numPlanes);\n        TV_CHECK_MUSA_ERR();\n      }\n    }\n  }\n};\n\ntemplate <typename scalar_t, typename Index>\nstruct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 64, 32, 16>;\n  void operator()(const tv::TorchGPU &d,\n                  tv::TensorView<const scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> inFeatures,\n                  tv::TensorView<const scalar_t> fout,\n                  
tv::TensorView<scalar_t> fin,\n                  tv::TensorView<const Index> indices, int size) {\n    if (size <= 0) return;\n    int numPlanes = inFeatures.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);\n    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,\n                                 &indices, &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (numHotBlock >= NumTLP) {\n            maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                     vecload_type_t>\n                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),\n                                    fout.data(), fin.data(),\n                                    indices.subview(0).data(),\n                                    indices.subview(1).data(), numHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_MUSA_ERR();\n          }\n\n          if (size > numHotBlock) {\n            maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>\n                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),\n                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),\n                                       fout.data(), fin.data(),\n                                       indices.subview(0).data() + numHotBlock,\n                                       indices.subview(1).data() + numHotBlock,\n                                       size - numHotBlock, numPlanes);\n            TV_CHECK_MUSA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n\n    if (notFound) {\n      constexpr 
int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      int numHotBlock = (size / NumTLP) * NumTLP;\n      if (numHotBlock >= NumTLP) {\n        maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),\n                indices.subview(0).data(), indices.subview(1).data(),\n                numHotBlock, numPlanes);\n        TV_CHECK_MUSA_ERR();\n      }\n\n      if (size > numHotBlock) {\n        maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>\n            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),\n               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),\n                indices.subview(0).data() + numHotBlock,\n                indices.subview(1).data() + numHotBlock, size - numHotBlock,\n                numPlanes);\n        TV_CHECK_MUSA_ERR();\n      }\n    }\n  }\n};\n\n}  // namespace functor\n\n#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \\\n  template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \\\n                                                       Index>;                 \\\n  template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU,          \\\n                                                        scalar_t, Index>;\n\n#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);\n\nDECLARE_GPU_SPECS(float);\nDECLARE_GPU_SPECS(double);\nDECLARE_GPU_SPECS(at::Half);\n\n#undef DECLARE_GPU_SPECS\n#undef DECLARE_GPU_SPECS_T_INDEX\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/sparse_pool_ops_musa.mu",
    "content": "#include <musa_runtime_api.h>\n#include <torch/script.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/maxpool.h>\n\n#include \"pytorch_musa_helper.hpp\"\n\ntorch::Tensor IndiceMaxpoolForwardMUSAKernelLauncher(torch::Tensor features,\n                                                     torch::Tensor indicePairs,\n                                                     torch::Tensor indiceNum,\n                                                     int64_t numAct) {\n  c10::musa::MUSAGuard device_guard(features.device());\n  auto device = features.device().type();\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0) {\n      continue;\n    }\n    AT_DISPATCH_FLOATING_TYPES(\n        features.scalar_type(), \"IndiceMaxpoolForwardKernel\", [&] {\n          if (device == torch::kCPU) {\n            functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>\n                forwardFtor;\n            forwardFtor(tv::CPU(), tv::torch2tv<scalar_t>(output),\n                        tv::torch2tv<const scalar_t>(features),\n                        tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n          } else {\n            functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>\n                forwardFtor;\n            forwardFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(output),\n                        tv::torch2tv<const scalar_t>(features),\n                        tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n            TV_CHECK_MUSA_ERR();\n          }\n  
      });\n  }\n  return output;\n}\n\ntorch::Tensor IndiceMaxpoolBackwardMUSAKernelLauncher(torch::Tensor features,\n                                                      torch::Tensor outFeatures,\n                                                      torch::Tensor outGrad,\n                                                      torch::Tensor indicePairs,\n                                                      torch::Tensor indiceNum) {\n  c10::musa::MUSAGuard device_guard(features.device());\n  auto device = features.device().type();\n  auto numInPlanes = features.size(1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);\n  auto kernelVolume = indicePairs.size(0);\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0) {\n      continue;\n    }\n    AT_DISPATCH_FLOATING_TYPES(\n        features.scalar_type(), \"IndiceMaxpoolBackwardKernel\", [&] {\n          if (device == torch::kCPU) {\n            functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>\n                backwardFtor;\n            backwardFtor(tv::CPU(), tv::torch2tv<const scalar_t>(outFeatures),\n                         tv::torch2tv<const scalar_t>(features),\n                         tv::torch2tv<const scalar_t>(outGrad),\n                         tv::torch2tv<scalar_t>(inputGrad),\n                         tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n          } else {\n            functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>\n                backwardFtor;\n            backwardFtor(tv::TorchGPU(),\n                         tv::torch2tv<const scalar_t>(outFeatures),\n                         tv::torch2tv<const scalar_t>(features),\n                         tv::torch2tv<const scalar_t>(outGrad),\n                         
tv::torch2tv<scalar_t>(inputGrad),\n                         tv::torch2tv<const int>(indicePairs).subview(i), nHot);\n            TV_CHECK_MUSA_ERR();\n          }\n        });\n  }\n  return inputGrad;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/sparse_reordering.mu",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <ATen/ATen.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/mp_helper.h>\n#include <utils/spconv/spconv/reordering.h>\n#include <utils/spconv/tensorview/helper_launch.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\n#include <chrono>\n#include <limits>\n#include <spconv/reordering.muh>\n#include <type_traits>\n#include <utils/spconv/tensorview/helper_kernel.muh>\n\n#include \"pytorch_musa_helper.hpp\"\n\nnamespace functor {\ntemplate <typename scalar_t, typename Index>\nstruct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 64, 32, 16>;\n  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,\n                  tv::TensorView<const scalar_t> features,\n                  tv::TensorView<const Index> indices, int size) {\n    if (size <= 0) return;\n    int numPlanes = features.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);\n    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,\n                                 &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n      int nHotBlock = 
(size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (nHotBlock >= NumTLP) {\n            gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                 vecload_type_t>\n                <<<dim3(numPlanes / NumTLP, size / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(buffer.data(), features.data(),\n                                    indices.data(), nHotBlock,\n                                    numPlanes / vecloadFactor);\n\n            TV_CHECK_MUSA_ERR();\n          }\n          if (size - nHotBlock > 0) {\n            gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,\n                            vecload_type_t>\n                <<<dim3(1, numPlanes / NumTLP),\n                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,\n                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,\n                                    features.data(), indices.data() + nHotBlock,\n                                    size - nHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_MUSA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n\n    if (notFound) {\n      constexpr int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>\n          <<<dim3(tv::launch::DivUp(size, NumTLP),\n                  tv::launch::DivUp(numPlanes, NumTLP)),\n             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n              buffer.data(), features.data(), indices.data(), size, numPlanes);\n      TV_CHECK_MUSA_ERR();\n    }\n  }\n};\ntemplate <typename scalar_t, typename Index>\nstruct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {\n  using vecload_type_t =\n      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;\n  using kernel_block_t = mp_list_c<int, 
64, 32, 16>;\n  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,\n                  tv::TensorView<const scalar_t> buffer,\n                  tv::TensorView<const Index> indices, int size, bool stable) {\n    if (size <= 0) return;\n    int numPlanes = outFeatures.dim(1);\n    bool notFound = true;\n    constexpr int vecloadFactor =\n        sizeof(vecload_type_t) / sizeof(scalar_t);  // important for half.\n    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,\n                                 &notFound](auto NumTLP) {\n      constexpr int NumILP = NumTLP / 4;\n      int nHotBlock = (size / NumTLP) * NumTLP;\n      if (notFound) {\n        if (numPlanes % NumTLP == 0) {\n          if (nHotBlock >= NumTLP) {\n            scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,\n                                     vecload_type_t>\n                <<<dim3(numPlanes / NumTLP, size / NumTLP),\n                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,\n                   d.getStream()>>>(outFeatures.data(), buffer.data(),\n                                    indices.data(), nHotBlock,\n                                    numPlanes / vecloadFactor);\n            TV_CHECK_MUSA_ERR();\n          }\n          if (size - nHotBlock > 0) {\n            scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>\n                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),\n                   0, d.getStream()>>>(\n                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,\n                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);\n            TV_CHECK_MUSA_ERR();\n          }\n          notFound = false;\n        }\n      }\n    });\n    if (notFound) {\n      constexpr int NumTLP = 64;\n      constexpr int NumILP = NumTLP / 4;\n      scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>\n          <<<dim3(tv::launch::DivUp(size, NumTLP),\n             
     tv::launch::DivUp(numPlanes, NumTLP)),\n             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(\n              outFeatures.data(), buffer.data(), indices.data(), size,\n              numPlanes);\n      TV_CHECK_MUSA_ERR();\n    }\n  }\n};\n\n}  // namespace functor\n\n#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \\\n  template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \\\n  template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t,     \\\n                                                   Index>;\n\n#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);\n\nDECLARE_GPU_SPECS(float);\nDECLARE_GPU_SPECS(double);\nDECLARE_GPU_SPECS(at::Half);\n\n#undef DECLARE_GPU_SPECS\n#undef DECLARE_GPU_SPECS_T_INDEX\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/spconv_ops_musa.mu",
    "content": "#include <musa_runtime_api.h>\n#include <torch/script.h>\n// clang-format off\n// TODO: make spconv_utils.h order agnostic\n#include \"../spconv_utils.h\"\n// clang-format on\n#include <utils/spconv/spconv/indice.h>\n#include <utils/spconv/spconv/reordering.h>\n\n#include \"pytorch_musa_helper.hpp\"\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  c10::musa::MUSAGuard device_guard(indices.device());\n  bool subM = _subM != 0;\n  bool transpose = _transpose != 0;\n  auto numAct = indices.size(0);\n  auto coorDim = indices.size(1) - 1;\n  TV_ASSERT_RT_ERR(NDim == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(stride.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(padding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(dilation.size() == coorDim, \"error\");\n  auto kernelVolume = kernelSize[0];\n  for (int i = 1; i < kernelSize.size(); ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  TV_ASSERT_RT_ERR(kernelVolume <= 4096, \"error\");\n  auto outputVolume = outSpatialShape[0];\n  for (int i = 1; i < outSpatialShape.size(); ++i) {\n    outputVolume *= outSpatialShape[i];\n  }\n  torch::Tensor indicePairs =\n      torch::full({kernelVolume, 2, numAct}, -1,\n                  torch::dtype(torch::kInt32).device(indices.device()));\n  torch::Tensor indiceNum = torch::zeros(\n      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));\n  torch::Tensor gridOut =\n      
torch::full({batchSize * outputVolume}, -1,\n                  torch::dtype(torch::kInt32).device(indices.device()));\n  int64_t numActOut = -1;\n  tv::SimpleVector<int, NDim> outSpatialShape32;\n  tv::SimpleVector<int, NDim> kernelSize32;\n  tv::SimpleVector<int, NDim> stride32;\n  tv::SimpleVector<int, NDim> padding32;\n  tv::SimpleVector<int, NDim> dilation32;\n  auto indicePairUnique = torch::full(\n      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),\n      torch::dtype(torch::kInt32).device(indices.device()));\n  for (int i = 0; i < NDim; ++i) {\n    outSpatialShape32.push_back(outSpatialShape[i]);\n    kernelSize32.push_back(kernelSize[i]);\n    if (subM) {\n      stride32.push_back(1);\n      padding32.push_back(kernelSize[i] / 2);\n      dilation32.push_back(dilation[i]);\n    } else {\n      stride32.push_back(stride[i]);\n      padding32.push_back(padding[i]);\n      dilation32.push_back(dilation[i]);\n    }\n  }\n  if (subM) {\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose);\n    } else {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose);\n    }\n    return {indices, indicePairs, indiceNum};\n  } else {\n    torch::Tensor outInds =\n        torch::zeros({numAct * kernelVolume, coorDim 
+ 1},\n                     torch::dtype(torch::kInt32).device(indices.device()));\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,\n          transpose);\n    } else {\n      auto getIndicePairFtorP1 =\n          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,\n                                                 NDim>();\n      auto getIndicePairFtorP2 =\n          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,\n                                                 NDim>();\n      numActOut = getIndicePairFtorP1(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,\n          padding32, dilation32, outSpatialShape32, transpose);\n      if (numActOut > 0) {\n        auto res = torch::_unique(indicePairUnique);\n        indicePairUnique = std::get<0>(res);\n        numActOut = getIndicePairFtorP2(\n            tv::TorchGPU(), tv::torch2tv<const int>(indices),\n            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);\n      }\n    }\n    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};\n  }\n}\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher(\n    torch::Tensor indices, torch::Tensor 
gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  c10::musa::MUSAGuard device_guard(indices.device());\n  bool subM = _subM != 0;\n  bool transpose = _transpose != 0;\n  auto numAct = indices.size(0);\n  auto coorDim = indices.size(1) - 1;\n  TV_ASSERT_RT_ERR(NDim == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(stride.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(padding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, \"error\");\n  TV_ASSERT_RT_ERR(dilation.size() == coorDim, \"error\");\n  auto kernelVolume = kernelSize[0];\n  for (int i = 1; i < kernelSize.size(); ++i) {\n    kernelVolume *= kernelSize[i];\n  }\n  TV_ASSERT_RT_ERR(kernelVolume <= 4096, \"error\");\n  auto outputVolume = outSpatialShape[0];\n  for (int i = 1; i < outSpatialShape.size(); ++i) {\n    outputVolume *= outSpatialShape[i];\n  }\n  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, \"error\");\n  torch::Tensor indicePairs =\n      torch::full({kernelVolume, 2, numAct}, -1,\n                  torch::dtype(torch::kInt32).device(indices.device()));\n  torch::Tensor indiceNum = torch::zeros(\n      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));\n  int64_t numActOut = -1;\n  tv::SimpleVector<int, NDim> outSpatialShape32;\n  tv::SimpleVector<int, NDim> kernelSize32;\n  tv::SimpleVector<int, NDim> stride32;\n  tv::SimpleVector<int, NDim> padding32;\n  tv::SimpleVector<int, NDim> dilation32;\n  auto indicePairUnique = torch::full(\n      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),\n      
torch::dtype(torch::kInt32).device(indices.device()));\n  for (int i = 0; i < NDim; ++i) {\n    outSpatialShape32.push_back(outSpatialShape[i]);\n    kernelSize32.push_back(kernelSize[i]);\n    if (subM) {\n      stride32.push_back(1);\n      padding32.push_back(kernelSize[i] / 2);\n      dilation32.push_back(dilation[i]);\n    } else {\n      stride32.push_back(stride[i]);\n      padding32.push_back(padding[i]);\n      dilation32.push_back(dilation[i]);\n    }\n  }\n  if (subM) {\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose);\n      gridOut.fill_(-1);\n    } else {\n      auto getIndicePairFtor =\n          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),\n          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,\n          dilation32, outSpatialShape32, transpose, true);\n    }\n    return {indices, indicePairs, indiceNum};\n  } else {\n    torch::Tensor outInds =\n        torch::zeros({numAct * kernelVolume, coorDim + 1},\n                     torch::dtype(torch::kInt32).device(indices.device()));\n    if (indices.device().type() == torch::kCPU) {\n      auto getIndicePairFtor =\n          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();\n      numActOut = getIndicePairFtor(\n          tv::CPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), 
tv::torch2tv<int>(indiceNum),\n          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,\n          transpose, true);\n      gridOut.fill_(-1);\n    } else {\n      auto getIndicePairFtorP1 =\n          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,\n                                                 NDim>();\n      auto getIndicePairFtorP2 =\n          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,\n                                                 NDim>();\n      numActOut = getIndicePairFtorP1(\n          tv::TorchGPU(), tv::torch2tv<const int>(indices),\n          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,\n          padding32, dilation32, outSpatialShape32, transpose);\n      if (numActOut > 0) {\n        auto res = torch::_unique(indicePairUnique);\n        indicePairUnique = std::get<0>(res);\n        numActOut = getIndicePairFtorP2(\n            tv::TorchGPU(), tv::torch2tv<const int>(indices),\n            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),\n            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),\n            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,\n            true);\n      }\n    }\n    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};\n  }\n}\n\ntorch::Tensor IndiceConvForwardMUSAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,\n    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,\n    int64_t _subM) {\n  c10::musa::MUSAGuard device_guard(features.device());\n  bool subM = _subM != 0;\n  bool inverse = _inverse != 0;\n  auto device = features.device().type();\n  auto ndim = filters.dim() - 2;\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto numOutPlanes = filters.size(ndim 
+ 1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto indicePairMaxSizeIter =\n      std::max_element(indicePairNumCpu.data_ptr<int>(),\n                       indicePairNumCpu.data_ptr<int>() + kernelVolume);\n  int indicePairMaxOffset =\n      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();\n  int indicePairMaxSize = *indicePairMaxSizeIter;\n\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n\n  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);\n  torch::Tensor inputBuffer =\n      torch::zeros({indicePairMaxSize, numInPlanes}, options);\n  torch::Tensor outputBuffer =\n      torch::zeros({indicePairMaxSize, numOutPlanes}, options);\n  filters = filters.view({-1, numInPlanes, numOutPlanes});\n  if (subM) {\n    torch::mm_out(output, features, filters[indicePairMaxOffset]);\n  }\n  double totalGatherTime = 0;\n  double totalGEMMTime = 0;\n  double totalSAddTime = 0;\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {\n      continue;\n    }\n\n    AT_DISPATCH_FLOATING_TYPES(\n        features.scalar_type(), \"IndiceConvForwardKernel\", [&] {\n          auto outputBufferBlob = torch::from_blob(\n              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);\n          auto inputBufferBlob = torch::from_blob(\n              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);\n\n          if (device == torch::kCPU) {\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;\n            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n          } else {\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n        
        gatherFtor;\n            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            TV_CHECK_MUSA_ERR();\n            /* slower than SparseGatherFunctor, may due to int->long conversion\n            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);\n            auto indicePairBlob =\n            torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},\n            indicePairOptions); torch::index_select_out(inputBufferBlob,\n            features, 0, indicePairBlob);*/\n          }\n          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);\n\n          if (device == torch::kCPU) {\n            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::CPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n          } else {\n            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),\n                tv::torch2tv<const scalar_t>(outputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,\n                true);\n            TV_CHECK_MUSA_ERR();\n          }\n        });\n  }\n  return output;\n}\n\nstd::vector<torch::Tensor> IndiceConvBackwardMUSAKernelLauncher(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  c10::musa::MUSAGuard device_guard(features.device());\n  bool subM = _subM != 0;\n  bool inverse = _inverse != 
0;\n\n  auto device = features.device().type();\n  auto ndim = filters.dim() - 2;\n  auto kernelVolume = indicePairs.size(0);\n  auto numInPlanes = features.size(1);\n  auto numOutPlanes = filters.size(ndim + 1);\n  auto indicePairNumCpu = indiceNum.to({torch::kCPU});\n  auto indicePairMaxSizeIter =\n      std::max_element(indicePairNumCpu.data_ptr<int>(),\n                       indicePairNumCpu.data_ptr<int>() + kernelVolume);\n  int indicePairMaxOffset =\n      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();\n  int indicePairMaxSize = *indicePairMaxSizeIter;\n  auto options =\n      torch::TensorOptions().dtype(features.dtype()).device(features.device());\n  auto filterShape = filters.sizes();\n  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);\n  torch::Tensor filtersGrad = torch::zeros(filterShape, options);\n  torch::Tensor inputBuffer =\n      torch::zeros({indicePairMaxSize, numInPlanes}, options);\n  torch::Tensor outputBuffer =\n      torch::zeros({indicePairMaxSize, numOutPlanes}, options);\n\n  filters = filters.view({-1, numInPlanes, numOutPlanes});\n  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});\n  if (subM) {\n    auto filterGradSub = filtersGrad[indicePairMaxOffset];\n    torch::mm_out(filterGradSub, features.t(), outGrad);\n    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());\n  }\n  for (int i = 0; i < kernelVolume; ++i) {\n    auto nHot = indicePairNumCpu.data_ptr<int>()[i];\n    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {\n      continue;\n    }\n\n    AT_DISPATCH_FLOATING_TYPES(\n        features.scalar_type(), \"IndiceConvBackwardKernel\", [&] {\n          if (device == torch::kCPU) {\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;\n            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;\n            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const 
scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            gatherFtorOut(\n                tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),\n                tv::torch2tv<const scalar_t>(outGrad),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),\n                nHot);\n          } else {\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n                gatherFtor;\n            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>\n                gatherFtorOut;\n            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),\n                       tv::torch2tv<const scalar_t>(features),\n                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),\n                       nHot);\n            TV_CHECK_MUSA_ERR();\n            gatherFtorOut(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),\n                tv::torch2tv<const scalar_t>(outGrad),\n                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),\n                nHot);\n            TV_CHECK_MUSA_ERR();\n          }\n          auto filterGradSub = filtersGrad[i];\n          auto outputBufferBlob = torch::from_blob(\n              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);\n          auto inputBufferBlob = torch::from_blob(\n              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);\n\n          torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);\n          torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());\n          if (device == torch::kCPU) {\n            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),\n                tv::torch2tv<const scalar_t>(inputBuffer),\n                tv::torch2tv<const 
int>(indicePairs).subview(i, inverse), nHot);\n          } else {\n            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>\n                scatterFtor;\n            scatterFtor(\n                tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),\n                tv::torch2tv<const scalar_t>(inputBuffer),\n                tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);\n            TV_CHECK_MUSA_ERR();\n          }\n        });\n  }\n  return {inputGrad, filtersGrad.view(filterShape)};\n}\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher<2>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher<3>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher<4>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher<2>(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> 
spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher<3>(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/stack_ball_query_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"stack_ball_query_musa_kernel.muh\"\n#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))\n\nvoid StackBallQueryForwardMUSAKernelLauncher(float max_radius, int nsample,\n                                             const Tensor new_xyz,\n                                             const Tensor new_xyz_batch_cnt,\n                                             const Tensor xyz,\n                                             const Tensor xyz_batch_cnt,\n                                             Tensor idx) {\n  c10::musa::MUSAGuard device_guard(new_xyz.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  //   const float *new_xyz_ptr = new_xyz.data_ptr<float>();\n  //   const float *xyz_ptr = xyz.data_ptr<float>();\n  //   const int *new_xyz_batch_cnt_ptr = new_xyz_batch_cnt.data_ptr<int>();\n  //   const int *xyz_batch_cnt_ptr = xyz_batch_cnt.data_ptr<int>();\n  //   int *idx_ptr = idx.data_ptr<int>();\n\n  int B = xyz_batch_cnt.size(0);\n  int M = new_xyz.size(0);\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      new_xyz.scalar_type(), \"stack_ball_query_forward_musa_kernel\", [&] {\n        stack_ball_query_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                B, M, max_radius, nsample, new_xyz.data_ptr<scalar_t>(),\n                new_xyz_batch_cnt.data_ptr<int>(), xyz.data_ptr<scalar_t>(),\n                xyz_batch_cnt.data_ptr<int>(), idx.data_ptr<int>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/stack_group_points_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"stack_group_points_musa_kernel.muh\"\n\nvoid StackGroupPointsForwardMUSAKernelLauncher(\n    int b, int c, int m, int nsample, const Tensor features_tensor,\n    const Tensor features_batch_cnt_tensor, const Tensor idx_tensor,\n    const Tensor idx_batch_cnt_tensor, Tensor out_tensor) {\n  // points: (B, C, N)\n  // idx: (B, npoints, nsample)\n  // output:\n  //      out: (B, C, npoints, nsample)\n  c10::musa::MUSAGuard device_guard(features_tensor.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      features_tensor.scalar_type(), \"stack_group_points_forward_musa_kernel\",\n      [&] {\n        stack_group_points_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, m, nsample, features_tensor.data_ptr<scalar_t>(),\n                features_batch_cnt_tensor.data_ptr<int>(),\n                idx_tensor.data_ptr<int>(),\n                idx_batch_cnt_tensor.data_ptr<int>(),\n                out_tensor.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid StackGroupPointsBackwardMUSAKernelLauncher(\n    int b, int c, int m, int n, int nsample, const Tensor grad_out_tensor,\n    const Tensor idx_tensor, const Tensor idx_batch_cnt_tensor,\n    const Tensor features_batch_cnt_tensor, Tensor grad_features_tensor) {\n  c10::musa::MUSAGuard device_guard(grad_features_tensor.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  dim3 blocks(DIVUP(m * c * nsample, THREADS_PER_BLOCK));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  
AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_features_tensor.scalar_type(),\n      \"stack_group_points_backward_musa_kernel\", [&] {\n        stack_group_points_backward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, m, n, nsample, grad_out_tensor.data_ptr<scalar_t>(),\n                idx_tensor.data_ptr<int>(),\n                idx_batch_cnt_tensor.data_ptr<int>(),\n                features_batch_cnt_tensor.data_ptr<int>(),\n                grad_features_tensor.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/sync_bn_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"sync_bn_musa_kernel.muh\"\n\n// Launches sync_bn_forward_mean_musa_kernel with one block per channel on the\n// current MUSA stream; dims 0/1/2 of input are read as num/channels/spatial.\nvoid SyncBNForwardMeanMUSAKernelLauncher(const Tensor input, Tensor mean) {\n  int num = input.size(0);\n  int channels = input.size(1);\n  int spatial = input.size(2);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      input.scalar_type(), \"sync_bn_forward_mean_musa_kernel\", [&] {\n        sync_bn_forward_mean_musa_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                input.data_ptr<scalar_t>(), mean.data_ptr<float>(), num,\n                channels, spatial);\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\n// Launches sync_bn_forward_var_musa_kernel with one block per channel on the\n// current MUSA stream; dims 0/1/2 of input are read as num/channels/spatial.\nvoid SyncBNForwardVarMUSAKernelLauncher(const Tensor input, const Tensor mean,\n                                        Tensor var) {\n  int num = input.size(0);\n  int channels = input.size(1);\n  int spatial = input.size(2);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  // The dispatch name is what ATen reports for an unsupported dtype, so it\n  // names the kernel launched here.\n  AT_DISPATCH_FLOATING_TYPES(\n      input.scalar_type(), \"sync_bn_forward_var_musa_kernel\", [&] {\n        sync_bn_forward_var_musa_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),\n                var.data_ptr<float>(), num, channels, spatial);\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\n// Launches sync_bn_forward_output_musa_kernel with one block per channel on\n// the current MUSA stream; dims 0/1/2 of input are read as\n// num/channels/spatial.\nvoid SyncBNForwardOutputMUSAKernelLauncher(\n    const Tensor input, const Tensor mean, const Tensor var,\n    Tensor running_mean, Tensor running_var, const Tensor weight,\n    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,\n    float momentum, int group_size) {\n  int num = input.size(0);\n  int channels = input.size(1);\n  int spatial = input.size(2);\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  // The dispatch name is what ATen reports for an unsupported dtype, so it\n  // names the kernel launched here.\n  AT_DISPATCH_FLOATING_TYPES(\n      input.scalar_type(), \"sync_bn_forward_output_musa_kernel\", [&] {\n        sync_bn_forward_output_musa_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                input.data_ptr<scalar_t>(), mean.data_ptr<float>(),\n                var.data_ptr<float>(), running_mean.data_ptr<float>(),\n                running_var.data_ptr<float>(), weight.data_ptr<float>(),\n                bias.data_ptr<float>(), norm.data_ptr<float>(),\n                std.data_ptr<float>(), output.data_ptr<scalar_t>(), num,\n                channels, spatial, eps, momentum, group_size);\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\n// Launches sync_bn_backward_param_musa_kernel with one block per channel on\n// the current MUSA stream; dims 0/1/2 of grad_output are read as\n// num/channels/spatial.\nvoid SyncBNBackwardParamMUSAKernelLauncher(const Tensor grad_output,\n                                           const Tensor norm,\n                                           Tensor grad_weight,\n                                           Tensor grad_bias) {\n  int num = grad_output.size(0);\n  int channels = grad_output.size(1);\n  int spatial = grad_output.size(2);\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_output.scalar_type(), \"sync_bn_backward_param_musa_kernel\", [&] {\n        sync_bn_backward_param_musa_kernel<scalar_t>\n            <<<channels, THREADS_PER_BLOCK, 0, stream>>>(\n                grad_output.data_ptr<scalar_t>(), norm.data_ptr<float>(),\n                grad_weight.data_ptr<float>(), grad_bias.data_ptr<float>(), num,\n                channels, spatial);\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\n// Launches sync_bn_backward_data_musa_kernel on the current MUSA stream with\n// GET_BLOCKS(grad_input.numel()) blocks; dims 0/1/2 of grad_input are read as\n// num/channels/spatial.\nvoid SyncBNBackwardDataMUSAKernelLauncher(const Tensor grad_output,\n                                          const Tensor weight,\n                                          const Tensor grad_weight,\n                                          const Tensor grad_bias,\n                                          const Tensor norm, const Tensor std,\n                                          Tensor grad_input) {\n  int output_size = grad_input.numel();\n  int num = grad_input.size(0);\n  int channels = grad_input.size(1);\n  int spatial = grad_input.size(2);\n\n  c10::musa::MUSAGuard device_guard(grad_input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES(\n      grad_output.scalar_type(), \"sync_bn_backward_data_musa_kernel\", [&] {\n        sync_bn_backward_data_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                weight.data_ptr<float>(), grad_weight.data_ptr<float>(),\n                grad_bias.data_ptr<float>(), norm.data_ptr<float>(),\n                std.data_ptr<float>(), grad_input.data_ptr<scalar_t>(), num,\n                channels, spatial);\n      });\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/three_interpolate_musa.mu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"three_interpolate_musa_kernel.muh\"\n\nvoid ThreeInterpolateForwardMUSAKernelLauncher(int b, int c, int m, int n,\n                                               const Tensor points,\n                                               const Tensor idx,\n                                               const Tensor weight,\n                                               Tensor out) {\n  // points: (B, C, M)\n  // idx: (B, N, 3)\n  // weight: (B, N, 3)\n  // output:\n  //      out: (B, C, N)\n\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      points.scalar_type(), \"three_interpolate_forward_musa_kernel\", [&] {\n        three_interpolate_forward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, m, n, points.data_ptr<scalar_t>(), idx.data_ptr<int>(),\n                weight.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid ThreeInterpolateBackwardMUSAKernelLauncher(int b, int c, int n, int m,\n                                                const Tensor grad_out,\n                                                const Tensor idx,\n                                                const Tensor weight,\n                                                Tensor grad_points) {\n  // grad_out: (B, C, N)\n  // weight: (B, N, 3)\n  // output:\n  //      grad_points: (B, C, M)\n\n  c10::musa::MUSAGuard device_guard(grad_out.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // 
blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_out.scalar_type(), \"three_interpolate_backward_musa_kernel\", [&] {\n        three_interpolate_backward_musa_kernel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                b, c, n, m, grad_out.data_ptr<scalar_t>(), idx.data_ptr<int>(),\n                weight.data_ptr<scalar_t>(), grad_points.data_ptr<scalar_t>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/three_nn_musa.mu",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"three_nn_musa_kernel.muh\"\n\nvoid ThreeNNForwardMUSAKernelLauncher(int b, int n, int m, const Tensor unknown,\n                                      const Tensor known, Tensor dist2,\n                                      Tensor idx) {\n  // unknown: (B, N, 3)\n  // known: (B, M, 3)\n  // output:\n  //      dist2: (B, N, 3)\n  //      idx: (B, N, 3)\n\n  c10::musa::MUSAGuard device_guard(unknown.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  // blockIdx.x(col), blockIdx.y(row)\n  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES(\n      unknown.scalar_type(), \"three_nn_forward_musa_kernel\", [&] {\n        three_nn_forward_musa_kernel<scalar_t><<<blocks, threads, 0, stream>>>(\n            b, n, m, unknown.data_ptr<scalar_t>(), known.data_ptr<scalar_t>(),\n            dist2.data_ptr<scalar_t>(), idx.data_ptr<int>());\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/tin_shift_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_musa_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#include \"tin_shift_musa_kernel.muh\"\n\nvoid TINShiftForwardMUSAKernelLauncher(Tensor input, Tensor shift,\n                                       Tensor output) {\n  int output_size = output.numel();\n  int batch_size = input.size(0);\n  int t_size = input.size(1);\n  int channels = input.size(2);\n  int hw_size = input.size(3);\n  int group_size = shift.size(1);\n  int group_channel = channels / group_size;\n  int num_kernels = batch_size * hw_size * channels;\n\n  c10::musa::MUSAGuard device_guard(input.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      input.scalar_type(), \"tin_shift_forward_musa_kernel\", [&] {\n        tin_shift_forward_musa_kernel<scalar_t>\n            <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, input.data_ptr<scalar_t>(), shift.data_ptr<int>(),\n                output.data_ptr<scalar_t>(), batch_size, channels, t_size,\n                hw_size, group_size, group_channel);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n\nvoid TINShiftBackwardMUSAKernelLauncher(Tensor grad_output, Tensor shift,\n                                        Tensor grad_input) {\n  int output_size = grad_output.numel();\n  int batch_size = grad_output.size(0);\n  int t_size = grad_output.size(1);\n  int channels = grad_output.size(2);\n  int hw_size = grad_output.size(3);\n  int group_size = shift.size(1);\n  int group_channel = channels / group_size;\n  int num_kernels = batch_size * hw_size * channels;\n\n  c10::musa::MUSAGuard device_guard(grad_output.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(\n      grad_output.scalar_type(), \"tin_shift_backward_musa_kernel\", [&] {\n        tin_shift_backward_musa_kernel<scalar_t>\n            
<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(\n                output_size, grad_output.data_ptr<scalar_t>(),\n                shift.data_ptr<int>(), grad_input.data_ptr<scalar_t>(),\n                batch_size, channels, t_size, hw_size, group_size,\n                group_channel);\n      });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/upfirdn2d_kernel.mu",
    "content": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n#include <c10/util/Half.h>\n#include <torch/types.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#if MUSA_ARCH > 21\nstruct upfirdn2d_kernel_params {\n  const void *x;\n  const float *f;\n  void *y;\n\n  int2 up;\n  int2 down;\n  int2 pad0;\n  int flip;\n  float gain;\n\n  int4 inSize;  // [width, height, channel, batch]\n  int4 inStride;\n  int2 filterSize;  // [width, height]\n  int2 filterStride;\n  int4 outSize;  // [width, height, channel, batch]\n  int4 outStride;\n  int sizeMinor;\n  int sizeMajor;\n\n  int loopMinor;\n  int loopMajor;\n  int loopX;\n  int launchMinor;\n  int launchMajor;\n};\n\n//------------------------------------------------------------------------\n// MUSA kernel specialization.\n\nstruct upfirdn2d_kernel_spec {\n  void *kernel;\n  int tileOutW;\n  int tileOutH;\n  int loopMinor;\n  int loopX;\n};\n\n//------------------------------------------------------------------------\n// MUSA kernel selection.\n\ntemplate <class T>\nupfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params &p);\n//------------------------------------------------------------------------\n\n// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors retain all intellectual property\n// and proprietary rights in and to this software, related documentation\n// and any modifications thereto.  
Any use, reproduction, disclosure or\n// distribution of this software and related documentation without an express\n// license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n//------------------------------------------------------------------------\n// Helpers.\n\ntemplate <class T>\nstruct InternalType;\ntemplate <>\nstruct InternalType<double> {\n  typedef double scalar_t;\n};\ntemplate <>\nstruct InternalType<float> {\n  typedef float scalar_t;\n};\ntemplate <>\nstruct InternalType<c10::Half> {\n  typedef float scalar_t;\n};\n\nstatic __device__ __forceinline__ int floor_div(int a, int b) {\n  int t = 1 - a / b;\n  return (a + t * b) / b - t;\n}\n\n//------------------------------------------------------------------------\n// Generic MUSA implementation for large filters.\n\ntemplate <class T>\nstatic __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n\n  // Calculate thread index.\n  int minorBase = blockIdx.x * blockDim.x + threadIdx.x;\n  int outY = minorBase / p.launchMinor;\n  minorBase -= outY * p.launchMinor;\n  int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y;\n  int majorBase = blockIdx.z * p.loopMajor;\n  if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor)\n    return;\n\n  // Setup Y receptive field.\n  int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y;\n  int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y);\n  int h =\n      min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY;\n  int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y;\n  if (p.flip) filterY = p.filterSize.y - 1 - filterY;\n\n  // Loop over major, minor, and X.\n  for (int majorIdx = 0, major = majorBase;\n       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++)\n    for (int minorIdx = 0, minor = minorBase;\n         minorIdx < p.loopMinor & minor < p.sizeMinor;\n         minorIdx++, minor += p.launchMinor) {\n 
     int nc = major * p.sizeMinor + minor;\n      int n = nc / p.inSize.z;\n      int c = nc - n * p.inSize.z;\n      for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x;\n           loopX++, outX += blockDim.y) {\n        // Setup X receptive field.\n        int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x;\n        int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x);\n        int w =\n            min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) -\n            inX;\n        int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x;\n        if (p.flip) filterX = p.filterSize.x - 1 - filterX;\n\n        // Initialize pointers.\n        const T *xp =\n            &((const T *)p.x)[inX * p.inStride.x + inY * p.inStride.y +\n                              c * p.inStride.z + n * p.inStride.w];\n        const float *fp =\n            &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y];\n        int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x;\n        int filterStepY = ((p.flip) ? 
p.up.y : -p.up.y) * p.filterStride.y;\n\n        // Inner loop.\n        scalar_t v = 0;\n        for (int y = 0; y < h; y++) {\n          for (int x = 0; x < w; x++) {\n            v += (scalar_t)(*xp) * (scalar_t)(*fp);\n            xp += p.inStride.x;\n            fp += filterStepX;\n          }\n          xp += p.inStride.y - w * p.inStride.x;\n          fp += filterStepY - w * filterStepX;\n        }\n\n        // Store result.\n        v *= p.gain;\n        ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +\n                   c * p.outStride.z + n * p.outStride.w] = (T)v;\n      }\n    }\n}\n\n//------------------------------------------------------------------------\n// Specialized MUSA implementation for small filters.\n\ntemplate <class T, int upx, int upy, int downx, int downy, int filterW,\n          int filterH, int tileOutW, int tileOutH, int loopMinor>\nstatic __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p) {\n  typedef typename InternalType<T>::scalar_t scalar_t;\n  const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1;\n  const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1;\n  __shared__ volatile scalar_t sf[filterH][filterW];\n  __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor];\n\n  // Calculate tile index.\n  int minorBase = blockIdx.x;\n  int tileOutY = minorBase / p.launchMinor;\n  minorBase -= tileOutY * p.launchMinor;\n  minorBase *= loopMinor;\n  tileOutY *= tileOutH;\n  int tileOutXBase = blockIdx.y * p.loopX * tileOutW;\n  int majorBase = blockIdx.z * p.loopMajor;\n  if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y |\n      majorBase >= p.sizeMajor)\n    return;\n\n  // Load filter (flipped).\n  for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW;\n       tapIdx += blockDim.x) {\n    int fy = tapIdx / filterW;\n    int fx = tapIdx - fy * filterW;\n    scalar_t v = 0;\n    if (fx < p.filterSize.x & fy < p.filterSize.y) {\n      int ffx = (p.flip) ? 
fx : p.filterSize.x - 1 - fx;\n      int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy;\n      v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y];\n    }\n    sf[fy][fx] = v;\n  }\n\n  // Loop over major and X.\n  for (int majorIdx = 0, major = majorBase;\n       majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) {\n    int baseNC = major * p.sizeMinor + minorBase;\n    int n = baseNC / p.inSize.z;\n    int baseC = baseNC - n * p.inSize.z;\n    for (int loopX = 0, tileOutX = tileOutXBase;\n         loopX < p.loopX & tileOutX < p.outSize.x;\n         loopX++, tileOutX += tileOutW) {\n      // Load input pixels.\n      int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x;\n      int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y;\n      int tileInX = floor_div(tileMidX, upx);\n      int tileInY = floor_div(tileMidY, upy);\n      __syncthreads();\n      for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor;\n           inIdx += blockDim.x) {\n        int relC = inIdx;\n        int relInX = relC / loopMinor;\n        int relInY = relInX / tileInW;\n        relC -= relInX * loopMinor;\n        relInX -= relInY * tileInW;\n        int c = baseC + relC;\n        int inX = tileInX + relInX;\n        int inY = tileInY + relInY;\n        scalar_t v = 0;\n        if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y &\n            c < p.inSize.z)\n          v = (scalar_t)(\n              (const T *)p.x)[inX * p.inStride.x + inY * p.inStride.y +\n                              c * p.inStride.z + n * p.inStride.w];\n        sx[relInY][relInX][relC] = v;\n      }\n\n      // Loop over output pixels.\n      __syncthreads();\n      for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor;\n           outIdx += blockDim.x) {\n        int relC = outIdx;\n        int relOutX = relC / loopMinor;\n        int relOutY = relOutX / tileOutW;\n        relC -= relOutX * loopMinor;\n        relOutX -= relOutY * tileOutW;\n 
       int c = baseC + relC;\n        int outX = tileOutX + relOutX;\n        int outY = tileOutY + relOutY;\n\n        // Setup receptive field.\n        int midX = tileMidX + relOutX * downx;\n        int midY = tileMidY + relOutY * downy;\n        int inX = floor_div(midX, upx);\n        int inY = floor_div(midY, upy);\n        int relInX = inX - tileInX;\n        int relInY = inY - tileInY;\n        int filterX = (inX + 1) * upx - midX - 1;  // flipped\n        int filterY = (inY + 1) * upy - midY - 1;  // flipped\n\n        // Inner loop.\n        if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z) {\n          scalar_t v = 0;\n#pragma unroll\n          for (int y = 0; y < filterH / upy; y++)\n#pragma unroll\n            for (int x = 0; x < filterW / upx; x++)\n              v += sx[relInY + y][relInX + x][relC] *\n                   sf[filterY + y * upy][filterX + x * upx];\n          v *= p.gain;\n          ((T *)p.y)[outX * p.outStride.x + outY * p.outStride.y +\n                     c * p.outStride.z + n * p.outStride.w] = (T)v;\n        }\n      }\n    }\n  }\n}\n\n//------------------------------------------------------------------------\n// MUSA kernel selection.\n\ntemplate <class T>\nupfirdn2d_kernel_spec choose_upfirdn2d_kernel(\n    const upfirdn2d_kernel_params &p) {\n  int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y;\n  upfirdn2d_kernel_spec spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 1,\n                                4};  // contiguous\n  if (s == 1)\n    spec = {(void *)upfirdn2d_kernel_large<T>, -1, -1, 4, 1};  // channels_last\n\n  // No up/downsampling.\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 64, 32, 
1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 7 && fy <= 7)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 5 && fy <= 5)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 3 && fy <= 3)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 32, 32, 1>,\n              32, 32, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n   
 if (s == 1 && fx <= 7 && fy <= 7)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 7, 7, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 6, 6, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 5 && fy <= 5)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 5, 5, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 4, 4, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 3 && fy <= 3)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 3, 3, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 24, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 16, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 8, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 24, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 16, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 1, 1, 8, 1, 128, 16>,\n              1, 128, 16, 1};\n  }\n\n  // 2x upsampling.\n  if (p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 16)\n      spec = {(void 
*)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 64, 16, 1>,\n              64, 16, 1, 1};\n    if (s != 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 64, 16, 1>,\n              64, 16, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 24, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 16, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 8, 8, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 6, 6, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 4, 4, 16, 16, 8>,\n              16, 16, 8, 1};\n    if (s == 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 2, 1, 1, 2, 2, 16, 16, 8>,\n              16, 16, 8, 1};\n  }\n  if (p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    
if (s != 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 24, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 16, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 2, 1, 1, 1, 8, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 32, 32, 1>,\n              32, 32, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 24, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 16, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 2, 1, 1, 1, 8, 1, 128, 16>,\n              1, 128, 16, 1};\n  }\n\n  // 2x downsampling.\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 
1, 1, 2, 2, 16, 16, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 32, 8, 1>, 32,\n              8, 1, 1};\n    if (s != 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 32, 8, 1>, 32,\n              8, 1, 1};\n    if (s != 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 32, 8, 1>, 32,\n              8, 1, 1};\n    if (s != 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 32, 8, 1>, 32,\n              8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 24, 24, 16, 16, 1>,\n              16, 16, 1, 1};\n    if (s == 1 && fx <= 16 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 16, 16, 16, 16, 1>,\n              16, 16, 1, 1};\n    if (s == 1 && fx <= 8 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 8, 8, 8, 8, 8>, 8,\n              8, 8, 1};\n    if (s == 1 && fx <= 6 && fy <= 6)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 6, 6, 8, 8, 8>, 8,\n              8, 8, 1};\n    if (s == 1 && fx <= 4 && fy <= 4)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 4, 4, 8, 8, 8>, 8,\n              8, 8, 1};\n    if (s == 1 && fx <= 2 && fy <= 2)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 2, 2, 2, 8, 8, 8>, 8,\n              8, 8, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 8, 1>,\n              64, 8, 1, 1};\n    if (s != 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 8, 1>,\n              64, 8, 1, 1};\n    if (s != 1 && fx <= 8 && fy <= 1)\n      spec = {(void 
*)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 8, 1>, 64,\n              8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 24 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 24, 1, 64, 1, 8>,\n              64, 1, 8, 1};\n    if (s == 1 && fx <= 16 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 16, 1, 64, 1, 8>,\n              64, 1, 8, 1};\n    if (s == 1 && fx <= 8 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 2, 1, 8, 1, 64, 1, 8>, 64,\n              1, 8, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 32, 16, 1>,\n              32, 16, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 32, 16, 1>,\n              32, 16, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 24)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 24, 1, 64, 8>, 1,\n              64, 8, 1};\n    if (s == 1 && fx <= 1 && fy <= 16)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 16, 1, 64, 8>, 1,\n              64, 8, 1};\n    if (s == 1 && fx <= 1 && fy <= 8)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 2, 1, 8, 1, 64, 8>, 1,\n              64, 8, 1};\n  }\n\n  // 4x upsampling.\n  if (p.up.x == 4 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 48 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 64, 32, 1>,\n              64, 32, 1, 1};\n    if (s != 1 && fx <= 32 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 64, 32, 1>,\n              64, 32, 1, 1};\n    // 
channels_last\n    if (s == 1 && fx <= 48 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 48, 48, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s == 1 && fx <= 32 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 4, 1, 1, 32, 32, 32, 32, 1>,\n              32, 32, 1, 1};\n  }\n  if (p.up.x == 4 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    if (s != 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 8, 1>,\n              128, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 48, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n    if (s == 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 4, 1, 1, 1, 32, 1, 128, 1, 16>,\n              128, 1, 16, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 4 && p.down.x == 1 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 32, 32, 1>,\n              32, 32, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 32, 32, 1>,\n              32, 32, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 48, 1, 128, 16>,\n              1, 128, 16, 1};\n    if (s == 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 4, 1, 1, 1, 32, 1, 128, 16>,\n              1, 128, 16, 1};\n  }\n\n  // 4x downsampling (inefficient).\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 4 && p.down.y == 1) {\n    // contiguous\n    if (s != 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 
1, 32, 8, 1>,\n              32, 8, 1, 1};\n    if (s != 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 8, 1>,\n              32, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 48 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 48, 1, 32, 1, 8>,\n              32, 1, 8, 1};\n    if (s == 1 && fx <= 32 && fy <= 1)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 4, 1, 32, 1, 32, 1, 8>,\n              32, 1, 8, 1};\n  }\n  if (p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 4) {\n    // contiguous\n    if (s != 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 32, 8, 1>,\n              32, 8, 1, 1};\n    if (s != 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 32, 8, 1>,\n              32, 8, 1, 1};\n    // channels_last\n    if (s == 1 && fx <= 1 && fy <= 48)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 48, 1, 32, 8>, 1,\n              32, 8, 1};\n    if (s == 1 && fx <= 1 && fy <= 32)\n      spec = {(void *)upfirdn2d_kernel_small<T, 1, 1, 1, 4, 1, 32, 1, 32, 8>, 1,\n              32, 8, 1};\n  }\n  return spec;\n}\n\n//------------------------------------------------------------------------\n// Template specializations.\n\ntemplate upfirdn2d_kernel_spec choose_upfirdn2d_kernel<double>(\n    const upfirdn2d_kernel_params &p);\ntemplate upfirdn2d_kernel_spec choose_upfirdn2d_kernel<float>(\n    const upfirdn2d_kernel_params &p);\ntemplate upfirdn2d_kernel_spec choose_upfirdn2d_kernel<c10::Half>(\n    const upfirdn2d_kernel_params &p);\n\n//------------------------------------------------------------------------\n\n//------------------------------------------------------------------------\n\ntorch::Tensor upfirdn2d_op(torch::Tensor x, torch::Tensor f, int upx, int upy,\n                           int downx, int downy, int padx0, int padx1,\n                    
       int pady0, int pady1, bool flip, float gain) {\n  // Validate arguments.\n  TORCH_CHECK(x.is_privateuseone(), \"x must reside on MUSA device\");\n  TORCH_CHECK(f.device() == x.device(),\n              \"f must reside on the same device as x\");\n  TORCH_CHECK(f.dtype() == torch::kFloat, \"f must be float32\");\n  TORCH_CHECK(x.numel() <= INT_MAX, \"x is too large\");\n  TORCH_CHECK(f.numel() <= INT_MAX, \"f is too large\");\n  TORCH_CHECK(x.numel() > 0, \"x has zero size\");\n  TORCH_CHECK(f.numel() > 0, \"f has zero size\");\n  TORCH_CHECK(x.dim() == 4, \"x must be rank 4\");\n  TORCH_CHECK(f.dim() == 2, \"f must be rank 2\");\n  TORCH_CHECK((x.size(0) - 1) * x.stride(0) + (x.size(1) - 1) * x.stride(1) +\n                      (x.size(2) - 1) * x.stride(2) +\n                      (x.size(3) - 1) * x.stride(3) <=\n                  INT_MAX,\n              \"x memory footprint is too large\");\n  TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, \"f must be at least 1x1\");\n  TORCH_CHECK(upx >= 1 && upy >= 1, \"upsampling factor must be at least 1\");\n  TORCH_CHECK(downx >= 1 && downy >= 1,\n              \"downsampling factor must be at least 1\");\n\n  // Create output tensor.\n  const at::musa::OptionalMUSAGuard device_guard(device_of(x));\n  int outW =\n      ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx;\n  int outH =\n      ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy;\n  TORCH_CHECK(outW >= 1 && outH >= 1, \"output must be at least 1x1\");\n  torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW},\n                                 x.options(), x.suggest_memory_format());\n  TORCH_CHECK(y.numel() <= INT_MAX, \"output is too large\");\n  TORCH_CHECK((y.size(0) - 1) * y.stride(0) + (y.size(1) - 1) * y.stride(1) +\n                      (y.size(2) - 1) * y.stride(2) +\n                      (y.size(3) - 1) * y.stride(3) <=\n                  INT_MAX,\n              \"output memory footprint is 
too large\");\n\n  // Initialize MUSA kernel parameters.\n  upfirdn2d_kernel_params p;\n  p.x = x.data_ptr();\n  p.f = f.data_ptr<float>();\n  p.y = y.data_ptr();\n  p.up = make_int2(upx, upy);\n  p.down = make_int2(downx, downy);\n  p.pad0 = make_int2(padx0, pady0);\n  p.flip = (flip) ? 1 : 0;\n  p.gain = gain;\n  p.inSize =\n      make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));\n  p.inStride = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1),\n                         (int)x.stride(0));\n  p.filterSize = make_int2((int)f.size(1), (int)f.size(0));\n  p.filterStride = make_int2((int)f.stride(1), (int)f.stride(0));\n  p.outSize =\n      make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));\n  p.outStride = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1),\n                          (int)y.stride(0));\n  p.sizeMajor = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z;\n  p.sizeMinor = (p.inStride.z == 1) ? 
p.inSize.z : 1;\n\n  // Choose MUSA kernel.\n  upfirdn2d_kernel_spec spec;\n  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), \"upfirdn2d_musa\", [&] {\n    spec = choose_upfirdn2d_kernel<scalar_t>(p);\n  });\n\n  // Set looping options.\n  p.loopMajor = (p.sizeMajor - 1) / 16384 + 1;\n  p.loopMinor = spec.loopMinor;\n  p.loopX = spec.loopX;\n  p.launchMinor = (p.sizeMinor - 1) / p.loopMinor + 1;\n  p.launchMajor = (p.sizeMajor - 1) / p.loopMajor + 1;\n\n  // Compute grid size.\n  dim3 blockSize, gridSize;\n  if (spec.tileOutW < 0)  // large\n  {\n    blockSize = dim3(4, 32, 1);\n    gridSize =\n        dim3(((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor,\n             (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1, p.launchMajor);\n  } else  // small\n  {\n    blockSize = dim3(256, 1, 1);\n    gridSize =\n        dim3(((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor,\n             (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1, p.launchMajor);\n  }\n\n  // Launch MUSA kernel.\n  void *args[] = {&p};\n#ifdef MMCV_WITH_HIP\n  AT_MUSA_CHECK(hipLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,\n                                c10::musa::getCurrentMUSAStream()));\n#else\n  AT_MUSA_CHECK(musaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0,\n                                 c10::musa::getCurrentMUSAStream()));\n#endif\n\n  return y;\n}\n#else\n#warning \"upfirdn2d is supported when MUSA_ARCH > 21\"\n#endif  //MUSA_ARCH\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/musa/voxelization_musa.mu",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"pytorch_musa_helper.hpp\"\n#include \"voxelization_musa_kernel.muh\"\n\nint HardVoxelizeForwardMUSAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  // current version tooks about 0.04s for one frame on cpu\n  // check device\n\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  const float voxel_x = voxel_size[0];\n  const float voxel_y = voxel_size[1];\n  const float voxel_z = voxel_size[2];\n  const float coors_x_min = coors_range[0];\n  const float coors_y_min = coors_range[1];\n  const float coors_z_min = coors_range[2];\n  const float coors_x_max = coors_range[3];\n  const float coors_y_max = coors_range[4];\n  const float coors_z_max = coors_range[5];\n\n  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);\n  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);\n  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);\n\n  // map points to voxel coors\n  at::Tensor temp_coors =\n      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));\n\n  dim3 grid(std::min(at::musa::ATenCeilDiv(num_points, 512), 4096));\n  dim3 block(512);\n\n  // 1. 
link point to corresponding voxel coors\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"hard_voxelize_kernel\", ([&] {\n        dynamic_voxelize_kernel<scalar_t, int><<<grid, block, 0, stream>>>(\n            points.contiguous().data_ptr<scalar_t>(),\n            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,\n            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,\n            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,\n            NDim);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  // 2. map point to the idx of the corresponding voxel, find duplicate coor\n  // create some temporary variables\n  auto point_to_pointidx = -at::ones(\n      {\n          num_points,\n      },\n      points.options().dtype(at::kInt));\n  auto point_to_voxelidx = -at::ones(\n      {\n          num_points,\n      },\n      points.options().dtype(at::kInt));\n\n  dim3 map_grid(std::min(at::musa::ATenCeilDiv(num_points, 512), 4096));\n  dim3 map_block(512);\n\n  AT_DISPATCH_ALL_TYPES(\n      temp_coors.scalar_type(), \"determin_duplicate\", ([&] {\n        point_to_voxelidx_kernel<int><<<map_grid, map_block, 0, stream>>>(\n            temp_coors.contiguous().data_ptr<int>(),\n            point_to_voxelidx.contiguous().data_ptr<int>(),\n            point_to_pointidx.contiguous().data_ptr<int>(), max_points,\n            max_voxels, num_points, NDim);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  // 3. 
determine voxel num and voxel's coor index\n  // make the logic in the MUSA device could accelerate about 10 times\n  auto coor_to_voxelidx = -at::ones(\n      {\n          num_points,\n      },\n      points.options().dtype(at::kInt));\n  auto voxel_num = at::zeros(\n      {\n          1,\n      },\n      points.options().dtype(at::kInt));  // must be zero from the beginning\n\n  AT_DISPATCH_ALL_TYPES(temp_coors.scalar_type(), \"determin_duplicate\", ([&] {\n                          determin_voxel_num<int><<<1, 1, 0, stream>>>(\n                              num_points_per_voxel.contiguous().data_ptr<int>(),\n                              point_to_voxelidx.contiguous().data_ptr<int>(),\n                              point_to_pointidx.contiguous().data_ptr<int>(),\n                              coor_to_voxelidx.contiguous().data_ptr<int>(),\n                              voxel_num.contiguous().data_ptr<int>(),\n                              max_points, max_voxels, num_points);\n                        }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  // 4. copy point features to voxels\n  // Step 4 & 5 could be parallel\n  auto pts_output_size = num_points * num_features;\n  dim3 cp_grid(std::min(at::musa::ATenCeilDiv(pts_output_size, 512), 4096));\n  dim3 cp_block(512);\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"assign_point_to_voxel\", ([&] {\n        assign_point_to_voxel<float, int><<<cp_grid, cp_block, 0, stream>>>(\n            pts_output_size, points.contiguous().data_ptr<float>(),\n            point_to_voxelidx.contiguous().data_ptr<int>(),\n            coor_to_voxelidx.contiguous().data_ptr<int>(),\n            voxels.contiguous().data_ptr<float>(), max_points, num_features,\n            num_points, NDim);\n      }));\n  //   musaDeviceSynchronize();\n  //   AT_MUSA_CHECK(musaGetLastError());\n\n  // 5. 
copy coors of each voxels\n  auto coors_output_size = num_points * NDim;\n  dim3 coors_cp_grid(\n      std::min(at::musa::ATenCeilDiv(coors_output_size, 512), 4096));\n  dim3 coors_cp_block(512);\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"assign_point_to_voxel\", ([&] {\n        assign_voxel_coors<float, int>\n            <<<coors_cp_grid, coors_cp_block, 0, stream>>>(\n                coors_output_size, temp_coors.contiguous().data_ptr<int>(),\n                point_to_voxelidx.contiguous().data_ptr<int>(),\n                coor_to_voxelidx.contiguous().data_ptr<int>(),\n                coors.contiguous().data_ptr<int>(), num_points, NDim);\n      }));\n\n  AT_MUSA_CHECK(musaGetLastError());\n\n  auto voxel_num_cpu = voxel_num.to(at::kCPU);\n  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];\n\n  return voxel_num_int;\n}\n\nint NondeterministicHardVoxelizeForwardMUSAKernelLauncher(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  if (num_points == 0) return 0;\n\n  dim3 blocks(\n      std::min(at::musa::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));\n  dim3 threads(THREADS_PER_BLOCK);\n\n  const float voxel_x = voxel_size[0];\n  const float voxel_y = voxel_size[1];\n  const float voxel_z = voxel_size[2];\n  const float coors_x_min = coors_range[0];\n  const float coors_y_min = coors_range[1];\n  const float coors_z_min = coors_range[2];\n  const float coors_x_max = coors_range[3];\n  const float coors_y_max = coors_range[4];\n  const float coors_z_max = coors_range[5];\n\n  const int grid_x = round((coors_x_max - coors_x_min) / 
voxel_x);\n  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);\n  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);\n\n  // map points to voxel coors\n  at::Tensor temp_coors =\n      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));\n\n  // 1. link point to corresponding voxel coors\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"hard_voxelize_kernel\", ([&] {\n        dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(\n            points.contiguous().data_ptr<scalar_t>(),\n            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,\n            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,\n            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,\n            NDim);\n      }));\n\n  at::Tensor coors_map;\n  at::Tensor reduce_count;\n\n  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);\n\n  std::tie(temp_coors, coors_map, reduce_count) =\n      at::unique_dim(coors_clean, 0, true, true, false);\n\n  if (temp_coors[0][0].lt(0).item<bool>()) {\n    // the first element of temp_coors is (-1,-1,-1) and should be removed\n    temp_coors = temp_coors.slice(0, 1);\n    coors_map = coors_map - 1;\n  }\n\n  int num_coors = temp_coors.size(0);\n  temp_coors = temp_coors.to(at::kInt);\n  coors_map = coors_map.to(at::kInt);\n\n  at::Tensor coors_count = at::zeros({1}, coors_map.options());\n  at::Tensor coors_order = at::empty({num_coors}, coors_map.options());\n  at::Tensor pts_id = at::zeros({num_points}, coors_map.options());\n  reduce_count = at::zeros({num_coors}, coors_map.options());\n\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"get_assign_pos\", ([&] {\n        nondeterministic_get_assign_pos<<<blocks, threads, 0, stream>>>(\n            num_points, coors_map.contiguous().data_ptr<int32_t>(),\n            pts_id.contiguous().data_ptr<int32_t>(),\n            
coors_count.contiguous().data_ptr<int32_t>(),\n            reduce_count.contiguous().data_ptr<int32_t>(),\n            coors_order.contiguous().data_ptr<int32_t>());\n      }));\n\n  AT_DISPATCH_ALL_TYPES(\n      points.scalar_type(), \"assign_point_to_voxel\", ([&] {\n        nondeterministic_assign_point_voxel<scalar_t>\n            <<<blocks, threads, 0, stream>>>(\n                num_points, points.contiguous().data_ptr<scalar_t>(),\n                coors_map.contiguous().data_ptr<int32_t>(),\n                pts_id.contiguous().data_ptr<int32_t>(),\n                temp_coors.contiguous().data_ptr<int32_t>(),\n                reduce_count.contiguous().data_ptr<int32_t>(),\n                coors_order.contiguous().data_ptr<int32_t>(),\n                voxels.contiguous().data_ptr<scalar_t>(),\n                coors.contiguous().data_ptr<int32_t>(),\n                num_points_per_voxel.contiguous().data_ptr<int32_t>(),\n                max_voxels, max_points, num_features, NDim);\n      }));\n  AT_MUSA_CHECK(musaGetLastError());\n  return max_voxels < num_coors ? 
max_voxels : num_coors;\n}\n\nvoid DynamicVoxelizeForwardMUSAKernelLauncher(\n    const at::Tensor &points, at::Tensor &coors,\n    const std::vector<float> voxel_size, const std::vector<float> coors_range,\n    const int NDim = 3) {\n  // current version takes about 0.04s for one frame on cpu\n  // check device\n\n  c10::musa::MUSAGuard device_guard(points.device());\n  musaStream_t stream = c10::musa::getCurrentMUSAStream();\n\n  const int num_points = points.size(0);\n  const int num_features = points.size(1);\n\n  const float voxel_x = voxel_size[0];\n  const float voxel_y = voxel_size[1];\n  const float voxel_z = voxel_size[2];\n  const float coors_x_min = coors_range[0];\n  const float coors_y_min = coors_range[1];\n  const float coors_z_min = coors_range[2];\n  const float coors_x_max = coors_range[3];\n  const float coors_y_max = coors_range[4];\n  const float coors_z_max = coors_range[5];\n\n  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);\n  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);\n  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);\n\n  const int col_blocks = at::musa::ATenCeilDiv(num_points, THREADS_PER_BLOCK);\n  dim3 blocks(col_blocks);\n  dim3 threads(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_ALL_TYPES(points.scalar_type(), \"dynamic_voxelize_kernel\", [&] {\n    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(\n        points.contiguous().data_ptr<scalar_t>(),\n        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,\n        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,\n        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);\n  });\n\n  AT_MUSA_CHECK(musaGetLastError());\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/nms.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#ifdef MMCV_WITH_DIOPI\n#include <diopi/diopirt.h>\n#include <diopi/functions.h>\n#include <diopi/functions_mmcv.h>\n#include <torch/csrc/utils/pybind.h>\n\n#include \"csrc_dipu/base/basedef.h\"\n#include \"csrc_dipu/diopirt/diopirt_impl.h\"\n#include \"csrc_dipu/runtime/device/deviceapis.h\"\n#include \"csrc_dipu/utils/helpfunc.hpp\"\n\nusing dipu::VENDOR_TYPE;\nusing dipu::diopi_helper::toDiopiScalar;\nusing dipu::diopi_helper::toDiopiTensorHandle;\n#endif\n\nTensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);\n}\n\nTensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,\n                    float iou_threshold, float sigma, float min_score,\n                    int method, int offset) {\n  return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,\n                              sigma, min_score, method, offset);\n}\n\nstd::vector<std::vector<int> > nms_match_impl(Tensor dets,\n                                              float iou_threshold) {\n  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);\n}\n\n#ifdef MMCV_WITH_DIOPI\nTensor nms_diopi(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  auto boxes_p = toDiopiTensorHandle(boxes);\n  diopiDevice_t device;\n  diopiGetTensorDevice(boxes_p, &device);\n  if (device == diopi_host) {\n    return nms_impl(boxes, scores, iou_threshold, offset);\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  Tensor out;\n  auto outp = toDiopiTensorHandle(out);\n  diopiTensorHandle_t* outhandle = &outp;\n  auto scores_p = toDiopiTensorHandle(scores);\n  bool is_mock_cuda = boxes.device().type() == dipu::DIPU_DEVICE_TYPE;\n  if (is_mock_cuda && 
reinterpret_cast<void*>(diopiNmsMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret =\n          diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);\n      if (ret == diopiSuccess) {\n        auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);\n        return *tensorhandle;\n      }\n    } else {\n      auto ret =\n          diopiNmsMmcv(ch, outhandle, boxes_p, scores_p, iou_threshold, offset);\n      if (ret == diopiSuccess) {\n        auto tensorhandle = reinterpret_cast<Tensor*>(*outhandle);\n        return *tensorhandle;\n      }\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op nms\";\n  auto boxes_cpu = boxes.cpu();\n  auto scores_cpu = scores.cpu();\n  return nms_impl(boxes_cpu, scores_cpu, iou_threshold, offset);\n}\n#endif\n\nTensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n#ifdef MMCV_WITH_DIOPI\n  return nms_diopi(boxes, scores, iou_threshold, offset);\n#else\n  return nms_impl(boxes, scores, iou_threshold, offset);\n#endif\n}\n\nTensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,\n               float sigma, float min_score, int method, int offset) {\n  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,\n                      method, offset);\n}\n\nstd::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {\n  return nms_match_impl(dets, iou_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/nms_quadri.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n#include \"pytorch_cpp_helper.hpp\"\n\nTensor nms_quadri_cpu(const Tensor dets, const Tensor scores,\n                      const float iou_threshold);\n\n#ifdef MMCV_WITH_CUDA\nTensor nms_quadri_cuda(const Tensor dets, const Tensor scores,\n                       const Tensor order, const Tensor dets_sorted,\n                       const float iou_threshold, const int multi_label);\n#elif MMCV_WITH_MUSA\nTensor nms_quadri_musa(const Tensor dets, const Tensor scores,\n                       const Tensor order, const Tensor dets_sorted,\n                       const float iou_threshold, const int multi_label);\n#endif\n\n// Interface for Python\n// inline is needed to prevent multiple function definitions when this header is\n// included by different cpps\nTensor nms_quadri(const Tensor dets, const Tensor scores, const Tensor order,\n                  const Tensor dets_sorted, const float iou_threshold,\n                  const int multi_label) {\n  assert(dets.device().is_cuda() == scores.device().is_cuda());\n  if (dets.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    return nms_quadri_cuda(dets, scores, order, dets_sorted, iou_threshold,\n                           multi_label);\n#elif MMCV_WITH_MUSA\n    return nms_quadri_musa(dets, scores, order, dets_sorted, iou_threshold,\n                           multi_label);\n#else\n    AT_ERROR(\"Not compiled with GPU support\");\n#endif\n  }\n\n  return nms_quadri_cpu(dets, scores, iou_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/nms_rotated.cpp",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n// modified from\n// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h\n#include \"pytorch_cpp_helper.hpp\"\n\nTensor nms_rotated_cpu(const Tensor dets, const Tensor scores,\n                       const float iou_threshold);\n\n#ifdef MMCV_WITH_CUDA\nTensor nms_rotated_cuda(const Tensor dets, const Tensor scores,\n                        const Tensor order, const Tensor dets_sorted,\n                        const float iou_threshold, const int multi_label);\n#endif\n\n#ifdef MMCV_WITH_NPU\nTensor nms_rotated_npu(const Tensor dets, const Tensor scores,\n                       const Tensor labels, const float iou_threshold);\n#endif\n\n#ifdef MMCV_WITH_MLU\nTensor nms_rotated_mlu(const Tensor dets, const Tensor scores,\n                       const float iou_threshold);\n#endif\n\n#ifdef MMCV_WITH_MUSA\nTensor nms_rotated_musa(const Tensor dets, const Tensor scores,\n                        const Tensor order, const Tensor dets_sorted,\n                        const float iou_threshold, const int multi_label);\n#endif\n\n// Interface for Python\n// inline is needed to prevent multiple function definitions when this header is\n// included by different cpps\nTensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,\n                   const Tensor dets_sorted, const Tensor labels,\n                   const float iou_threshold, const int multi_label) {\n  assert(dets.device().is_cuda() == scores.device().is_cuda());\n  if (dets.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    return nms_rotated_cuda(dets, scores, order, dets_sorted.contiguous(),\n                            iou_threshold, multi_label);\n#else\n    AT_ERROR(\"Not compiled with GPU support\");\n#endif\n#ifdef MMCV_WITH_XLA\n  } else if (dets.device().type() == at::kXLA) {\n    return nms_rotated_npu(dets, scores, labels, 
iou_threshold);\n#endif\n#ifdef MMCV_WITH_KPRIVATE\n  } else if (dets.device().type() == at::kPrivateUse1) {\n    return nms_rotated_npu(dets, scores, labels, iou_threshold);\n#endif\n#ifdef MMCV_WITH_MLU\n  } else if (dets.device().type() == at::kMLU) {\n    return nms_rotated_mlu(dets, scores, iou_threshold);\n#endif\n#ifdef MMCV_WITH_MUSA\n  } else if (dets.device().type() == ::at::kPrivateUse1) {\n    return nms_rotated_musa(dets, scores, order, dets_sorted.contiguous(),\n                            iou_threshold, multi_label);\n#endif\n  }\n\n  return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/active_rotated_filter_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid active_rotated_filter_forward_impl(const Tensor input,\n                                        const Tensor indices, Tensor output);\n\nvoid active_rotated_filter_backward_impl(const Tensor grad_out,\n                                         const Tensor indices, Tensor grad_in);\n\nvoid active_rotated_filter_forward_npu(const Tensor input, const Tensor indices,\n                                       Tensor output) {\n  OpCommand cmd;\n  cmd.Name(\"ActiveRotatedFilter\")\n      .Input(input)\n      .Input(indices)\n      .Output(output)\n      .Run();\n}\n\nvoid active_rotated_filter_backward_npu(const Tensor grad_out,\n                                        const Tensor indices, Tensor grad_in) {\n  OpCommand cmd;\n  cmd.Name(\"ActiveRotatedFilterGrad\")\n      .Input(grad_out)\n      .Input(indices)\n      .Output(grad_in)\n      .Run();\n}\n\nREGISTER_NPU_IMPL(active_rotated_filter_forward_impl,\n                  active_rotated_filter_forward_npu);\n\nREGISTER_NPU_IMPL(active_rotated_filter_backward_impl,\n                  active_rotated_filter_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid assign_score_withk_forward_npu(int B, int N0, int N1, int M, int K, int O,\n                                    int aggregate, const Tensor& points,\n                                    const Tensor& centers, const Tensor& scores,\n                                    const Tensor& knn_idx, Tensor& output) {\n  at::Tensor points_trans = points.permute({0, 3, 1, 2});\n  at::Tensor centers_trans = centers.permute({0, 3, 1, 2});\n  EXEC_NPU_CMD(aclnnAssignScoreWithk, points_trans, centers_trans, scores,\n               knn_idx, B, N0, N1, M, K, O, aggregate, output);\n}\n\nvoid assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,\n                                     int aggregate, const Tensor& points,\n                                     const Tensor& centers,\n                                     const Tensor& scores,\n                                     const Tensor& knn_idx, Tensor& output);\n\nREGISTER_NPU_IMPL(assign_score_withk_forward_impl,\n                  assign_score_withk_forward_npu);\n\nvoid assign_score_withk_backward_npu(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, Tensor& grad_scores) {\n  at::Tensor grad_out_trans = grad_out.permute({0, 2, 3, 1});\n\n  EXEC_NPU_CMD(aclnnAssignScoreWithkGrad, grad_out_trans, points, centers,\n               scores, knn_idx, B, N0, N1, M, K, O, aggregate, grad_scores,\n               grad_points, grad_centers);\n}\n\nvoid assign_score_withk_backward_impl(\n    int B, int N0, int N1, int M, int K, int O, int aggregate,\n    const Tensor& grad_out, const Tensor& points, const Tensor& centers,\n    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,\n    Tensor& grad_centers, 
Tensor& grad_scores);\n\nREGISTER_NPU_IMPL(assign_score_withk_backward_impl,\n                  assign_score_withk_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/ball_query_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid ball_query_forward_npu(int b, int n, int m, float min_radius,\n                            float max_radius, int nsample, const Tensor new_xyz,\n                            const Tensor xyz, Tensor idx) {\n  int64_t nsample_i64 = nsample;\n\n  // transpose new_xyz from [B, M, 3] to [M, B, 3]\n  at::Tensor new_xyz_transpose = new_xyz.transpose(0, 1).to(at::kFloat);\n\n  // transpose xyz from [B, N, 3] to [B, 3, N]\n  at::Tensor xyz_transpose = xyz.transpose(1, 2).to(at::kFloat);\n\n  // transpose idx from [B, M, nsample] to [M, B, nsample]\n  at::Tensor idx_transpose = idx.transpose(0, 1).contiguous();\n\n  OpCommand cmd;\n  cmd.Name(\"BallQuery\")\n      .Input(xyz_transpose)\n      .Input(new_xyz_transpose)\n      .Output(idx_transpose)\n      .Attr(\"min_radius\", min_radius)\n      .Attr(\"max_radius\", max_radius)\n      .Attr(\"sample_num\", nsample_i64)\n      .Run();\n\n  idx_transpose = idx_transpose.transpose(0, 1).contiguous();\n  idx.copy_(idx_transpose);\n}\n\nvoid ball_query_forward_impl(int b, int n, int m, float min_radius,\n                             float max_radius, int nsample,\n                             const Tensor new_xyz, const Tensor xyz,\n                             Tensor idx);\n\nREGISTER_NPU_IMPL(ball_query_forward_impl, ball_query_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/bbox_overlaps_npu.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                        const int mode, const bool aligned, const int offset);\n\nvoid bbox_overlaps_npu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                       const int mode, const bool aligned, const int offset) {\n  string modeStr = \"iou\";\n  if (mode == 1) {\n    modeStr = \"iof\";\n  }\n  bool swap_flag = false;\n  at::Tensor bboxesFP32 = bboxes2;\n  at::Tensor gtboxesFP32 = bboxes1;\n  if (bboxes2.size(0) < bboxes1.size(0)) {\n    swap_flag = true;\n    bboxesFP32 = bboxes1;\n    gtboxesFP32 = bboxes2;\n  }\n  if (bboxes2.scalar_type() != at::kFloat) {\n    bboxesFP32 = bboxesFP32.to(at::kFloat);\n    gtboxesFP32 = gtboxesFP32.to(at::kFloat);\n  }\n  c10::SmallVector<int64_t, 8> iousSize = {gtboxesFP32.size(0),\n                                           bboxesFP32.size(0)};\n  if (aligned) {\n    iousSize = {gtboxesFP32.size(0), 1};\n  }\n  at::Tensor iousFP32 = at::empty(iousSize, bboxesFP32.options());\n  bboxesFP32 = aligned ? bboxesFP32.transpose(0, 1) : bboxesFP32;\n  gtboxesFP32 = aligned ? gtboxesFP32.transpose(0, 1) : gtboxesFP32;\n  OpCommand cmd;\n  cmd.Name(\"Iou\")\n      .Input(bboxesFP32)\n      .Input(gtboxesFP32)\n      .Output(iousFP32)\n      .Attr(\"mode\", modeStr)\n      .Attr(\"eps\", (float)offset)\n      .Attr(\"aligned\", aligned)\n      .Run();\n  if (bboxes2.scalar_type() != at::kFloat) {\n    iousFP32 = iousFP32.to(at::kHalf);\n  }\n  iousFP32 = swap_flag ? iousFP32.transpose(0, 1) : iousFP32;\n  ious.copy_(iousFP32);\n}\n\nREGISTER_NPU_IMPL(bbox_overlaps_impl, bbox_overlaps_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid border_align_forward_impl(const Tensor &input, const Tensor &boxes,\n                               Tensor output, Tensor argmax_idx,\n                               const int pool_size);\n\nvoid border_align_forward_npu(const Tensor &input, const Tensor &boxes,\n                              Tensor output, Tensor argmax_idx,\n                              const int pool_size) {\n  TORCH_CHECK(input.size(0) == boxes.size(0),\n              \"The batch sizes of feature map and rois must be the same.\");\n  TORCH_CHECK(input.size(1) % 4 == 0,\n              \"The number of channels must be divisible by 4.\");\n  TORCH_CHECK(pool_size >= 2, \"The pool size should be larger than 2.\");\n  int32_t batch_size = input.size(0);\n  int32_t channels = input.size(1);\n  int32_t height = input.size(2);\n  int32_t width = input.size(3);\n  at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous();\n  at::Tensor rois_map = boxes.contiguous();\n  at::Tensor temp_tensor = at::zeros(\n      {batch_size, height * width, pool_size + 1, channels}, input.options());\n  EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pool_size, temp_tensor);\n  auto max_result = temp_tensor.max(-2);\n  at::Tensor output_ = std::get<0>(max_result).to(at::kFloat);\n  output_ = output_.reshape({batch_size, height * width, 4, channels / 4})\n                .permute({0, 3, 1, 2})\n                .contiguous();\n  output.copy_(output_);\n  at::Tensor argmax_idx_ = std::get<1>(max_result).to(at::kInt);\n  argmax_idx_ =\n      argmax_idx_.reshape({batch_size, height * width, 4, channels / 4})\n          .permute({0, 3, 1, 2})\n          .contiguous();\n  argmax_idx.copy_(argmax_idx_);\n}\nREGISTER_NPU_IMPL(border_align_forward_impl, border_align_forward_npu);\n\nvoid border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,\n                                const 
Tensor &argmax_idx, Tensor grad_input,\n                                const int pool_size);\n\nvoid border_align_backward_npu(const Tensor &grad_output, const Tensor &boxes,\n                               const Tensor &argmax_idx, Tensor grad_input,\n                               const int pool_size) {\n  TORCH_CHECK(grad_output.dim() == 4,\n              \"grad_out.dim() must be 4, but got: \", grad_output.dim());\n  TORCH_CHECK(boxes.dim() == 3, \"idx.dim() must be 3, but got: \", boxes.dim());\n  TORCH_CHECK(argmax_idx.dim() == 4,\n              \"argmax_idx.dim() must be 4, but got: \", argmax_idx.dim());\n\n  int32_t batch_size = grad_output.size(0);\n  int32_t feat_channels = grad_output.size(1) * 4;\n  int32_t channels = grad_output.size(1);\n  int32_t box_size = boxes.size(1);\n  int32_t height = grad_input.size(2);\n  int32_t width = grad_input.size(3);\n\n  EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_output, boxes, argmax_idx, channels,\n               box_size, height, width, pool_size, batch_size, grad_input);\n}\nREGISTER_NPU_IMPL(border_align_backward_impl, border_align_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned);\n\nvoid box_iou_quadri_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                        const int mode_flag, const bool aligned) {\n  TORCH_CHECK(boxes1.size(1) == 8, \"boxes1 must be 2D tensor (N, 8)\");\n  TORCH_CHECK(boxes1.size(1) == 8, \"boxes1 must be 2D tensor (N, 8)\");\n\n  EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious);\n  return;\n}\n\nREGISTER_NPU_IMPL(box_iou_quadri_impl, box_iou_quadri_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                          const int mode_flag, const bool aligned);\n\nvoid box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                         const int mode_flag, const bool aligned) {\n  TORCH_CHECK(boxes1.size(1) == 5, \"boxes1 must be 2D tensor (N, 5)\");\n  TORCH_CHECK(boxes2.size(1) == 5, \"boxes2 must be 2D tensor (N, 5)\");\n  EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious);\n  return;\n}\n\nREGISTER_NPU_IMPL(box_iou_rotated_impl, box_iou_rotated_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nnamespace {\nconstexpr int32_t MODE_FLAG_OVERLAP = 0;\nconstexpr int32_t FORMAT_FLAG_XYZWHDR = 3;\n};  // namespace\n\nvoid iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,\n                                          const int num_b, const Tensor boxes_b,\n                                          Tensor ans_overlap);\n\nvoid iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a,\n                                         const int num_b, const Tensor boxes_b,\n                                         Tensor ans_overlap) {\n  TORCH_CHECK(boxes_a.size(1) == 7, \"boxes_a must be 2D tensor (N, 7)\");\n  TORCH_CHECK(boxes_b.size(1) == 7, \"boxes_b must be 2D tensor (N, 7)\");\n\n  auto clockwise = true;\n  bool aligned = false;\n  double margin = 1e-5;\n  int32_t mode_flag = MODE_FLAG_OVERLAP;\n  int32_t format_flag = FORMAT_FLAG_XYZWHDR;\n\n  EXEC_NPU_CMD(aclnnBoxesOverlapBevV1, boxes_a, boxes_b, format_flag, clockwise,\n               mode_flag, aligned, margin, ans_overlap);\n  return;\n}\n\nREGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl,\n                  iou3d_boxes_overlap_bev_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,\n                                  Tensor dist2, Tensor idx1, Tensor idx2) {\n  at::Tensor xyz1 = at::ones_like(XYZ1);\n  at::Tensor xyz2 = at::ones_like(XYZ2);\n  xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);\n  xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);\n  OpCommand cmd;\n  cmd.Name(\"ChamferDistance\")\n      .Input(xyz1)\n      .Input(xyz2)\n      .Output(dist1)\n      .Output(dist2)\n      .Output(idx1)\n      .Output(idx2)\n      .Run();\n}\n\nvoid chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,\n                                   Tensor idx2, Tensor grad_dist1,\n                                   Tensor grad_dist2, Tensor grad_xyz1,\n                                   Tensor grad_xyz2) {\n  EXEC_NPU_CMD(aclnnChamferDistanceBackward, xyz1, xyz2, idx1, idx2, grad_dist1,\n               grad_dist2, grad_xyz1, grad_xyz2);\n}\n\nvoid chamfer_distance_forward_impl(Tensor XYZ1, Tensor XYZ2, Tensor dist1,\n                                   Tensor dist2, Tensor idx1, Tensor idx2);\nREGISTER_NPU_IMPL(chamfer_distance_forward_impl, chamfer_distance_forward_npu);\n\nvoid chamfer_distance_backward_impl(Tensor xyz1, Tensor xyz2, Tensor idx1,\n                                    Tensor idx2, Tensor grad_dist1,\n                                    Tensor grad_dist2, Tensor grad_xyz1,\n                                    Tensor grad_xyz2);\nREGISTER_NPU_IMPL(chamfer_distance_backward_impl,\n                  chamfer_distance_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/common_util.h",
    "content": "#ifndef MMCV_OPS_CSRC_COMMON__UTIL_HPP_\n#define MMCV_OPS_CSRC_COMMON__UTIL_HPP_\n\nconst int SIZE = 8;\n\n#endif  // MMCV_OPS_CSRC_COMMON__UTIL_HPP_\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,\n                                  Tensor output, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma);\n\nvoid deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,\n                                   Tensor rois, Tensor offset,\n                                   Tensor grad_input, Tensor grad_offset,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   float gamma);\n\nvoid deform_roi_pool_forward_npu(Tensor input, Tensor rois, Tensor offset,\n                                 Tensor output, int pooled_height,\n                                 int pooled_width, float spatial_scale,\n                                 int sampling_ratio, float gamma) {\n  c10::SmallVector<int64_t, 2> output_sizes = {pooled_height, pooled_width};\n  at::IntArrayRef output_size = at::IntArrayRef(output_sizes);\n  int64_t sampling_ratio_ = (int64_t)sampling_ratio;\n  OpCommand cmd;\n  cmd.Name(\"DeformableRoiPool\")\n      .Input(input)\n      .Input(rois)\n      .Input(offset)\n      .Output(output)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"output_size\", output_size)\n      .Attr(\"sampling_ratio\", sampling_ratio_)\n      .Attr(\"gamma\", gamma)\n      .Run();\n}\n\nvoid deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois,\n                                  Tensor offset, Tensor grad_input,\n                                  Tensor grad_offset, int pooled_height,\n                                  int pooled_width, float spatial_scale,\n                                  int sampling_ratio, float gamma) {\n  
c10::SmallVector<int64_t, 2> output_sizes = {pooled_height, pooled_width};\n  at::IntArrayRef output_size = at::IntArrayRef(output_sizes);\n  int64_t sampling_ratio_ = (int64_t)sampling_ratio;\n  OpCommand cmd;\n  cmd.Name(\"DeformableRoiPoolGrad\")\n      .Input(grad_output)\n      .Input(input)\n      .Input(rois)\n      .Input(offset)\n      .Output(grad_input)\n      .Output(grad_offset)\n      .Attr(\"output_size\", output_size)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"sampling_ratio\", sampling_ratio_)\n      .Attr(\"gamma\", gamma)\n      .Run();\n}\n\nREGISTER_NPU_IMPL(deform_roi_pool_forward_impl, deform_roi_pool_forward_npu);\n\nREGISTER_NPU_IMPL(deform_roi_pool_backward_impl, deform_roi_pool_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nTensor diff_iou_rotated_sort_vertices_npu(Tensor vertices, Tensor mask,\n                                          Tensor num_valid) {\n  TORCH_CHECK(vertices.dim() == 4,\n              \"vertices must be a 4D Tensor, but got: \", vertices.dim());\n  TORCH_CHECK(mask.dim() == 3,\n              \"mask must be a 3D Tensor, but got: \", mask.dim());\n  TORCH_CHECK(num_valid.dim() == 2,\n              \"num_valid must be a 2D Tensor, but got: \", num_valid.dim());\n\n  uint32_t B = vertices.size(0);\n  uint32_t N = vertices.size(1);\n\n  at::Tensor sortedIdx = at::empty({B, N, 9}, num_valid.options());\n  at::Tensor mask_fp = mask.to(at::kFloat);\n\n  EXEC_NPU_CMD(aclnnDiffIouRotatedSortVertices, vertices, mask_fp, num_valid,\n               sortedIdx);\n\n  return sortedIdx;\n}\n\nTensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,\n                                                   Tensor num_valid);\n\nREGISTER_NPU_IMPL(diff_iou_rotated_sort_vertices_forward_impl,\n                  diff_iou_rotated_sort_vertices_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,\n                                    Tensor output, float gamma, float alpha) {\n  int64_t n_class = input.size(1);\n  at::Tensor target_y = at::ones_like(input);\n  if (n_class == 1) {\n    target_y = at::reshape(target, input.sizes());\n    target_y = at::mul(target_y, -1.0);\n    target_y = at::add(target_y, 1.0);\n  } else {\n    target_y = at::one_hot(target, n_class);\n  }\n  target_y = target_y.to(at::kInt);\n  int64_t weight_size = weight.size(0);\n  at::Tensor weight_y = at::ones_like(input);\n  if (weight_size > 0) {\n    at::Tensor weight_selected = weight.gather(0, target);\n    weight_selected = weight_selected.unsqueeze(1);\n    weight_y = weight_selected.expand_as(input);\n  }\n  OpCommand cmd;\n  string reduction = \"none\";\n  cmd.Name(\"SigmoidFocalLoss\")\n      .Input(input)\n      .Input(target_y)\n      .Input(weight_y)\n      .Output(output)\n      .Attr(\"gamma\", gamma)\n      .Attr(\"alpha\", alpha)\n      .Attr(\"reduction\", reduction)\n      .Run();\n}\n\nvoid sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,\n                                     Tensor grad_input, float gamma,\n                                     float alpha) {\n  int64_t n_class = input.size(1);\n  at::Tensor target_y = at::ones_like(input);\n  if (n_class == 1) {\n    target_y = at::reshape(target, input.sizes());\n  } else {\n    target_y = at::one_hot(target, n_class);\n    target_y = at::mul(target_y, -1.0);\n    target_y = at::add(target_y, 1.0);\n  }\n  target_y = target_y.to(at::kInt);\n  at::Tensor grad_up = at::ones_like(input);\n  int64_t weight_size = weight.size(0);\n  at::Tensor 
weight_y = at::ones_like(input);\n  if (weight_size > 0) {\n    weight_y = at::broadcast_to(weight, input.sizes());\n  }\n  OpCommand cmd;\n  string reduction = \"none\";\n  cmd.Name(\"SigmoidFocalLossGrad\")\n      .Input(input)\n      .Input(target_y)\n      .Input(grad_up)\n      .Input(weight_y)\n      .Output(grad_input)\n      .Attr(\"gamma\", gamma)\n      .Attr(\"alpha\", alpha)\n      .Attr(\"reduction\", reduction)\n      .Run();\n}\n\nvoid sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor grad_input,\n                                      float gamma, float alpha);\n\nvoid softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,\n                                    Tensor output, float gamma, float alpha) {\n  int64_t n_class = input.size(1);\n  at::Tensor target_y = at::one_hot(target, n_class);\n  target_y = target_y.to(at::kInt);\n  int64_t weight_size = weight.size(0);\n  at::Tensor weight_y = at::ones_like(input);\n  if (weight_size > 0) {\n    weight_y = at::broadcast_to(weight, input.sizes());\n  }\n  at::Tensor op_output = at::ones_like(input);\n  OpCommand cmd;\n  string reduction = \"none\";\n  cmd.Name(\"SoftmaxFocalLoss\")\n      .Input(input)\n      .Input(target_y)\n      .Input(weight_y)\n      .Output(op_output)\n      .Attr(\"gamma\", gamma)\n      .Attr(\"alpha\", alpha)\n      .Attr(\"reduction\", reduction)\n      .Run();\n  int64_t n_batch = input.size(0);\n  c10::SmallVector<int64_t, 2> offsets = {0, 0};\n  c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};\n  at::IntArrayRef offset = at::IntArrayRef(offsets);\n  at::IntArrayRef size = at::IntArrayRef(sizes);\n  at::IntArrayRef size_array = at::IntArrayRef(sizes);\n  c10::SmallVector<int64_t, 8> offsetVec;\n  for (uint64_t i = 0; i < offset.size(); i++) {\n    offsetVec.emplace_back(offset[i]);\n  }\n  c10::SmallVector<int64_t, 8> sizeVec;\n  for (uint64_t i = 0; i < size_array.size(); i++) 
{\n    sizeVec.emplace_back(size_array[i]);\n  }\n  OpCommand cmd2;\n  cmd2.Name(\"Slice\")\n      .Input(op_output)\n      .Input(offsetVec)\n      .Input(sizeVec)\n      .Output(output)\n      .Run();\n}\n\nvoid softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,\n                                     Tensor grad_input, float gamma,\n                                     float alpha);\n\nvoid softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,\n                                     Tensor buff, Tensor grad_input,\n                                     float gamma, float alpha) {\n  int64_t n_class = input.size(1);\n  at::Tensor target_y = at::one_hot(target, n_class);\n  target_y = target_y.to(at::kInt);\n  at::Tensor grad_up = at::ones_like(input);\n  int64_t weight_size = weight.size(0);\n  at::Tensor weight_y = at::ones_like(input);\n  if (weight_size > 0) {\n    weight_y = at::broadcast_to(weight, input.sizes());\n  }\n  OpCommand cmd;\n  string reduction = \"none\";\n  cmd.Name(\"SoftmaxFocalLossGrad\")\n      .Input(input)\n      .Input(target_y)\n      .Input(grad_up)\n      .Input(weight_y)\n      .Output(grad_input)\n      .Attr(\"gamma\", gamma)\n      .Attr(\"alpha\", alpha)\n      .Attr(\"reduction\", reduction)\n      .Run();\n}\n\nvoid softmax_focal_loss_backward_impl(Tensor input, Tensor target,\n                                      Tensor weight, Tensor buff,\n                                      Tensor grad_input, float gamma,\n                                      float alpha);\n\nREGISTER_NPU_IMPL(sigmoid_focal_loss_forward_impl,\n                  sigmoid_focal_loss_forward_npu);\n\nREGISTER_NPU_IMPL(sigmoid_focal_loss_backward_impl,\n                  sigmoid_focal_loss_backward_npu);\n\nREGISTER_NPU_IMPL(softmax_focal_loss_forward_impl,\n                  softmax_focal_loss_forward_npu);\n\nREGISTER_NPU_IMPL(softmax_focal_loss_backward_impl,\n                  softmax_focal_loss_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/furthest_point_sample_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid furthest_point_sampling_forward_npu(Tensor points_tensor,\n                                         Tensor temp_tensor, Tensor idx_tensor,\n                                         int b, int n, int m) {\n  TORCH_CHECK(\n      (points_tensor.sizes()[1] >= m),\n      \"the num of sampled points should smaller than total num of points.\");\n  at::Tensor points_xyz = points_tensor.transpose(1, 2).contiguous();\n  at::Tensor nearest_dist = temp_tensor.contiguous();\n  EXEC_NPU_CMD(aclnnFurthestPointSampling, points_xyz, nearest_dist, m,\n               idx_tensor);\n}\n\nvoid furthest_point_sampling_forward_impl(Tensor points_tensor,\n                                          Tensor temp_tensor, Tensor idx_tensor,\n                                          int b, int n, int m);\n\nREGISTER_NPU_IMPL(furthest_point_sampling_forward_impl,\n                  furthest_point_sampling_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/furthest_point_sampling_with_dist_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid furthest_point_sampling_with_dist_npu(Tensor points_tensor,\n                                           Tensor temp_tensor,\n                                           Tensor idx_tensor, int b, int n,\n                                           int m) {\n  TORCH_CHECK(\n      (points_tensor.sizes()[1] >= m),\n      \"the num of sampled points should smaller than total num of points.\");\n  EXEC_NPU_CMD(aclnnFurthestPointSamplingWithDist, points_tensor, temp_tensor,\n               m, idx_tensor);\n}\n\nvoid furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,\n                                                    Tensor temp_tensor,\n                                                    Tensor idx_tensor, int b,\n                                                    int n, int m);\n\nREGISTER_NPU_IMPL(furthest_point_sampling_with_dist_forward_impl,\n                  furthest_point_sampling_with_dist_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/fused_bias_leakyrelu_npu.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nTensor fused_bias_leakyrelu_op_impl(const Tensor &input, const Tensor &bias,\n                                    const Tensor &refer, int act, int grad,\n                                    float alpha, float scale);\n\nTensor fused_bias_leakyrelu_npu(const Tensor &input, const Tensor &bias,\n                                const Tensor &refer, int act, int grad,\n                                float alpha, float scale) {\n  at::Tensor py = at::empty_like(input);\n  // forward\n  if (grad == 0) {\n    auto input_size = input.sizes();\n    int input_length = input_size.size();\n    c10::SmallVector<int64_t, 8> input_size_tmp;\n    for (uint64_t i = 0; i < input_size.size(); i++) {\n      input_size_tmp.emplace_back(input_size[i]);\n    }\n    if (input_length > 1) {\n      for (int i = 0; i < input_length; i++) {\n        if (i != 1) {\n          input_size_tmp[i] = 1;\n        }\n      }\n    }\n    at::Tensor bias_tmp = at::reshape(bias, input_size_tmp);\n    // at::Tensor bias_ = at_npu::native::NPUNativeFunctions::npu_broadcast(\n    //     bias_tmp, input.sizes());\n    at::Tensor bias_ = at::broadcast_to(bias_tmp, input.sizes());\n    OpCommand cmd;\n    cmd.Name(\"FusedBiasLeakyRelu\")\n        .Input(input)\n        .Input(bias_)\n        .Output(py)\n        .Attr(\"scale\", scale)\n        .Attr(\"negative_slope\", alpha)\n        .Run();\n  }\n\n  // backward\n  if (grad == 1) {\n    OpCommand cmd;\n    cmd.Name(\"FusedBiasLeakyReluGrad\")\n        .Input(input)\n        .Input(refer)\n        .Output(py)\n        .Attr(\"scale\", scale)\n        .Attr(\"negative_slope\", alpha)\n        .Run();\n  }\n  return py;\n}\n\nREGISTER_NPU_IMPL(fused_bias_leakyrelu_op_impl, fused_bias_leakyrelu_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid gather_points_forward_npu(int b, int c, int n, int npoints,\n                               const Tensor points, const Tensor idx,\n                               Tensor out) {\n  // b, c, n, and npoints do not need to be passed into gatherv2,\n  // b, c, n, and npoints are calculated inside the operator\n  // gatherv2 operator in ascend needs to set axis to 2, batch_dims is 1\n  c10::SmallVector<int64_t, N> axis = {2};\n  int64_t batch_dims = 1;\n\n  OpCommand cmd;\n  cmd.Name(\"GatherV2\")\n      .Input(points)\n      .Input(idx)\n      .Input(axis)\n      .Output(out)\n      .Attr(\"batch_dims\", batch_dims)\n      .Run();\n}\nvoid gather_points_backward_npu(int b, int c, int n, int npoints,\n                                const Tensor grad_out, const Tensor idx,\n                                Tensor grad_points) {\n  at::Tensor indices = idx;\n  if (idx.scalar_type() != at::ScalarType::Int) {\n    indices = idx.to(at::kInt);\n  }\n  if (idx.dim() == 0) {\n    indices.unsqueeze_(0);\n  }\n  int64_t dim = 0;\n  auto shape = idx.sizes();\n  c10::SmallVector<int64_t, 8> pad_size;\n  for (uint64_t i = 0; i < shape.size(); i++) {\n    pad_size.emplace_back(shape[i]);\n  }\n  at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous();\n  at::Tensor grad_points_view = trans_grad_points.view(\n      {trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1],\n       trans_grad_points.sizes()[2]});\n  at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous();\n  trans_grad_out = trans_grad_out.view(\n      {trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1],\n       trans_grad_out.sizes()[2]});\n  auto index = at::arange(0, b);\n  index = index.to(grad_out.device());\n  index = at::mul(index, n);\n  index = index.view({b, 1});\n  index = at::broadcast_to(index, pad_size);\n  indices = at::add(index, indices);\n  indices = 
indices.view({-1});\n  OpCommand cmd;\n  cmd.Name(\"InplaceIndexAdd\")\n      .Input(grad_points_view)\n      .Input(indices)\n      .Input(trans_grad_out)\n      .Output(grad_points_view)\n      .Attr(\"axis\", dim)\n      .Run();\n  at::Tensor grad_points_result =\n      grad_points_view.view(trans_grad_points.sizes());\n  grad_points_result = grad_points_result.transpose(1, 2);\n  grad_points.copy_(grad_points_result);\n}\n\nvoid gather_points_forward_impl(int b, int c, int n, int npoints,\n                                const Tensor points, const Tensor idx,\n                                Tensor out);\nvoid gather_points_backward_impl(int b, int c, int n, int npoints,\n                                 const Tensor grad_out, const Tensor idx,\n                                 Tensor grad_points);\n\nREGISTER_NPU_IMPL(gather_points_forward_impl, gather_points_forward_npu);\nREGISTER_NPU_IMPL(gather_points_backward_impl, gather_points_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/group_points_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid group_points_forward_npu(int b, int c, int n, int npoints, int nsample,\n                              const Tensor points, const Tensor idx,\n                              Tensor out) {\n  // b, c, n, and npoints do not need to be passed into gatherv2,\n  // b, c, n, and npoints are calculated inside the operator\n  // gatherv2 operator in ascend needs to set axis to 0, batch_dims is 0\n  c10::SmallVector<int64_t, N> axis = {0};\n  int64_t batch_dims = 0;\n\n  auto index = at::arange(0, b);\n  index = index.to(points.device());\n  index = index.view({-1, 1, 1});\n  index = at::mul(index, n);\n  at::Tensor indices = at::add(index, idx);\n  indices = indices.view({-1});\n\n  at::Tensor trans_features = points.transpose(1, 2);\n  at::Tensor features = trans_features.contiguous();\n  features = features.view({b * n, c});\n\n  OpCommand cmd;\n  cmd.Name(\"GatherV2\")\n      .Input(features)\n      .Input(indices)\n      .Input(axis)\n      .Output(out)\n      .Attr(\"batch_dims\", batch_dims)\n      .Run();\n\n  at::Tensor output =\n      out.view({b, npoints, nsample, c}).transpose(1, 3).transpose(2, 3);\n  at::Tensor res = output.contiguous();\n  out.copy_(res);\n}\n\nvoid group_points_backward_npu(int b, int c, int n, int npoints, int nsample,\n                               const Tensor grad_out, const Tensor idx,\n                               Tensor grad_features) {\n  at::Tensor trans_idx = idx.view({b * npoints * nsample});\n  at::Tensor trans_grad_out = grad_out.permute({0, 2, 3, 1});\n  at::Tensor grad_out_tensor = trans_grad_out.contiguous();\n  grad_out_tensor = grad_out_tensor.view({b * npoints * nsample, c});\n  at::Tensor out = at::zeros({b, n, c}, grad_out.options());\n\n  EXEC_NPU_CMD(aclnnGroupPointsGrad, grad_out_tensor, trans_idx, b, c, n,\n               npoints, nsample, out);\n\n  at::Tensor grad_points = out.transpose(1, 2);\n\n  
grad_features.copy_(grad_points);\n}\n\nvoid group_points_forward_impl(int b, int c, int n, int npoints, int nsample,\n                               const Tensor points, const Tensor idx,\n                               Tensor out);\nvoid group_points_backward_impl(int b, int c, int n, int npoints, int nsample,\n                                const Tensor points, const Tensor idx,\n                                Tensor out);\n\nREGISTER_NPU_IMPL(group_points_forward_impl, group_points_forward_npu);\nREGISTER_NPU_IMPL(group_points_backward_impl, group_points_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/knn_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n#include \"torch_npu/csrc/aten/NPUNativeFunctions.h\"\n#include \"torch_npu/csrc/framework/utils/OpAdapter.h\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,\n                     const Tensor new_xyz, Tensor idx, Tensor dist2) {\n  // transpose known from [B, N, 3] to [B, 3, N]\n  at::Tensor source = xyz.transpose(2, 1).contiguous();\n  at::Tensor target = new_xyz.contiguous();\n\n  bool is_from_knn = true;\n  EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx);\n}\n\nvoid knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,\n                      const Tensor new_xyz, Tensor idx, Tensor dist2);\n\nREGISTER_NPU_IMPL(knn_forward_impl, knn_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nTensor ms_deform_attn_impl_forward(const Tensor &value,\n                                   const Tensor &value_spatial_shapes,\n                                   const Tensor &value_level_start_index,\n                                   const Tensor &sampling_locations,\n                                   const Tensor &attention_weights,\n                                   const int im2col_step);\n\nvoid check_support(const Tensor &value, const Tensor &attention_weights) {\n  TORCH_CHECK(\n      (value.scalar_type() == at::kFloat || value.scalar_type() == at::kHalf),\n      \"Dtype of value should be float32 or float16.\");\n  int64_t num_heads = value.size(2);\n  int64_t embed_dims = value.size(3);\n  int64_t num_points = attention_weights.size(4);\n  TORCH_CHECK((num_heads >= 4 && num_heads <= 8),\n              \"num_heads should be in the range of [4, 8]\");\n  TORCH_CHECK((embed_dims >= 32 && embed_dims <= 256),\n              \"embed_dims should be in the range of [32, 256]\");\n  TORCH_CHECK((num_points >= 4 && num_points <= 8),\n              \"num_points should be in the range of [4, 8]\");\n}\n\nTensor ms_deform_attn_forward_npu(const Tensor &value,\n                                  const Tensor &value_spatial_shapes,\n                                  const Tensor &value_level_start_index,\n                                  const Tensor &sampling_locations,\n                                  const Tensor &attention_weights,\n                                  const int im2col_step) {\n  check_support(value, attention_weights);\n  at::Tensor value_fp32 = value;\n  at::Tensor value_spatial_shapes_int32 = value_spatial_shapes;\n  at::Tensor value_level_start_index_int32 = value_level_start_index;\n  at::Tensor sampling_locations_fp32 = sampling_locations;\n  at::Tensor attention_weights_fp32 = attention_weights;\n  if (value.scalar_type() != 
at::kFloat) {\n    value_fp32 = value.to(at::kFloat);\n  }\n  if (value_spatial_shapes.scalar_type() != at::kInt) {\n    value_spatial_shapes_int32 = value_spatial_shapes.to(at::kInt);\n  }\n  if (value_level_start_index.scalar_type() != at::kInt) {\n    value_level_start_index_int32 = value_level_start_index.to(at::kInt);\n  }\n  if (sampling_locations.scalar_type() != at::kFloat) {\n    sampling_locations_fp32 = sampling_locations.to(at::kFloat);\n  }\n  if (attention_weights.scalar_type() != at::kFloat) {\n    attention_weights_fp32 = attention_weights.to(at::kFloat);\n  }\n\n  c10::SmallVector<int64_t, 3> output_size = {\n      value.size(0), sampling_locations.size(1), value.size(2) * value.size(3)};\n  at::Tensor output = at::zeros(output_size, value_fp32.options());\n\n  EXEC_NPU_CMD(aclnnMultiScaleDeformableAttnFunction, value_fp32,\n               value_spatial_shapes_int32, value_level_start_index_int32,\n               sampling_locations_fp32, attention_weights_fp32, output);\n\n  at::Tensor real_output = output;\n  if (value.scalar_type() != at::kFloat) {\n    real_output = output.to(value.scalar_type());\n  }\n  return real_output;\n}\n\nREGISTER_NPU_IMPL(ms_deform_attn_impl_forward, ms_deform_attn_forward_npu);\n\nvoid ms_deform_attn_impl_backward(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);\n\nvoid ms_deform_attn_backward_npu(\n    const Tensor &value, const Tensor &spatial_shapes,\n    const Tensor &level_start_index, const Tensor &sampling_loc,\n    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,\n    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,\n    const int im2col_step) {\n  check_support(value, attn_weight);\n  EXEC_NPU_CMD(aclnnMultiScaleDeformableAttentionGrad, value, 
spatial_shapes,\n               level_start_index, sampling_loc, attn_weight, grad_output,\n               grad_value, grad_sampling_loc, grad_attn_weight);\n}\n\nREGISTER_NPU_IMPL(ms_deform_attn_impl_backward, ms_deform_attn_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\n\nvoid iou3d_nms3d_normal_forward_npu(const Tensor boxes, Tensor &keep,\n                                    Tensor &num_out, float nms_overlap_thresh) {\n  int32_t box_num = boxes.size(0);\n  int32_t data_align = 16;\n  int32_t mask_num = ((box_num - 1) / data_align + 1) * data_align;\n  const double iou_threshold = nms_overlap_thresh;\n  at::Tensor mask =\n      at::empty({box_num, mask_num}, boxes.options().dtype(at::kShort));\n  EXEC_NPU_CMD(aclnnNms3dNormal, boxes, iou_threshold, mask);\n\n  Tensor keep_t = at::zeros({box_num}, mask.options());\n  Tensor num_out_t = at::zeros(1, mask.options());\n  EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep_t, num_out_t);\n  num_out.fill_(num_out_t.item().toLong());\n  keep.copy_(keep_t);\n}\n\nvoid iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep,\n                                     Tensor &num_out, float nms_overlap_thresh);\n\nREGISTER_NPU_IMPL(iou3d_nms3d_normal_forward_impl,\n                  iou3d_nms3d_normal_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nconstexpr int32_t BOX_DIM = 7;\n\nvoid iou3d_nms3d_forward_npu(const Tensor boxes, Tensor &keep, Tensor &num_out,\n                             float nms_overlap_thresh) {\n  TORCH_CHECK((boxes.sizes()[1] == BOX_DIM),\n              \"Input boxes shape should be (N, 7)\");\n  int32_t box_num = boxes.size(0);\n  int32_t data_align = 16;\n  int32_t mask_num = ((box_num - 1) / data_align + 1) * data_align;\n  const double iou_threshold = nms_overlap_thresh;\n  at::Tensor mask =\n      at::empty({box_num, mask_num}, boxes.options().dtype(at::kShort));\n  EXEC_NPU_CMD(aclnnNms3d, boxes, iou_threshold, mask);\n\n  Tensor keep_t = at::zeros({box_num}, mask.options());\n  Tensor num_out_t = at::zeros(1, mask.options());\n  EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep_t, num_out_t);\n  num_out.fill_(num_out_t.item().toLong());\n  keep.copy_(keep_t);\n}\n\nvoid iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep, Tensor &num_out,\n                              float nms_overlap_thresh);\n\nREGISTER_NPU_IMPL(iou3d_nms3d_forward_impl, iou3d_nms3d_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/nms_npu.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nTensor nms_npu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {\n  TORCH_CHECK((boxes.scalar_type() == at::ScalarType::Float),\n              \"The type of boxes tensor passed in nms_npu should be float\");\n  int64_t offset_64 = offset;\n  at::Tensor iou_threshold_y =\n      at::empty({}, boxes.options().dtype(at::kFloat)).fill_(iou_threshold);\n  at::Tensor scores_threshold_y =\n      at::empty({}, boxes.options().dtype(at::kFloat)).fill_(0);\n  at::Tensor max_outputsize_y =\n      at::empty({}, boxes.options().dtype(at::kInt)).fill_(boxes.size(0));\n\n  c10::SmallVector<int64_t, 8> outputsize = {boxes.size(0)};\n  at::Tensor output =\n      at::empty(outputsize, boxes.options().dtype(at::kInt)).fill_(-1);\n  OpCommand cmd;\n  cmd.Name(\"NonMaxSuppressionV3\")\n      .Input(boxes)\n      .Input(scores)\n      .Input(max_outputsize_y)\n      .Input(iou_threshold_y)\n      .Input(scores_threshold_y)\n      .Attr(\"offset\", offset_64)\n      .Output(output)\n      .Run();\n  auto outputsizeBool = at::gt(output, -1);\n  auto outputsizeInt = outputsizeBool.to(at::kInt);\n  auto countLen = at::sum(outputsizeInt, at::kInt);\n  at::Tensor actual_output = output.slice(0, 0, countLen.item().toLong());\n  actual_output = actual_output.to(at::kLong);\n  return actual_output;\n}\n\nTensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);\n\nREGISTER_NPU_IMPL(nms_impl, nms_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/nms_rotated_npu.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\n\nTensor nms_rotated_npu(const Tensor dets, const Tensor scores,\n                       const Tensor labels, const float iou_threshold) {\n  auto originDtype = dets.scalar_type();\n  at::Tensor detsCast = dets;\n  at::Tensor scoresCast = scores;\n  if (originDtype != at::kFloat) {\n    detsCast = detsCast.to(at::kFloat);\n    scoresCast = scoresCast.to(at::kFloat);\n  }\n  c10::SmallVector<int64_t, 8> selectedIndexSize = {dets.size(0)};\n\n  at::Tensor selectedBox = at::empty_like(dets);\n  at::Tensor selectedIndex =\n      at::empty(selectedIndexSize, dets.options().dtype(at::kInt));\n\n  c10::SmallVector<int64_t, N> output_sync_idx = {0, 1};\n  OpCommand cmd;\n  cmd.Sync(output_sync_idx)\n      .Name(\"RotatedNMS\")\n      .Input(detsCast)\n      .Input(scoresCast)\n      .Input(labels)\n      .Output(selectedBox)\n      .Output(selectedIndex)\n      .Attr(\"iou_threshold\", (float)iou_threshold)\n      .Attr(\"is_angle\", false)\n      .Run();\n  selectedIndex = selectedIndex.to(at::kLong);\n  return selectedIndex;\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/points_in_box_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid points_in_boxes_part_forward_impl_npu(int batch_size, int boxes_num,\n                                           int pts_num, const Tensor boxes,\n                                           const Tensor pts,\n                                           Tensor box_idx_of_points) {\n  c10::SmallVector<int64_t, 8> output_size = {pts.size(0), pts.size(1)};\n  auto boxes_trans = boxes.transpose(1, 2).contiguous();\n  EXEC_NPU_CMD(aclnnPointsInBox, boxes_trans, pts, box_idx_of_points);\n}\nvoid points_in_boxes_part_forward_impl(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points);\nREGISTER_NPU_IMPL(points_in_boxes_part_forward_impl,\n                  points_in_boxes_part_forward_impl_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/points_in_box_npu_all.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid points_in_boxes_all_forward_impl_npu(int batch_size, int boxes_num,\n                                          int pts_num, const Tensor boxes,\n                                          const Tensor pts,\n                                          Tensor box_idx_of_points) {\n  c10::SmallVector<int64_t, 8> output_size = {pts.size(0), pts.size(1),\n                                              boxes.size(1)};\n  auto boxes_trans = boxes.transpose(1, 2).contiguous();\n  EXEC_NPU_CMD(aclnnPointsInBoxAll, boxes_trans, pts, box_idx_of_points);\n}\nvoid points_in_boxes_all_forward_impl(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points);\nREGISTER_NPU_IMPL(points_in_boxes_all_forward_impl,\n                  points_in_boxes_all_forward_impl_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/points_in_polygons_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nconstexpr int32_t MAX_POLYGONS_BATCH = 2800;\n\nvoid points_in_polygons_npu(const Tensor points, Tensor polygons, Tensor output,\n                            const int rows, const int cols) {\n  TORCH_CHECK(\n      (polygons.sizes()[0] <= MAX_POLYGONS_BATCH),\n      \"The batch of polygons tensor must be less than MAX_POLYGONS_BATCH\");\n  at::Tensor trans_polygons = polygons.transpose(0, 1);\n  OpCommand cmd;\n  at::Tensor new_trans_polygons = trans_polygons.contiguous();\n  cmd.Name(\"PointsInPolygons\")\n      .Input(points, (string) \"points\")\n      .Input(new_trans_polygons, (string) \"polygons\")\n      .Output(output)\n      .Run();\n}\n\nvoid points_in_polygons_forward_impl(const Tensor points, Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols);\n\nREGISTER_NPU_IMPL(points_in_polygons_forward_impl, points_in_polygons_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/psa_mask_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid psamask_forward_npu(const int psa_type, const Tensor x, Tensor y,\n                         const int num, const int h_feature,\n                         const int w_feature, const int h_mask,\n                         const int w_mask, const int half_h_mask,\n                         const int half_w_mask) {\n  int64_t psa_type_i64 = psa_type;\n  int64_t num_i64 = num;\n  int64_t h_feature_i64 = h_feature;\n  int64_t w_feature_i64 = w_feature;\n  int64_t h_mask_i64 = h_mask;\n  int64_t w_mask_i64 = w_mask;\n  int64_t half_h_mask_i64 = half_h_mask;\n  int64_t half_w_mask_i64 = half_w_mask;\n  OpCommand cmd;\n  cmd.Name(\"PSAMask\")\n      .Input(x)\n      .Output(y)\n      .Attr(\"psa_type\", psa_type_i64)\n      .Attr(\"num\", num_i64)\n      .Attr(\"h_feature\", h_feature_i64)\n      .Attr(\"w_feature\", w_feature_i64)\n      .Attr(\"h_mask\", h_mask_i64)\n      .Attr(\"w_mask\", w_mask_i64)\n      .Attr(\"half_h_mask\", half_h_mask_i64)\n      .Attr(\"half_w_mask\", half_w_mask_i64)\n      .Run();\n}\n\nvoid psamask_forward_impl(const int psa_type, const Tensor x, Tensor y,\n                          const int num, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask);\n\nvoid psamask_backward_npu(const int psa_type, const Tensor y_grad,\n                          Tensor x_grad, const int num, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask) {\n  int64_t psa_type_i64 = psa_type;\n  int64_t num_i64 = num;\n  int64_t h_feature_i64 = h_feature;\n  int64_t w_feature_i64 = w_feature;\n  int64_t h_mask_i64 = h_mask;\n  int64_t w_mask_i64 = w_mask;\n  
int64_t half_h_mask_i64 = half_h_mask;\n  int64_t half_w_mask_i64 = half_w_mask;\n  OpCommand cmd;\n  cmd.Name(\"PSAMaskGrad\")\n      .Input(y_grad)\n      .Output(x_grad)\n      .Attr(\"psa_type\", psa_type_i64)\n      .Attr(\"num\", num_i64)\n      .Attr(\"h_feature\", h_feature_i64)\n      .Attr(\"w_feature\", w_feature_i64)\n      .Attr(\"h_mask\", h_mask_i64)\n      .Attr(\"w_mask\", w_mask_i64)\n      .Attr(\"half_h_mask\", half_h_mask_i64)\n      .Attr(\"half_w_mask\", half_w_mask_i64)\n      .Run();\n}\n\nvoid psamask_backward_impl(const int psa_type, const Tensor y_grad,\n                           Tensor x_grad, const int num, const int h_feature,\n                           const int w_feature, const int h_mask,\n                           const int w_mask, const int half_h_mask,\n                           const int half_w_mask);\n\nREGISTER_NPU_IMPL(psamask_forward_impl, psamask_forward_npu);\nREGISTER_NPU_IMPL(psamask_backward_impl, psamask_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid roi_align_forward_npu(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                           int aligned_width, float spatial_scale,\n                           int sampling_ratio, int pool_mode, bool aligned) {\n  TORCH_CHECK(input.scalar_type() == at::kFloat,\n              \"input should be a float tensor\");\n  int64_t roi_end_mode = 2;\n  if (!aligned) {\n    LOG(WARNING) << \"The [aligned] attr in roi_align op is false\";\n    roi_end_mode = 0;\n  }\n  int64_t aligned_height_64 = aligned_height;\n  int64_t aligned_width_64 = aligned_width;\n  int64_t sampling_ratio_64 = sampling_ratio;\n  OpCommand cmd;\n  cmd.Name(\"ROIAlign\")\n      .Input(input)\n      .Input(rois)\n      .Output(output)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"pooled_height\", aligned_height_64)\n      .Attr(\"pooled_width\", aligned_width_64)\n      .Attr(\"sample_num\", sampling_ratio_64)\n      .Attr(\"roi_end_mode\", roi_end_mode)\n      .Run();\n}\n\nvoid roi_align_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                            Tensor argmax_x, Tensor grad_input,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  TORCH_CHECK(grad_output.scalar_type() == at::kFloat,\n              \"input should be a float tensor\");\n  int64_t aligned_height_64 = aligned_height;\n  int64_t aligned_width_64 = aligned_width;\n  int64_t sampling_ratio_64 = sampling_ratio;\n  int64_t roi_end_mode = 2;\n  if (!aligned) {\n    LOG(WARNING) << \"The [aligned] attr in roi_align_grad op is false\";\n    roi_end_mode = 0;\n  }\n  auto shape = grad_input.sizes();\n  c10::SmallVector<int64_t, 
8> xdiff_shape;\n  for (uint64_t i = 0; i < shape.size(); i++) {\n    xdiff_shape.emplace_back(shape[i]);\n  }\n  OpCommand cmd;\n  cmd.Name(\"ROIAlignGrad\")\n      .Input(grad_output)\n      .Input(rois)\n      .Output(grad_input)\n      .Attr(\"xdiff_shape\", xdiff_shape)\n      .Attr(\"pooled_width\", aligned_width_64)\n      .Attr(\"pooled_height\", aligned_height_64)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"sample_num\", sampling_ratio_64)\n      .Attr(\"roi_end_mode\", roi_end_mode)\n      .Run();\n}\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned);\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned);\n\nREGISTER_NPU_IMPL(roi_align_forward_impl, roi_align_forward_npu);\nREGISTER_NPU_IMPL(roi_align_backward_impl, roi_align_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/roi_align_rotated_npu.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid roi_align_rotated_forward_npu(Tensor input, Tensor rois, Tensor output,\n                                   int aligned_height, int aligned_width,\n                                   float spatial_scale, int sampling_ratio,\n                                   bool aligned, bool clockwise) {\n  int64_t aligned_height_64 = aligned_height;\n  int64_t aligned_width_64 = aligned_width;\n  int64_t sampling_ratio_64 = sampling_ratio;\n\n  at::Tensor input_trans = input.permute({0, 2, 3, 1}).contiguous();\n  at::Tensor rois_trans = rois.permute({1, 0}).contiguous();\n  at::Tensor output_trans = output.permute({0, 2, 3, 1}).contiguous();\n\n  OpCommand cmd;\n  cmd.Name(\"RoiAlignRotated\")\n      .Input(input_trans)\n      .Input(rois_trans)\n      .Output(output_trans)\n      .Attr(\"pooled_h\", aligned_height_64)\n      .Attr(\"pooled_w\", aligned_width_64)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"sampling_ratio\", sampling_ratio_64)\n      .Attr(\"aligned\", aligned)\n      .Attr(\"clockwise\", clockwise)\n      .Run();\n\n  output_trans = output_trans.permute({0, 3, 1, 2}).contiguous();\n  output.copy_(output_trans);\n}\n\nvoid roi_align_rotated_backward_npu(Tensor top_grad, Tensor rois,\n                                    Tensor bottom_grad, int aligned_height,\n                                    int aligned_width, float spatial_scale,\n                                    int sampling_ratio, bool aligned,\n                                    bool clockwise) {\n  int64_t aligned_height_64 = aligned_height;\n  int64_t aligned_width_64 = aligned_width;\n  int64_t sampling_ratio_64 = sampling_ratio;\n\n  at::Tensor top_grad_trans = top_grad.permute({0, 2, 3, 1}).contiguous();\n  at::Tensor rois_trans = rois.permute({1, 0}).contiguous();\n  at::Tensor bottom_grad_trans = bottom_grad.permute({0, 2, 3, 
1}).contiguous();\n\n  c10::SmallVector<int64_t, 8> y_grad_shape;\n  auto shape = bottom_grad_trans.sizes();\n  for (uint64_t i = 0; i < shape.size(); i++) {\n    y_grad_shape.emplace_back(shape[i]);\n  }\n  OpCommand cmd;\n  cmd.Name(\"RoiAlignRotatedGrad\")\n      .Input(top_grad_trans)\n      .Input(rois_trans)\n      .Output(bottom_grad_trans)\n      .Attr(\"y_grad_shape\", y_grad_shape)\n      .Attr(\"pooled_h\", aligned_width_64)\n      .Attr(\"pooled_w\", aligned_height_64)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"sampling_ratio\", sampling_ratio_64)\n      .Attr(\"aligned\", aligned)\n      .Attr(\"clockwise\", clockwise)\n      .Run();\n\n  bottom_grad_trans = bottom_grad_trans.permute({0, 3, 1, 2}).contiguous();\n  bottom_grad.copy_(bottom_grad_trans);\n}\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise);\n\nREGISTER_NPU_IMPL(roi_align_rotated_forward_impl,\n                  roi_align_rotated_forward_npu);\nREGISTER_NPU_IMPL(roi_align_rotated_backward_impl,\n                  roi_align_rotated_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid roi_pool_forward_npu(Tensor input, Tensor rois, Tensor output,\n                          Tensor argmax, int pooled_height, int pooled_width,\n                          float spatial_scale) {\n  int64_t pooled_height_64 = pooled_height;\n  int64_t pooled_width_64 = pooled_width;\n  int64_t pooled_channel = 1;\n  at::Tensor roi_actual_num =\n      at::empty_like(rois, rois.options().dtype(at::kInt));\n  if (input.sizes()[1] % 16 == 0) {\n    OpCommand cmd;\n    cmd.Name(\"RoiPoolingWithArgMax\")\n        .Input(input)\n        .Input(rois)\n        .Input(roi_actual_num)\n        .Output(output)\n        .Output(argmax)\n        .Attr(\"pooled_h\", pooled_height_64)\n        .Attr(\"pooled_w\", pooled_width_64)\n        .Attr(\"spatial_scale_h\", spatial_scale)\n        .Attr(\"spatial_scale_w\", spatial_scale)\n        .Attr(\"pool_channel\", pooled_channel)\n        .Run();\n\n  } else {\n    OpCommand cmd;\n    cmd.Name(\"RoiPoolingWithArgMax\")\n        .Input(input)\n        .Input(rois)\n        .Input(roi_actual_num)\n        .Output(output)\n        .Output(argmax)\n        .Attr(\"pooled_h\", pooled_height_64)\n        .Attr(\"pooled_w\", pooled_width_64)\n        .Attr(\"spatial_scale_h\", spatial_scale)\n        .Attr(\"spatial_scale_w\", spatial_scale)\n        .Attr(\"pool_channel\", pooled_channel)\n        .Attr(\"_exclude_engines\", (string) \"AiCore\")\n        .Run();\n  }\n}\n\nvoid roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,\n                           Tensor grad_input, int pooled_height,\n                           int pooled_width, float spatial_scale) {\n  int64_t pooled_height_64 = pooled_height;\n  int64_t pooled_width_64 = pooled_width;\n  int64_t pooled_channel = 1;\n  at::Tensor argmax_trans = argmax.transpose(1, 2).transpose(2, 3);\n  at::Tensor grad_output_trans = grad_output.transpose(1, 
2).transpose(2, 3);\n  at::Tensor roi_actual_num =\n      at::empty_like(rois, rois.options().dtype(at::kInt));\n  at::Tensor x = at::ones_like(grad_input).transpose(1, 2).transpose(2, 3);\n  at::Tensor y = at::zeros_like(x);\n  OpCommand cmd;\n  cmd.Name(\"RoiPoolingGradWithArgMax\")\n      .Input(grad_output_trans)\n      .Input(x)\n      .Input(rois)\n      .Input(roi_actual_num)\n      .Input(argmax_trans)\n      .Output(y)\n      .Attr(\"pooled_h\", pooled_height_64)\n      .Attr(\"pooled_w\", pooled_width_64)\n      .Attr(\"spatial_scale_h\", spatial_scale)\n      .Attr(\"spatial_scale_w\", spatial_scale)\n      .Attr(\"pool_channel\", pooled_channel)\n      .Run();\n  at::Tensor result = y.transpose(2, 3).transpose(1, 2);\n  at::Tensor res = result.contiguous();\n  grad_input.copy_(res);\n}\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale);\n\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale);\n\nREGISTER_NPU_IMPL(roi_pool_forward_impl, roi_pool_forward_npu);\nREGISTER_NPU_IMPL(roi_pool_backward_impl, roi_pool_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/roiaware_pool3d_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid roiaware_pool3d_forward_npu(int boxes_num, int pts_num, int channels,\n                                 int max_pts_each_voxel, int out_x, int out_y,\n                                 int out_z, const Tensor rois, const Tensor pts,\n                                 const Tensor pts_feature, Tensor argmax,\n                                 Tensor pts_idx_of_voxels,\n                                 Tensor pooled_features, int pool_method) {\n  at::Tensor rois_cast = rois;\n  at::Tensor pts_cast = pts;\n  at::Tensor pts_feature_cast = pts_feature;\n  at::Tensor pooled_features_cast = pooled_features;\n\n  auto dtype = rois.dtype();\n  if (dtype == at::kHalf) {\n    rois_cast = rois_cast.to(at::kFloat);\n    pts_cast = pts_cast.to(at::kFloat);\n    pts_feature_cast = pts_feature_cast.to(at::kFloat);\n    pooled_features_cast = pooled_features_cast.to(at::kFloat);\n  }\n\n  EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast,\n               pool_method, max_pts_each_voxel, out_x, out_y, out_z, argmax,\n               pts_idx_of_voxels, pooled_features_cast);\n\n  if (dtype == at::kHalf) {\n    pooled_features_cast = pooled_features_cast.to(at::kHalf);\n  }\n\n  pooled_features.copy_(pooled_features_cast);\n}\n\nvoid roiaware_pool3d_backward_npu(int boxes_num, int out_x, int out_y,\n                                  int out_z, int channels,\n                                  int max_pts_each_voxel,\n                                  const Tensor pts_idx_of_voxels,\n                                  const Tensor argmax, const Tensor grad_out,\n                                  Tensor grad_in, int pool_method) {\n  int32_t npoints = grad_in.size(0);\n\n  auto dtype = grad_out.dtype();\n  at::Tensor grad_out_cast = grad_out;\n  at::Tensor grad_in_cast = grad_in;\n\n  if (dtype == at::kHalf) {\n    grad_out_cast = grad_out.to(at::kFloat);\n 
   grad_in_cast = grad_in_cast.to(at::kFloat);\n  }\n\n  if (pool_method == 0) {\n    // maxpool3d\n    EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num,\n                 out_x, out_y, out_z, channels, npoints, grad_in_cast);\n  } else if (pool_method == 1) {\n    // avgpool3d\n    EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast,\n                 boxes_num, out_x, out_y, out_z, channels, npoints,\n                 max_pts_each_voxel, grad_in_cast);\n  }\n\n  if (dtype == at::kHalf) {\n    grad_in_cast = grad_in_cast.to(at::kHalf);\n  }\n\n  grad_in.copy_(grad_in_cast);\n}\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method);\n\nvoid roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method);\n\nREGISTER_NPU_IMPL(roiaware_pool3d_forward_impl, roiaware_pool3d_forward_npu);\nREGISTER_NPU_IMPL(roiaware_pool3d_backward_impl, roiaware_pool3d_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/roipoint_pool3d_forward.cpp",
    "content": "#include \"common_util.h\"\n#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid roipoint_pool3d_forward_impl_npu(int batch_size, int pts_num,\n                                      int boxes_num, int feature_in_len,\n                                      int sampled_pts_num, const Tensor xyz,\n                                      const Tensor boxes3d,\n                                      const Tensor pts_feature,\n                                      Tensor pooled_features,\n                                      Tensor pooled_empty_flag) {\n  auto points_trans = xyz.transpose(1, 2).contiguous();\n  auto point_features_trans = pts_feature.transpose(1, 2).contiguous();\n  c10::SmallVector<int64_t, 8> features_trans_size = {\n      xyz.size(0), boxes3d.size(1), xyz.size(2) + pts_feature.size(2),\n      sampled_pts_num};\n  at::Tensor pooled_features_trans =\n      at::empty(features_trans_size, xyz.options());\n  c10::SmallVector<int64_t, 8> empty_flag_size = {boxes3d.size(0),\n                                                  boxes3d.size(1)};\n  EXEC_NPU_CMD(aclnnRoipointPool3dForward, points_trans, point_features_trans,\n               boxes3d, sampled_pts_num, pooled_features_trans,\n               pooled_empty_flag);\n  auto pooled_features_cache =\n      pooled_features_trans.transpose(2, 3).contiguous();\n  pooled_features.copy_(pooled_features_cache);\n}\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag);\n\nREGISTER_NPU_IMPL(roipoint_pool3d_forward_impl,\n                  roipoint_pool3d_forward_impl_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/rotated_feature_align_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output);\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad);\n\nvoid rotated_feature_align_forward_npu(const Tensor features,\n                                       const Tensor best_bboxes,\n                                       const float spatial_scale,\n                                       const int points, Tensor output) {\n  int64_t points_ = (int64_t)points;\n  at::Tensor best_bboxes_ = best_bboxes.transpose(2, 3).transpose(1, 2);\n  OpCommand cmd;\n  cmd.Name(\"RotatedFeatureAlign\")\n      .Input(features)\n      .Input(best_bboxes_)\n      .Output(output)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"points\", points_)\n      .Run();\n}\n\nvoid rotated_feature_align_backward_npu(const Tensor top_grad,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor bottom_grad) {\n  int64_t points_ = (int64_t)points;\n  at::Tensor best_bboxes_ = best_bboxes.transpose(2, 3).transpose(1, 2);\n  OpCommand cmd;\n  cmd.Name(\"RotatedFeatureAlignGrad\")\n      .Input(top_grad)\n      .Input(best_bboxes_)\n      .Output(bottom_grad)\n      .Attr(\"spatial_scale\", spatial_scale)\n      .Attr(\"points\", points_)\n      .Run();\n}\n\nREGISTER_NPU_IMPL(rotated_feature_align_forward_impl,\n                  
rotated_feature_align_forward_npu);\n\nREGISTER_NPU_IMPL(rotated_feature_align_backward_impl,\n                  rotated_feature_align_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid stack_ball_query_forward_npu(float max_radius, int nsample,\n                                  const Tensor new_xyz,\n                                  const Tensor new_xyz_batch_cnt,\n                                  const Tensor xyz, const Tensor xyz_batch_cnt,\n                                  Tensor idx) {\n  at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous();\n  double max_radius_double = double(max_radius);\n  EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, xyz_batch_cnt,\n               new_xyz_batch_cnt, max_radius_double, nsample, idx);\n}\n\nvoid stack_ball_query_forward_impl(float max_radius, int nsample,\n                                   const Tensor new_xyz,\n                                   const Tensor new_xyz_batch_cnt,\n                                   const Tensor xyz, const Tensor xyz_batch_cnt,\n                                   Tensor idx);\n\nREGISTER_NPU_IMPL(stack_ball_query_forward_impl, stack_ball_query_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/stack_group_points_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid stack_group_points_forward_npu(int b, int c, int n, int nsample,\n                                    const Tensor features_tensor,\n                                    const Tensor features_batch_cnt_tensor,\n                                    const Tensor idx_tensor,\n                                    const Tensor idx_batch_cnt_tensor,\n                                    Tensor out_tensor) {\n  EXEC_NPU_CMD(aclnnStackGroupPoints, features_tensor,\n               features_batch_cnt_tensor, idx_tensor, idx_batch_cnt_tensor,\n               out_tensor);\n}\n\nvoid stack_group_points_forward_impl(int b, int c, int n, int nsample,\n                                     const Tensor features_tensor,\n                                     const Tensor features_batch_cnt_tensor,\n                                     const Tensor idx_tensor,\n                                     const Tensor idx_batch_cnt_tensor,\n                                     Tensor out_tensor);\n\nREGISTER_NPU_IMPL(stack_group_points_forward_impl,\n                  stack_group_points_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n#include \"torch_npu/csrc/aten/NPUNativeFunctions.h\"\n#include \"torch_npu/csrc/framework/utils/OpAdapter.h\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid three_interpolate_forward_npu(int b, int c, int m, int n,\n                                   const Tensor points, const Tensor idx,\n                                   const Tensor weight, Tensor out) {\n  auto originDtype = points.scalar_type();\n  TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),\n              \"three_interpolate_forward ascend only support fp32 and fp16.\");\n\n  auto point_c_trans = points.transpose(1, 2);\n\n  OpCommand cmd;\n  cmd.Name(\"ThreeInterpolate\")\n      .Input(point_c_trans)\n      .Input(idx)\n      .Input(weight)\n      .Output(out)\n      .Run();\n\n  auto output = out.view({b, n, c}).transpose(1, 2);\n  auto res = output.contiguous();\n  out.copy_(res);\n}\n\nvoid three_interpolate_backward_npu(int b, int c, int n, int m,\n                                    const Tensor grad_out, const Tensor idx,\n                                    const Tensor weight, Tensor grad_points) {\n  auto originDtype = grad_out.scalar_type();\n  TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),\n              \"three_interpolate_backward ascend only support fp32 and fp16.\");\n\n  auto grad_x = at::unsqueeze(grad_out, 3);\n  auto grad_y = at::unsqueeze(grad_points, 3);\n\n  EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight, m, grad_y);\n\n  auto output = at::squeeze(grad_y, 3);\n  auto res = output.contiguous();\n  grad_points.copy_(res);\n}\n\nvoid three_interpolate_forward_impl(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out);\n\nvoid three_interpolate_backward_impl(int b, int c, int n, int m,\n                                     const Tensor 
grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points);\n\nREGISTER_NPU_IMPL(three_interpolate_forward_impl,\n                  three_interpolate_forward_npu);\n\nREGISTER_NPU_IMPL(three_interpolate_backward_impl,\n                  three_interpolate_backward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n#include \"torch_npu/csrc/aten/NPUNativeFunctions.h\"\n#include \"torch_npu/csrc/framework/utils/OpAdapter.h\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nvoid three_nn_forward_npu(int b, int n, int m, const Tensor unknown,\n                          const Tensor known, Tensor dist2, Tensor idx) {\n  at::Tensor source = known.contiguous();\n  at::Tensor target = unknown.contiguous();\n\n  bool is_from_knn = false;\n  int nsample = 3;\n  EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx);\n}\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx);\n\nREGISTER_NPU_IMPL(three_nn_forward_impl, three_nn_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp",
    "content": "#include \"pytorch_npu_helper.hpp\"\n\nusing namespace NPU_NAME_SPACE;\nusing namespace std;\n\nint hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim = 3);\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim = 3);\n\nint hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,\n                              at::Tensor &coors,\n                              at::Tensor &num_points_per_voxel,\n                              const std::vector<float> voxel_size,\n                              const std::vector<float> coors_range,\n                              const int max_points, const int max_voxels,\n                              const int NDim = 3) {\n  at::Tensor voxel_num_tmp = at::empty({1}, points.options());\n  at::Tensor voxel_num = voxel_num_tmp.to(at::kInt);\n\n  at::Tensor voxel_size_cpu = at::from_blob(\n      const_cast<float *>(voxel_size.data()), {3}, dtype(at::kFloat));\n  at::Tensor voxel_size_npu = voxel_size_cpu.to(points.device());\n\n  at::Tensor coors_range_cpu = at::from_blob(\n      const_cast<float *>(coors_range.data()), {6}, dtype(at::kFloat));\n  at::Tensor coors_range_npu = coors_range_cpu.to(points.device());\n\n  int64_t max_points_ = (int64_t)max_points;\n  int64_t max_voxels_ = (int64_t)max_voxels;\n\n  // only support true now\n  bool deterministic = true;\n\n  OpCommand cmd;\n  
cmd.Name(\"Voxelization\")\n      .Input(points)\n      .Input(voxel_size_npu)\n      .Input(coors_range_npu)\n      .Output(voxels)\n      .Output(coors)\n      .Output(num_points_per_voxel)\n      .Output(voxel_num)\n      .Attr(\"max_points\", max_points_)\n      .Attr(\"max_voxels\", max_voxels_)\n      .Attr(\"deterministic\", deterministic)\n      .Run();\n  auto voxel_num_cpu = voxel_num.to(at::kCPU);\n  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];\n  return voxel_num_int;\n}\n\nvoid dynamic_voxelize_forward_npu(const at::Tensor &points, at::Tensor &coors,\n                                  const std::vector<float> voxel_size,\n                                  const std::vector<float> coors_range,\n                                  const int NDim = 3) {\n  uint32_t ptsNum = points.size(0);\n  uint32_t ptsFeature = points.size(1);\n  at::Tensor ptsTrans = at::transpose(points, 0, 1);\n  double coors_min_x = coors_range[0];\n  double coors_min_y = coors_range[1];\n  double coors_min_z = coors_range[2];\n  double coors_max_x = coors_range[3];\n  double coors_max_y = coors_range[4];\n  double coors_max_z = coors_range[5];\n  double voxel_x = voxel_size[0];\n  double voxel_y = voxel_size[1];\n  double voxel_z = voxel_size[2];\n  int grid_x = std::round((coors_max_x - coors_min_x) / voxel_x);\n  int grid_y = std::round((coors_max_y - coors_min_y) / voxel_y);\n  int grid_z = std::round((coors_max_z - coors_min_z) / voxel_z);\n\n  at::Tensor tmp_coors =\n      at::zeros({3, ptsNum}, points.options().dtype(at::kInt));\n  EXEC_NPU_CMD(aclnnDynamicVoxelization, ptsTrans, coors_min_x, coors_min_y,\n               coors_min_z, voxel_x, voxel_y, voxel_z, grid_x, grid_y, grid_z,\n               tmp_coors);\n  tmp_coors.transpose_(0, 1);\n  coors.copy_(tmp_coors);\n}\n\nREGISTER_NPU_IMPL(hard_voxelize_forward_impl, hard_voxelize_forward_npu);\nREGISTER_NPU_IMPL(dynamic_voxelize_forward_impl, dynamic_voxelize_forward_npu);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/pixel_group.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// It is modified from https://github.com/WenmuZhou/PAN.pytorch\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nstd::vector<std::vector<float>> pixel_group_impl(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {\n  return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,\n                              kernel_label, kernel_contour, kernel_region_num,\n                              dis_threshold);\n}\n\nstd::vector<std::vector<float>> pixel_group(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float distance_threshold) {\n  score = score.contiguous();\n  mask = mask.contiguous();\n  embedding = embedding.contiguous();\n  kernel_label = kernel_label.contiguous();\n  kernel_contour = kernel_contour.contiguous();\n\n  return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,\n                          kernel_region_num, distance_threshold);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/points_in_boxes.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid points_in_boxes_part_forward_impl(int batch_size, int boxes_num,\n                                       int pts_num, const Tensor boxes,\n                                       const Tensor pts,\n                                       Tensor box_idx_of_points) {\n  DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,\n                       pts_num, boxes, pts, box_idx_of_points);\n}\n\nvoid points_in_boxes_all_forward_impl(int batch_size, int boxes_num,\n                                      int pts_num, const Tensor boxes,\n                                      const Tensor pts,\n                                      Tensor box_idx_of_points) {\n  DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,\n                       pts_num, boxes, pts, box_idx_of_points);\n}\n\nvoid points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                  Tensor box_idx_of_points_tensor) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)\n  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),\n  // default -1\n  int batch_size = boxes_tensor.size(0);\n  int boxes_num = boxes_tensor.size(1);\n  int pts_num = pts_tensor.size(1);\n  points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,\n                                    boxes_tensor, pts_tensor,\n                                    box_idx_of_points_tensor);\n}\n\nvoid points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor box_idx_of_points_tensor) {\n  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR\n  // coordinate, z is the bottom center. 
params pts: (B, npoints, 3) [x, y, z]\n  // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1\n  int batch_size = boxes_tensor.size(0);\n  int boxes_num = boxes_tensor.size(1);\n  int pts_num = pts_tensor.size(1);\n  points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,\n                                   pts_tensor, box_idx_of_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/points_in_polygons.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,\n                                     Tensor output, const int rows,\n                                     const int cols) {\n  DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,\n                       output, rows, cols);\n}\n\nvoid points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {\n  int rows = points.size(0);\n  int cols = polygons.size(0);\n  points_in_polygons_forward_impl(points, polygons, output, rows, cols);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/prroi_pool.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                             int pooled_height, int pooled_width,\n                             float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,\n                       pooled_height, pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_backward_impl(Tensor grad_output, Tensor rois,\n                              Tensor grad_input, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,\n                       pooled_height, pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,\n                                   Tensor input, Tensor rois, Tensor grad_rois,\n                                   int pooled_height, int pooled_width,\n                                   float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,\n                       input, rois, grad_rois, pooled_height, pooled_width,\n                       spatial_scale);\n}\n\nvoid prroi_pool_forward(Tensor input, Tensor rois, Tensor output,\n                        int pooled_height, int pooled_width,\n                        float spatial_scale) {\n  prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,\n                          spatial_scale);\n}\n\nvoid prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,\n                         int pooled_height, int pooled_width,\n                         float spatial_scale) {\n  prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,\n                           pooled_width, spatial_scale);\n}\n\nvoid prroi_pool_coor_backward(Tensor output, 
Tensor grad_output, Tensor input,\n                              Tensor rois, Tensor grad_rois, int pooled_height,\n                              int pooled_width, float spatial_scale) {\n  prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,\n                                pooled_height, pooled_width, spatial_scale);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/psamask.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n// Modified from\n// https://github.com/hszhao/semseg/blob/master/lib/psa/src\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,\n                          const int num_, const int h_feature,\n                          const int w_feature, const int h_mask,\n                          const int w_mask, const int half_h_mask,\n                          const int half_w_mask) {\n  DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,\n                       h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                       half_w_mask);\n}\n\nvoid psamask_backward_impl(const int psa_type, const Tensor grad_output,\n                           Tensor grad_input, const int num_,\n                           const int h_feature, const int w_feature,\n                           const int h_mask, const int w_mask,\n                           const int half_h_mask, const int half_w_mask) {\n  DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,\n                       num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,\n                       half_w_mask);\n}\n\nvoid psamask_forward(const Tensor input, Tensor output, const int psa_type,\n                     const int num_, const int h_feature, const int w_feature,\n                     const int h_mask, const int w_mask, const int half_h_mask,\n                     const int half_w_mask) {\n  psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,\n                       h_mask, w_mask, half_h_mask, half_w_mask);\n}\n\nvoid psamask_backward(Tensor grad_output, const Tensor grad_input,\n                      const int psa_type, const int num_, const int h_feature,\n                      const int w_feature, const int h_mask, const int w_mask,\n                      const int 
half_h_mask, const int half_w_mask) {\n  psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,\n                        w_feature, h_mask, w_mask, half_h_mask, half_w_mask);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/pybind.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include <torch/extension.h>\n\n#include \"pytorch_cpp_helper.hpp\"\n\nstd::string get_compiler_version();\nstd::string get_compiling_cuda_version();\n\nvoid assign_score_withk_forward(const Tensor &points, const Tensor &centers,\n                                const Tensor &scores, const Tensor &knn_idx,\n                                Tensor &output, int B, int N0, int N1, int M,\n                                int K, int O, int aggregate);\n\nvoid assign_score_withk_backward(const Tensor &grad_out, const Tensor &points,\n                                 const Tensor &centers, const Tensor &scores,\n                                 const Tensor &knn_idx, Tensor &grad_points,\n                                 Tensor &grad_centers, Tensor &grad_scores,\n                                 int B, int N0, int N1, int M, int K, int O,\n                                 int aggregate);\n\nvoid carafe_naive_forward(Tensor features, Tensor masks, Tensor output,\n                          int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,\n                           Tensor bottom_grad, Tensor mask_grad,\n                           int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,\n                    Tensor routput, Tensor rmasks, Tensor output,\n                    int kernel_size, int group_size, int scale_factor);\n\nvoid carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,\n                     Tensor rtop_grad, Tensor rbottom_grad_hs,\n                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,\n                     Tensor mask_grad, int kernel_size, int group_size,\n                     int scale_factor);\n\nvoid deform_conv_forward(Tensor input, Tensor weight, Tensor offset,\n                         Tensor output, 
Tensor columns, Tensor ones, int kW,\n                         int kH, int dW, int dH, int padW, int padH,\n                         int dilationW, int dilationH, int group,\n                         int deformable_group, int im2col_step);\n\nvoid deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,\n                                Tensor gradInput, Tensor gradOffset,\n                                Tensor weight, Tensor columns, int kW, int kH,\n                                int dW, int dH, int padW, int padH,\n                                int dilationW, int dilationH, int group,\n                                int deformable_group, int im2col_step);\n\nvoid deform_conv_backward_parameters(Tensor input, Tensor offset,\n                                     Tensor gradOutput, Tensor gradWeight,\n                                     Tensor columns, Tensor ones, int kW,\n                                     int kH, int dW, int dH, int padW, int padH,\n                                     int dilationW, int dilationH, int group,\n                                     int deformable_group, float scale,\n                                     int im2col_step);\n\nvoid deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,\n                             Tensor output, int pooled_height, int pooled_width,\n                             float spatial_scale, int sampling_ratio,\n                             float gamma);\n\nvoid deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,\n                              Tensor offset, Tensor grad_input,\n                              Tensor grad_offset, int pooled_height,\n                              int pooled_width, float spatial_scale,\n                              int sampling_ratio, float gamma);\n\nvoid group_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                          Tensor out_tensor, int b, int c, int n, int npoints,\n                          
int nsample);\n\nvoid group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                           Tensor grad_points_tensor, int b, int c, int n,\n                           int npoints, int nsample);\n\nvoid stack_group_points_forward(Tensor features_tensor,\n                                Tensor features_batch_cnt_tensor,\n                                Tensor idx_tensor, Tensor idx_batch_cnt_tensor,\n                                Tensor out_tensor, int b, int c, int m,\n                                int nsample);\n\nvoid stack_group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                                 Tensor idx_batch_cnt_tensor,\n                                 Tensor features_batch_cnt_tensor,\n                                 Tensor grad_features_tensor, int b, int c,\n                                 int m, int n, int nsample);\n\nvoid roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,\n                             Tensor pooled_features, Tensor pooled_empty_flag);\n\nvoid gather_points_forward(Tensor points_tensor, Tensor idx_tensor,\n                           Tensor out_tensor, int b, int c, int n, int npoints);\n\nvoid gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                            Tensor grad_points_tensor, int b, int c, int n,\n                            int npoints);\n\nvoid sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,\n                                Tensor output, float gamma, float alpha);\n\nvoid sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,\n                                 Tensor grad_input, float gamma, float alpha);\n\nvoid softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,\n                                Tensor output, float gamma, float alpha);\n\nvoid softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,\n                                 Tensor buff, 
Tensor grad_input, float gamma,\n                                 float alpha);\n\nvoid three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,\n                               Tensor weight_tensor, Tensor out_tensor, int b,\n                               int c, int m, int n);\n\nvoid three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                                Tensor weight_tensor, Tensor grad_points_tensor,\n                                int b, int c, int n, int m);\n\nvoid three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,\n                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,\n                      int m);\n\nvoid bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,\n                   const int mode, const bool aligned, const int offset);\n\nvoid knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,\n                 Tensor dist2_tensor, int b, int n, int m, int nsample);\n\nvoid iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,\n                                     Tensor ans_overlap);\n\nvoid iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                         float nms_overlap_thresh);\n\nvoid iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,\n                                float nms_overlap_thresh);\n\nvoid furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,\n                                     Tensor idx_tensor, int b, int n, int m);\n\nvoid furthest_point_sampling_with_dist_forward(Tensor points_tensor,\n                                               Tensor temp_tensor,\n                                               Tensor idx_tensor, int b, int n,\n                                               int m);\n\nvoid masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,\n                           const Tensor mask_w_idx, Tensor col,\n                           
const int kernel_h, const int kernel_w,\n                           const int pad_h, const int pad_w);\n\nvoid masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,\n                           const Tensor mask_w_idx, Tensor im, int height,\n                           int width, int channels);\n\nvoid modulated_deform_conv_forward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,\n    const int stride_h, const int stride_w, const int pad_h, const int pad_w,\n    const int dilation_h, const int dilation_w, const int group,\n    const int deformable_group, const bool with_bias);\n\nvoid modulated_deform_conv_backward(\n    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,\n    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,\n    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,\n    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,\n    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,\n    const bool with_bias);\n\nTensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,\n                              const Tensor &level_start_index,\n                              const Tensor &sampling_loc,\n                              const Tensor &attn_weight, const int im2col_step);\n\nvoid ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,\n                             const Tensor &level_start_index,\n                             const Tensor &sampling_loc,\n                             const Tensor &attn_weight,\n                             const Tensor &grad_output, Tensor &grad_value,\n                             Tensor &grad_sampling_loc,\n                             Tensor &grad_attn_weight, const int im2col_step);\n\nTensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);\n\nTensor 
softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,\n               float sigma, float min_score, int method, int offset);\n\nstd::vector<std::vector<int>> nms_match(Tensor dets, float iou_threshold);\n\nstd::vector<std::vector<float>> pixel_group(\n    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,\n    Tensor kernel_contour, int kernel_region_num, float distance_threshold);\n\nstd::vector<std::vector<int>> contour_expand(Tensor kernel_mask,\n                                             Tensor internal_kernel_label,\n                                             int min_kernel_area,\n                                             int kernel_num);\n\nvoid roi_align_forward(Tensor input, Tensor rois, Tensor output,\n                       Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                       int aligned_width, float spatial_scale,\n                       int sampling_ratio, int pool_mode, bool aligned);\n\nvoid roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                        Tensor argmax_x, Tensor grad_input, int aligned_height,\n                        int aligned_width, float spatial_scale,\n                        int sampling_ratio, int pool_mode, bool aligned);\n\nvoid roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,\n                      int pooled_height, int pooled_width, float spatial_scale);\n\nvoid roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,\n                       Tensor grad_input, int pooled_height, int pooled_width,\n                       float spatial_scale);\n\nvoid sync_bn_forward_mean(const Tensor input, Tensor mean);\n\nvoid sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var);\n\nvoid sync_bn_forward_output(const Tensor input, const Tensor mean,\n                            const Tensor var, const Tensor weight,\n                            const Tensor bias, Tensor running_mean,\n            
                Tensor running_var, Tensor norm, Tensor std,\n                            Tensor output, float eps, float momentum,\n                            int group_size);\n\nvoid sync_bn_backward_param(const Tensor grad_output, const Tensor norm,\n                            Tensor grad_weight, Tensor grad_bias);\n\nvoid sync_bn_backward_data(const Tensor grad_output, const Tensor weight,\n                           const Tensor grad_weight, const Tensor grad_bias,\n                           const Tensor norm, const Tensor std,\n                           Tensor grad_input);\n\nvoid psamask_forward(const Tensor input, Tensor output, const int psa_type,\n                     const int num_, const int h_feature, const int w_feature,\n                     const int h_mask, const int w_mask, const int half_h_mask,\n                     const int half_w_mask);\n\nvoid psamask_backward(Tensor grad_output, const Tensor grad_input,\n                      const int psa_type, const int num_, const int h_feature,\n                      const int w_feature, const int h_mask, const int w_mask,\n                      const int half_h_mask, const int half_w_mask);\n\nvoid tin_shift_forward(Tensor input, Tensor shift, Tensor output);\n\nvoid tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input);\n\nvoid ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,\n                        Tensor idx_tensor, int b, int n, int m,\n                        float min_radius, float max_radius, int nsample);\n\nvoid stack_ball_query_forward(Tensor new_xyz_tensor, Tensor new_xyz_batch_cnt,\n                              Tensor xyz_tensor, Tensor xyz_batch_cnt,\n                              Tensor idx_tensor, float max_radius, int nsample);\n\nvoid prroi_pool_forward(Tensor input, Tensor rois, Tensor output,\n                        int pooled_height, int pooled_width,\n                        float spatial_scale);\n\nvoid prroi_pool_backward(Tensor grad_output, 
Tensor rois, Tensor grad_input,\n                         int pooled_height, int pooled_width,\n                         float spatial_scale);\n\nvoid prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,\n                              Tensor rois, Tensor grad_rois, int pooled_height,\n                              int pooled_width, float spatial_scale);\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_forward(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate <unsigned NDim>\nstd::vector<Tensor> get_indice_pairs_backward(\n    Tensor indices, Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\nTensor indice_conv_forward(Tensor features, Tensor filters, Tensor indicePairs,\n                           Tensor indiceNum, int64_t numActOut,\n                           int64_t _inverse, int64_t _subM);\n\nstd::vector<Tensor> indice_conv_backward(Tensor features, Tensor filters,\n                                         Tensor outGrad, Tensor indicePairs,\n                                         Tensor indiceNum, int64_t _inverse,\n                                         int64_t _subM);\n\nTensor fused_indice_conv_batchnorm_forward(Tensor features, Tensor filters,\n                                           Tensor bias, Tensor indicePairs,\n                                           Tensor indiceNum, int64_t numActOut,\n                                           int64_t 
_inverse, int64_t _subM);\n\nTensor indice_maxpool_forward(Tensor features, Tensor indicePairs,\n                              Tensor indiceNum, int64_t numAct);\n\nTensor indice_maxpool_backward(Tensor features, Tensor outFeatures,\n                               Tensor outGrad, Tensor indicePairs,\n                               Tensor indiceNum);\n\nvoid box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                     const int mode_flag, const bool aligned);\n\nTensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,\n                   const Tensor dets_sorted, const Tensor labels,\n                   const float iou_threshold, const int multi_label);\n\nTensor upfirdn2d(torch::Tensor input, torch::Tensor filter, int upx, int upy,\n                 int downx, int downy, int padx0, int padx1, int pady0,\n                 int pady1, bool flip, float gain);\n\nTensor fused_bias_leakyrelu(const Tensor &input, const Tensor &bias,\n                            const Tensor &refer, int act, int grad, float alpha,\n                            float scale);\n\nvoid roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,\n                               int pooled_height, int pooled_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned, bool clockwise);\n\nvoid roi_align_rotated_backward(Tensor grad_output, Tensor rois,\n                                Tensor grad_input, int pooled_height,\n                                int pooled_width, float spatial_scale,\n                                int sampling_ratio, bool aligned,\n                                bool clockwise);\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const std::string &reduce_type);\n\nvoid dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,\n                                     const 
torch::Tensor &grad_reduced_feats,\n                                     const torch::Tensor &feats,\n                                     const torch::Tensor &reduced_feats,\n                                     const torch::Tensor &coors_idx,\n                                     const torch::Tensor &reduce_count,\n                                     const std::string &reduce_type);\n\nvoid hard_voxelize_forward(const at::Tensor &points,\n                           const at::Tensor &voxel_size,\n                           const at::Tensor &coors_range, at::Tensor &voxels,\n                           at::Tensor &coors, at::Tensor &num_points_per_voxel,\n                           at::Tensor &voxel_num, const int max_points,\n                           const int max_voxels, const int NDim,\n                           const bool deterministic);\n\nvoid dynamic_voxelize_forward(const at::Tensor &points,\n                              const at::Tensor &voxel_size,\n                              const at::Tensor &coors_range, at::Tensor &coors,\n                              const int NDim);\n\nvoid border_align_forward(const Tensor &input, const Tensor &boxes,\n                          Tensor output, Tensor argmax_idx,\n                          const int pool_size);\n\nvoid border_align_backward(const Tensor &grad_output, const Tensor &boxes,\n                           const Tensor &argmax_idx, Tensor grad_input,\n                           const int pool_size);\n\nvoid points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor pts_indices_tensor);\n\nvoid points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                  Tensor box_idx_of_points_tensor);\n\nvoid points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,\n                                 Tensor box_idx_of_points_tensor);\n\nvoid roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,\n      
                       Tensor argmax, Tensor pts_idx_of_voxels,\n                             Tensor pooled_features, int pool_method);\n\nvoid roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,\n                              Tensor grad_out, Tensor grad_in, int pool_method);\n\nvoid correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,\n                         int kW, int patchH, int patchW, int padH, int padW,\n                         int dilationH, int dilationW, int dilation_patchH,\n                         int dilation_patchW, int dH, int dW);\n\nvoid correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,\n                          Tensor grad_input1, Tensor grad_input2, int kH,\n                          int kW, int patchH, int patchW, int padH, int padW,\n                          int dilationH, int dilationW, int dilation_patchH,\n                          int dilation_patchW, int dH, int dW);\n\nvoid rotated_feature_align_forward(const Tensor features,\n                                   const Tensor best_bboxes, Tensor output,\n                                   const float spatial_scale, const int points);\n\nvoid rotated_feature_align_backward(const Tensor top_grad,\n                                    const Tensor best_bboxes,\n                                    Tensor bottom_grad,\n                                    const float spatial_scale,\n                                    const int points);\n\nvoid riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,\n                                 int pooled_height, int pooled_width,\n                                 float spatial_scale, int num_samples,\n                                 int num_orientations, bool clockwise);\n\nvoid riroi_align_rotated_backward(Tensor top_grad, Tensor rois,\n                                  Tensor bottom_grad, int pooled_height,\n                                  int pooled_width, float 
spatial_scale,\n                                  int num_samples, int num_orientations,\n                                  bool clockwise);\n\nvoid points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output);\n\nvoid min_area_polygons(const Tensor pointsets, Tensor polygons);\n\nvoid active_rotated_filter_forward(const Tensor input, const Tensor indices,\n                                   Tensor output);\n\nvoid active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,\n                                    Tensor grad_in);\n\nvoid convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious);\n\nvoid convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output);\n\nat::Tensor diff_iou_rotated_sort_vertices_forward(at::Tensor vertices,\n                                                  at::Tensor mask,\n                                                  at::Tensor num_valid);\n\nvoid chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,\n                              const Tensor dist1, const Tensor dist2,\n                              const Tensor idx1, const Tensor idx);\n\nvoid chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,\n                               Tensor idx1, Tensor idx2, Tensor graddist1,\n                               Tensor graddist2, Tensor gradxyz1,\n                               Tensor gradxyz2);\n\nTensor bias_act(const Tensor &input, const Tensor &bias, const Tensor &xref,\n                const Tensor &yref, const Tensor &dy, int grad, int dim,\n                int act, float alpha, float gain, float clamp);\n\nstd::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(\n    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b,\n    torch::Tensor si, int up, int down, int px0, int px1, int py0, int py1,\n    int sx, int sy, float gain, float slope, float clamp, bool flip_filters,\n    bool writeSigns);\n\ntorch::Tensor 
filtered_lrelu_act_(torch::Tensor x, torch::Tensor si, int sx,\n                                  int sy, float gain, float slope, float clamp,\n                                  bool writeSigns);\n\nvoid box_iou_quadri(const Tensor boxes1, const Tensor boxes2, Tensor ious,\n                    const int mode_flag, const bool aligned);\n\nTensor nms_quadri(const Tensor dets, const Tensor scores, const Tensor order,\n                  const Tensor dets_sorted, const float iou_threshold,\n                  const int multi_label);\n\nvoid bezier_align_forward(Tensor input, Tensor rois, Tensor output,\n                          int aligned_height, int aligned_width,\n                          float spatial_scale, int sampling_ratio,\n                          bool aligned);\n\nvoid bezier_align_backward(Tensor grad_output, Tensor rois, Tensor grad_input,\n                           int aligned_height, int aligned_width,\n                           float spatial_scale, int sampling_ratio,\n                           bool aligned);\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"upfirdn2d\", &upfirdn2d, \"upfirdn2d (CUDA)\", py::arg(\"input\"),\n        py::arg(\"filter\"), py::arg(\"upx\"), py::arg(\"upy\"), py::arg(\"downx\"),\n        py::arg(\"downy\"), py::arg(\"padx0\"), py::arg(\"padx1\"), py::arg(\"pady0\"),\n        py::arg(\"pady1\"), py::arg(\"flip\"), py::arg(\"gain\"));\n  m.def(\"fused_bias_leakyrelu\", &fused_bias_leakyrelu,\n        \"fused_bias_leakyrelu (CUDA)\", py::arg(\"input\"), py::arg(\"bias\"),\n        py::arg(\"empty\"), py::arg(\"act\"), py::arg(\"grad\"), py::arg(\"alpha\"),\n        py::arg(\"scale\"));\n  m.def(\"gather_points_forward\", &gather_points_forward,\n        \"gather_points_forward\", py::arg(\"points_tensor\"),\n        py::arg(\"idx_tensor\"), py::arg(\"out_tensor\"), py::arg(\"b\"),\n        py::arg(\"c\"), py::arg(\"n\"), py::arg(\"npoints\"));\n  m.def(\"gather_points_backward\", &gather_points_backward,\n        
\"gather_points_backward\", py::arg(\"grad_out_tensor\"),\n        py::arg(\"idx_tensor\"), py::arg(\"grad_points_tensor\"), py::arg(\"b\"),\n        py::arg(\"c\"), py::arg(\"n\"), py::arg(\"npoints\"));\n  m.def(\"get_compiler_version\", &get_compiler_version, \"get_compiler_version\");\n  m.def(\"get_compiling_cuda_version\", &get_compiling_cuda_version,\n        \"get_compiling_cuda_version\");\n  m.def(\"assign_score_withk_forward\", &assign_score_withk_forward,\n        \"assign_score_withk_forward\", py::arg(\"points\"), py::arg(\"centers\"),\n        py::arg(\"scores\"), py::arg(\"knn_idx\"), py::arg(\"output\"), py::arg(\"B\"),\n        py::arg(\"N0\"), py::arg(\"N1\"), py::arg(\"M\"), py::arg(\"K\"), py::arg(\"O\"),\n        py::arg(\"aggregate\"));\n  m.def(\"assign_score_withk_backward\", &assign_score_withk_backward,\n        \"assign_score_withk_backward\", py::arg(\"grad_out\"), py::arg(\"points\"),\n        py::arg(\"centers\"), py::arg(\"scores\"), py::arg(\"knn_idx\"),\n        py::arg(\"grad_points\"), py::arg(\"grad_centers\"), py::arg(\"grad_scores\"),\n        py::arg(\"B\"), py::arg(\"N0\"), py::arg(\"N1\"), py::arg(\"M\"), py::arg(\"K\"),\n        py::arg(\"O\"), py::arg(\"aggregate\"));\n  m.def(\"knn_forward\", &knn_forward, \"knn_forward\", py::arg(\"xyz_tensor\"),\n        py::arg(\"new_xyz_tensor\"), py::arg(\"idx_tensor\"),\n        py::arg(\"dist2_tensor\"), py::arg(\"b\"), py::arg(\"n\"), py::arg(\"m\"),\n        py::arg(\"nsample\"));\n  m.def(\"carafe_naive_forward\", &carafe_naive_forward, \"carafe_naive_forward\",\n        py::arg(\"features\"), py::arg(\"masks\"), py::arg(\"output\"),\n        py::arg(\"kernel_size\"), py::arg(\"group_size\"), py::arg(\"scale_factor\"));\n  m.def(\"carafe_naive_backward\", &carafe_naive_backward,\n        \"carafe_naive_backward\", py::arg(\"top_grad\"), py::arg(\"features\"),\n        py::arg(\"masks\"), py::arg(\"bottom_grad\"), py::arg(\"mask_grad\"),\n        py::arg(\"kernel_size\"), 
py::arg(\"group_size\"), py::arg(\"scale_factor\"));\n  m.def(\"carafe_forward\", &carafe_forward, \"carafe_forward\",\n        py::arg(\"features\"), py::arg(\"masks\"), py::arg(\"rfeatures\"),\n        py::arg(\"routput\"), py::arg(\"rmasks\"), py::arg(\"output\"),\n        py::arg(\"kernel_size\"), py::arg(\"group_size\"), py::arg(\"scale_factor\"));\n  m.def(\"carafe_backward\", &carafe_backward, \"carafe_backward\",\n        py::arg(\"top_grad\"), py::arg(\"rfeatures\"), py::arg(\"masks\"),\n        py::arg(\"rtop_grad\"), py::arg(\"rbottom_grad_hs\"),\n        py::arg(\"rbottom_grad\"), py::arg(\"rmask_grad\"), py::arg(\"bottom_grad\"),\n        py::arg(\"mask_grad\"), py::arg(\"kernel_size\"), py::arg(\"group_size\"),\n        py::arg(\"scale_factor\"));\n  m.def(\"deform_conv_forward\", &deform_conv_forward, \"deform_conv_forward\",\n        py::arg(\"input\"), py::arg(\"weight\"), py::arg(\"offset\"),\n        py::arg(\"output\"), py::arg(\"columns\"), py::arg(\"ones\"), py::arg(\"kW\"),\n        py::arg(\"kH\"), py::arg(\"dW\"), py::arg(\"dH\"), py::arg(\"padW\"),\n        py::arg(\"padH\"), py::arg(\"dilationW\"), py::arg(\"dilationH\"),\n        py::arg(\"group\"), py::arg(\"deformable_group\"), py::arg(\"im2col_step\"));\n  m.def(\"deform_conv_backward_input\", &deform_conv_backward_input,\n        \"deform_conv_backward_input\", py::arg(\"input\"), py::arg(\"offset\"),\n        py::arg(\"gradOutput\"), py::arg(\"gradInput\"), py::arg(\"gradOffset\"),\n        py::arg(\"weight\"), py::arg(\"columns\"), py::arg(\"kW\"), py::arg(\"kH\"),\n        py::arg(\"dW\"), py::arg(\"dH\"), py::arg(\"padW\"), py::arg(\"padH\"),\n        py::arg(\"dilationW\"), py::arg(\"dilationH\"), py::arg(\"group\"),\n        py::arg(\"deformable_group\"), py::arg(\"im2col_step\"));\n  m.def(\"deform_conv_backward_parameters\", &deform_conv_backward_parameters,\n        \"deform_conv_backward_parameters\", py::arg(\"input\"), py::arg(\"offset\"),\n        
py::arg(\"gradOutput\"), py::arg(\"gradWeight\"), py::arg(\"columns\"),\n        py::arg(\"ones\"), py::arg(\"kW\"), py::arg(\"kH\"), py::arg(\"dW\"),\n        py::arg(\"dH\"), py::arg(\"padW\"), py::arg(\"padH\"), py::arg(\"dilationW\"),\n        py::arg(\"dilationH\"), py::arg(\"group\"), py::arg(\"deformable_group\"),\n        py::arg(\"scale\"), py::arg(\"im2col_step\"));\n  m.def(\"deform_roi_pool_forward\", &deform_roi_pool_forward,\n        \"deform roi pool forward\", py::arg(\"input\"), py::arg(\"rois\"),\n        py::arg(\"offset\"), py::arg(\"output\"), py::arg(\"pooled_height\"),\n        py::arg(\"pooled_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"sampling_ratio\"), py::arg(\"gamma\"));\n  m.def(\"deform_roi_pool_backward\", &deform_roi_pool_backward,\n        \"deform roi pool backward\", py::arg(\"grad_output\"), py::arg(\"input\"),\n        py::arg(\"rois\"), py::arg(\"offset\"), py::arg(\"grad_input\"),\n        py::arg(\"grad_offset\"), py::arg(\"pooled_height\"),\n        py::arg(\"pooled_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"sampling_ratio\"), py::arg(\"gamma\"));\n  m.def(\"roipoint_pool3d_forward\", &roipoint_pool3d_forward,\n        \"roipoint_pool3d_forward\", py::arg(\"xyz\"), py::arg(\"boxes3d\"),\n        py::arg(\"pts_feature\"), py::arg(\"pooled_features\"),\n        py::arg(\"pooled_empty_flag\"));\n  m.def(\"sigmoid_focal_loss_forward\", &sigmoid_focal_loss_forward,\n        \"sigmoid_focal_loss_forward \", py::arg(\"input\"), py::arg(\"target\"),\n        py::arg(\"weight\"), py::arg(\"output\"), py::arg(\"gamma\"),\n        py::arg(\"alpha\"));\n  m.def(\"sigmoid_focal_loss_backward\", &sigmoid_focal_loss_backward,\n        \"sigmoid_focal_loss_backward\", py::arg(\"input\"), py::arg(\"target\"),\n        py::arg(\"weight\"), py::arg(\"grad_input\"), py::arg(\"gamma\"),\n        py::arg(\"alpha\"));\n  m.def(\"softmax_focal_loss_forward\", &softmax_focal_loss_forward,\n        
\"softmax_focal_loss_forward\", py::arg(\"input\"), py::arg(\"target\"),\n        py::arg(\"weight\"), py::arg(\"output\"), py::arg(\"gamma\"),\n        py::arg(\"alpha\"));\n  m.def(\"softmax_focal_loss_backward\", &softmax_focal_loss_backward,\n        \"softmax_focal_loss_backward\", py::arg(\"input\"), py::arg(\"target\"),\n        py::arg(\"weight\"), py::arg(\"buff\"), py::arg(\"grad_input\"),\n        py::arg(\"gamma\"), py::arg(\"alpha\"));\n  m.def(\"three_interpolate_forward\", &three_interpolate_forward,\n        \"three_interpolate_forward\", py::arg(\"points_tensor\"),\n        py::arg(\"idx_tensor\"), py::arg(\"weight_tensor\"), py::arg(\"out_tensor\"),\n        py::arg(\"b\"), py::arg(\"c\"), py::arg(\"m\"), py::arg(\"n\"));\n  m.def(\"three_interpolate_backward\", &three_interpolate_backward,\n        \"three_interpolate_backward\", py::arg(\"grad_out_tensor\"),\n        py::arg(\"idx_tensor\"), py::arg(\"weight_tensor\"),\n        py::arg(\"grad_points_tensor\"), py::arg(\"b\"), py::arg(\"c\"), py::arg(\"n\"),\n        py::arg(\"m\"));\n  m.def(\"three_nn_forward\", &three_nn_forward, \"three_nn_forward\",\n        py::arg(\"unknown_tensor\"), py::arg(\"known_tensor\"),\n        py::arg(\"dist2_tensor\"), py::arg(\"idx_tensor\"), py::arg(\"b\"),\n        py::arg(\"n\"), py::arg(\"m\"));\n  m.def(\"bbox_overlaps\", &bbox_overlaps, \"bbox_overlaps\", py::arg(\"bboxes1\"),\n        py::arg(\"bboxes2\"), py::arg(\"ious\"), py::arg(\"mode\"),\n        py::arg(\"aligned\"), py::arg(\"offset\"));\n  m.def(\"group_points_forward\", &group_points_forward, \"group_points_forward\",\n        py::arg(\"points_tensor\"), py::arg(\"idx_tensor\"), py::arg(\"out_tensor\"),\n        py::arg(\"b\"), py::arg(\"c\"), py::arg(\"n\"), py::arg(\"npoints\"),\n        py::arg(\"nsample\"));\n  m.def(\"group_points_backward\", &group_points_backward,\n        \"group_points_backward\", py::arg(\"grad_out_tensor\"),\n        py::arg(\"idx_tensor\"), 
py::arg(\"grad_points_tensor\"), py::arg(\"b\"),\n        py::arg(\"c\"), py::arg(\"n\"), py::arg(\"npoints\"), py::arg(\"nsample\"));\n  m.def(\"stack_group_points_forward\", &stack_group_points_forward,\n        \"stack_group_points_forward\", py::arg(\"features_tensor\"),\n        py::arg(\"features_batch_cnt_tensor\"), py::arg(\"idx_tensor\"),\n        py::arg(\"idx_batch_cnt_tensor\"), py::arg(\"out_tensor\"), py::arg(\"b\"),\n        py::arg(\"c\"), py::arg(\"m\"), py::arg(\"nsample\"));\n  m.def(\"stack_group_points_backward\", &stack_group_points_backward,\n        \"stack_group_points_backward\", py::arg(\"grad_out_tensor\"),\n        py::arg(\"idx_tensor\"), py::arg(\"idx_batch_cnt_tensor\"),\n        py::arg(\"features_batch_cnt_tensor\"), py::arg(\"grad_features_tensor\"),\n        py::arg(\"b\"), py::arg(\"c\"), py::arg(\"m\"), py::arg(\"n\"),\n        py::arg(\"nsample\"));\n  m.def(\"knn_forward\", &knn_forward, \"knn_forward\", py::arg(\"b\"), py::arg(\"n\"),\n        py::arg(\"m\"), py::arg(\"nsample\"), py::arg(\"xyz_tensor\"),\n        py::arg(\"new_xyz_tensor\"), py::arg(\"idx_tensor\"),\n        py::arg(\"dist2_tensor\"));\n  m.def(\"iou3d_boxes_overlap_bev_forward\", &iou3d_boxes_overlap_bev_forward,\n        \"iou3d_boxes_overlap_bev_forward\", py::arg(\"boxes_a\"),\n        py::arg(\"boxes_b\"), py::arg(\"ans_iou\"));\n  m.def(\"iou3d_nms3d_forward\", &iou3d_nms3d_forward, \"iou3d_nms3d_forward\",\n        py::arg(\"boxes\"), py::arg(\"keep\"), py::arg(\"num_out\"),\n        py::arg(\"nms_overlap_thresh\"));\n  m.def(\"iou3d_nms3d_normal_forward\", &iou3d_nms3d_normal_forward,\n        \"iou3d_nms3d_normal_forward\", py::arg(\"boxes\"), py::arg(\"keep\"),\n        py::arg(\"num_out\"), py::arg(\"nms_overlap_thresh\"));\n  m.def(\"furthest_point_sampling_forward\", &furthest_point_sampling_forward,\n        \"furthest_point_sampling_forward\", py::arg(\"points_tensor\"),\n        py::arg(\"temp_tensor\"), py::arg(\"idx_tensor\"), 
py::arg(\"b\"),\n        py::arg(\"n\"), py::arg(\"m\"));\n  m.def(\"furthest_point_sampling_with_dist_forward\",\n        &furthest_point_sampling_with_dist_forward,\n        \"furthest_point_sampling_with_dist_forward\", py::arg(\"points_tensor\"),\n        py::arg(\"temp_tensor\"), py::arg(\"idx_tensor\"), py::arg(\"b\"),\n        py::arg(\"n\"), py::arg(\"m\"));\n  m.def(\"masked_im2col_forward\", &masked_im2col_forward,\n        \"masked_im2col_forward\", py::arg(\"im\"), py::arg(\"mask_h_idx\"),\n        py::arg(\"mask_w_idx\"), py::arg(\"col\"), py::arg(\"kernel_h\"),\n        py::arg(\"kernel_w\"), py::arg(\"pad_h\"), py::arg(\"pad_w\"));\n  m.def(\"masked_col2im_forward\", &masked_col2im_forward,\n        \"masked_col2im_forward\", py::arg(\"col\"), py::arg(\"mask_h_idx\"),\n        py::arg(\"mask_w_idx\"), py::arg(\"im\"), py::arg(\"height\"),\n        py::arg(\"width\"), py::arg(\"channels\"));\n  m.def(\"modulated_deform_conv_forward\", &modulated_deform_conv_forward,\n        \"modulated deform conv forward\", py::arg(\"input\"), py::arg(\"weight\"),\n        py::arg(\"bias\"), py::arg(\"ones\"), py::arg(\"offset\"), py::arg(\"mask\"),\n        py::arg(\"output\"), py::arg(\"columns\"), py::arg(\"kernel_h\"),\n        py::arg(\"kernel_w\"), py::arg(\"stride_h\"), py::arg(\"stride_w\"),\n        py::arg(\"pad_h\"), py::arg(\"pad_w\"), py::arg(\"dilation_h\"),\n        py::arg(\"dilation_w\"), py::arg(\"group\"), py::arg(\"deformable_group\"),\n        py::arg(\"with_bias\"));\n  m.def(\"modulated_deform_conv_backward\", &modulated_deform_conv_backward,\n        \"modulated deform conv backward\", py::arg(\"input\"), py::arg(\"weight\"),\n        py::arg(\"bias\"), py::arg(\"ones\"), py::arg(\"offset\"), py::arg(\"mask\"),\n        py::arg(\"columns\"), py::arg(\"grad_input\"), py::arg(\"grad_weight\"),\n        py::arg(\"grad_bias\"), py::arg(\"grad_offset\"), py::arg(\"grad_mask\"),\n        py::arg(\"grad_output\"), py::arg(\"kernel_h\"), 
py::arg(\"kernel_w\"),\n        py::arg(\"stride_h\"), py::arg(\"stride_w\"), py::arg(\"pad_h\"),\n        py::arg(\"pad_w\"), py::arg(\"dilation_h\"), py::arg(\"dilation_w\"),\n        py::arg(\"group\"), py::arg(\"deformable_group\"), py::arg(\"with_bias\"));\n  m.def(\"nms\", &nms, \"nms (CPU/CUDA) \", py::arg(\"boxes\"), py::arg(\"scores\"),\n        py::arg(\"iou_threshold\"), py::arg(\"offset\"));\n  m.def(\"softnms\", &softnms, \"softnms (CPU) \", py::arg(\"boxes\"),\n        py::arg(\"scores\"), py::arg(\"dets\"), py::arg(\"iou_threshold\"),\n        py::arg(\"sigma\"), py::arg(\"min_score\"), py::arg(\"method\"),\n        py::arg(\"offset\"));\n  m.def(\"nms_match\", &nms_match, \"nms_match (CPU) \", py::arg(\"dets\"),\n        py::arg(\"iou_threshold\"));\n  m.def(\"pixel_group\", &pixel_group, \"pixel group (CPU) \", py::arg(\"score\"),\n        py::arg(\"mask\"), py::arg(\"embedding\"), py::arg(\"kernel_label\"),\n        py::arg(\"kernel_contour\"), py::arg(\"kernel_region_label\"),\n        py::arg(\"distance_threshold\"));\n  m.def(\"contour_expand\", &contour_expand, \"contour exapnd (CPU) \",\n        py::arg(\"kernel_mask\"), py::arg(\"internal_kernel_label\"),\n        py::arg(\"min_kernel_area\"), py::arg(\"kernel_num\"));\n  m.def(\"roi_align_forward\", &roi_align_forward, \"roi_align forward\",\n        py::arg(\"input\"), py::arg(\"rois\"), py::arg(\"output\"),\n        py::arg(\"argmax_y\"), py::arg(\"argmax_x\"), py::arg(\"aligned_height\"),\n        py::arg(\"aligned_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"sampling_ratio\"), py::arg(\"pool_mode\"), py::arg(\"aligned\"));\n  m.def(\"roi_align_backward\", &roi_align_backward, \"roi_align backward\",\n        py::arg(\"grad_output\"), py::arg(\"rois\"), py::arg(\"argmax_y\"),\n        py::arg(\"argmax_x\"), py::arg(\"grad_input\"), py::arg(\"aligned_height\"),\n        py::arg(\"aligned_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"sampling_ratio\"), 
py::arg(\"pool_mode\"), py::arg(\"aligned\"));\n  m.def(\"roi_pool_forward\", &roi_pool_forward, \"roi_pool forward\",\n        py::arg(\"input\"), py::arg(\"rois\"), py::arg(\"output\"), py::arg(\"argmax\"),\n        py::arg(\"pooled_height\"), py::arg(\"pooled_width\"),\n        py::arg(\"spatial_scale\"));\n  m.def(\"roi_pool_backward\", &roi_pool_backward, \"roi_pool backward\",\n        py::arg(\"grad_output\"), py::arg(\"rois\"), py::arg(\"argmax\"),\n        py::arg(\"grad_input\"), py::arg(\"pooled_height\"),\n        py::arg(\"pooled_width\"), py::arg(\"spatial_scale\"));\n  m.def(\"sync_bn_forward_mean\", &sync_bn_forward_mean, \"sync_bn forward_mean\",\n        py::arg(\"input\"), py::arg(\"mean\"));\n  m.def(\"sync_bn_forward_var\", &sync_bn_forward_var, \"sync_bn forward_var\",\n        py::arg(\"input\"), py::arg(\"mean\"), py::arg(\"var\"));\n  m.def(\"sync_bn_forward_output\", &sync_bn_forward_output,\n        \"sync_bn forward_output\", py::arg(\"input\"), py::arg(\"mean\"),\n        py::arg(\"var\"), py::arg(\"weight\"), py::arg(\"bias\"),\n        py::arg(\"running_mean\"), py::arg(\"running_var\"), py::arg(\"norm\"),\n        py::arg(\"std\"), py::arg(\"output\"), py::arg(\"eps\"), py::arg(\"momentum\"),\n        py::arg(\"group_size\"));\n  m.def(\"sync_bn_backward_param\", &sync_bn_backward_param,\n        \"sync_bn backward_param\", py::arg(\"grad_output\"), py::arg(\"norm\"),\n        py::arg(\"grad_weight\"), py::arg(\"grad_bias\"));\n  m.def(\"sync_bn_backward_data\", &sync_bn_backward_data,\n        \"sync_bn backward_data\", py::arg(\"grad_output\"), py::arg(\"weight\"),\n        py::arg(\"grad_weight\"), py::arg(\"grad_bias\"), py::arg(\"norm\"),\n        py::arg(\"std\"), py::arg(\"grad_input\"));\n  m.def(\"get_indice_pairs_2d_forward\", &get_indice_pairs_forward<2>,\n        \"get_indice_pairs_2d_forward\", py::arg(\"indices\"), py::arg(\"batchSize\"),\n        py::arg(\"outSpatialShape\"), py::arg(\"spatialShape\"),\n        
py::arg(\"kernelSize\"), py::arg(\"stride\"), py::arg(\"padding\"),\n        py::arg(\"dilation\"), py::arg(\"outPadding\"), py::arg(\"_subM\"),\n        py::arg(\"_transpose\"));\n  m.def(\"get_indice_pairs_3d_forward\", &get_indice_pairs_forward<3>,\n        \"get_indice_pairs_3d_forward\", py::arg(\"indices\"), py::arg(\"batchSize\"),\n        py::arg(\"outSpatialShape\"), py::arg(\"spatialShape\"),\n        py::arg(\"kernelSize\"), py::arg(\"stride\"), py::arg(\"padding\"),\n        py::arg(\"dilation\"), py::arg(\"outPadding\"), py::arg(\"_subM\"),\n        py::arg(\"_transpose\"));\n  m.def(\"get_indice_pairs_4d_forward\", &get_indice_pairs_forward<4>,\n        \"get_indice_pairs_4d_forward\", py::arg(\"indices\"), py::arg(\"batchSize\"),\n        py::arg(\"outSpatialShape\"), py::arg(\"spatialShape\"),\n        py::arg(\"kernelSize\"), py::arg(\"stride\"), py::arg(\"padding\"),\n        py::arg(\"dilation\"), py::arg(\"outPadding\"), py::arg(\"_subM\"),\n        py::arg(\"_transpose\"));\n  m.def(\"get_indice_pairs_2d_backward\", &get_indice_pairs_backward<2>,\n        \"get_indice_pairs_2d_backward\", py::arg(\"indices\"), py::arg(\"gridOut\"),\n        py::arg(\"batchSize\"), py::arg(\"outSpatialShape\"),\n        py::arg(\"spatialShape\"), py::arg(\"kernelSize\"), py::arg(\"stride\"),\n        py::arg(\"padding\"), py::arg(\"dilation\"), py::arg(\"outPadding\"),\n        py::arg(\"_subM\"), py::arg(\"_transpose\"));\n  m.def(\"get_indice_pairs_3d_backward\", &get_indice_pairs_backward<3>,\n        \"get_indice_pairs_3d_backward\", py::arg(\"indices\"), py::arg(\"gridOut\"),\n        py::arg(\"batchSize\"), py::arg(\"outSpatialShape\"),\n        py::arg(\"spatialShape\"), py::arg(\"kernelSize\"), py::arg(\"stride\"),\n        py::arg(\"padding\"), py::arg(\"dilation\"), py::arg(\"outPadding\"),\n        py::arg(\"_subM\"), py::arg(\"_transpose\"));\n  m.def(\"indice_conv_forward\", &indice_conv_forward, \"indice_conv_forward\",\n        
py::arg(\"features\"), py::arg(\"filters\"), py::arg(\"indicePairs\"),\n        py::arg(\"indiceNum\"), py::arg(\"numActOut\"), py::arg(\"_inverse\"),\n        py::arg(\"_subM\"));\n  m.def(\"indice_conv_backward\", &indice_conv_backward, \"indice_conv_backward\",\n        py::arg(\"features\"), py::arg(\"filters\"), py::arg(\"outGrad\"),\n        py::arg(\"indicePairs\"), py::arg(\"indiceNum\"), py::arg(\"_inverse\"),\n        py::arg(\"_subM\"));\n  m.def(\"fused_indice_conv_forward\", &fused_indice_conv_batchnorm_forward,\n        \"fused_indice_conv_forward\", py::arg(\"features\"), py::arg(\"filters\"),\n        py::arg(\"bias\"), py::arg(\"indicePairs\"), py::arg(\"indiceNum\"),\n        py::arg(\"numActOut\"), py::arg(\"_inverse\"), py::arg(\"_subM\"));\n  m.def(\"indice_maxpool_forward\", &indice_maxpool_forward,\n        \"indice_maxpool_forward\", py::arg(\"features\"), py::arg(\"indicePairs\"),\n        py::arg(\"indiceNum\"), py::arg(\"numAct\"));\n  m.def(\"indice_maxpool_backward\", &indice_maxpool_backward,\n        \"indice_maxpool_backward\", py::arg(\"features\"), py::arg(\"outFeatures\"),\n        py::arg(\"outGrad\"), py::arg(\"indicePairs\"), py::arg(\"indiceNum\"));\n  m.def(\"psamask_forward\", &psamask_forward, \"PSAMASK forward (CPU/CUDA)\",\n        py::arg(\"input\"), py::arg(\"output\"), py::arg(\"psa_type\"),\n        py::arg(\"num_\"), py::arg(\"h_feature\"), py::arg(\"w_feature\"),\n        py::arg(\"h_mask\"), py::arg(\"w_mask\"), py::arg(\"half_h_mask\"),\n        py::arg(\"half_w_mask\"));\n  m.def(\"psamask_backward\", &psamask_backward, \"PSAMASK backward (CPU/CUDA)\",\n        py::arg(\"grad_output\"), py::arg(\"grad_input\"), py::arg(\"psa_type\"),\n        py::arg(\"num_\"), py::arg(\"h_feature\"), py::arg(\"w_feature\"),\n        py::arg(\"h_mask\"), py::arg(\"w_mask\"), py::arg(\"half_h_mask\"),\n        py::arg(\"half_w_mask\"));\n  m.def(\"tin_shift_forward\", &tin_shift_forward, \"tin_shift forward\",\n        
py::arg(\"input\"), py::arg(\"shift\"), py::arg(\"output\"));\n  m.def(\"tin_shift_backward\", &tin_shift_backward, \"tin_shift backward\",\n        py::arg(\"grad_output\"), py::arg(\"shift\"), py::arg(\"grad_input\"));\n  m.def(\"box_iou_rotated\", &box_iou_rotated, \"IoU for rotated boxes\",\n        py::arg(\"boxes1\"), py::arg(\"boxes2\"), py::arg(\"ious\"),\n        py::arg(\"mode_flag\"), py::arg(\"aligned\"));\n  m.def(\"nms_rotated\", &nms_rotated, \"NMS for rotated boxes\", py::arg(\"dets\"),\n        py::arg(\"scores\"), py::arg(\"order\"), py::arg(\"dets_sorted\"),\n        py::arg(\"labels\"), py::arg(\"iou_threshold\"), py::arg(\"multi_label\"));\n  m.def(\"ball_query_forward\", &ball_query_forward, \"ball_query_forward\",\n        py::arg(\"new_xyz_tensor\"), py::arg(\"xyz_tensor\"), py::arg(\"idx_tensor\"),\n        py::arg(\"b\"), py::arg(\"n\"), py::arg(\"m\"), py::arg(\"min_radius\"),\n        py::arg(\"max_radius\"), py::arg(\"nsample\"));\n  m.def(\"stack_ball_query_forward\", &stack_ball_query_forward,\n        \"stack_ball_query_forward\", py::arg(\"new_xyz_tensor\"),\n        py::arg(\"new_xyz_batch_cnt\"), py::arg(\"xyz_tensor\"),\n        py::arg(\"xyz_batch_cnt\"), py::arg(\"idx_tensor\"), py::arg(\"max_radius\"),\n        py::arg(\"nsample\"));\n  m.def(\"roi_align_rotated_forward\", &roi_align_rotated_forward,\n        \"roi_align_rotated forward\", py::arg(\"input\"), py::arg(\"rois\"),\n        py::arg(\"output\"), py::arg(\"pooled_height\"), py::arg(\"pooled_width\"),\n        py::arg(\"spatial_scale\"), py::arg(\"sampling_ratio\"), py::arg(\"aligned\"),\n        py::arg(\"clockwise\"));\n  m.def(\"roi_align_rotated_backward\", &roi_align_rotated_backward,\n        \"roi_align_rotated backward\", py::arg(\"rois\"), py::arg(\"grad_input\"),\n        py::arg(\"grad_output\"), py::arg(\"pooled_height\"),\n        py::arg(\"pooled_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"sampling_ratio\"), py::arg(\"aligned\"), 
py::arg(\"clockwise\"));\n  m.def(\"dynamic_point_to_voxel_forward\", &dynamic_point_to_voxel_forward,\n        \"dynamic_point_to_voxel_forward\", py::arg(\"feats\"), py::arg(\"coors\"),\n        py::arg(\"reduce_type\"));\n  m.def(\"dynamic_point_to_voxel_backward\", &dynamic_point_to_voxel_backward,\n        \"dynamic_point_to_voxel_backward\", py::arg(\"grad_feats\"),\n        py::arg(\"grad_reduced_feats\"), py::arg(\"feats\"),\n        py::arg(\"reduced_feats\"), py::arg(\"coors_idx\"), py::arg(\"reduce_count\"),\n        py::arg(\"reduce_type\"));\n  m.def(\"hard_voxelize_forward\", &hard_voxelize_forward,\n        \"hard_voxelize_forward\", py::arg(\"points\"), py::arg(\"voxel_size\"),\n        py::arg(\"coors_range\"), py::arg(\"voxels\"), py::arg(\"coors\"),\n        py::arg(\"num_points_per_voxel\"), py::arg(\"voxel_num\"),\n        py::arg(\"max_points\"), py::arg(\"max_voxels\"), py::arg(\"NDim\"),\n        py::arg(\"deterministic\"));\n  m.def(\"dynamic_voxelize_forward\", &dynamic_voxelize_forward,\n        \"dynamic_voxelize_forward\", py::arg(\"points\"), py::arg(\"voxel_size\"),\n        py::arg(\"coors_range\"), py::arg(\"coors\"), py::arg(\"NDim\"));\n  m.def(\"ms_deform_attn_forward\", &ms_deform_attn_forward,\n        \"forward function of multi-scale deformable attention\",\n        py::arg(\"value\"), py::arg(\"value_spatial_shapes\"),\n        py::arg(\"value_level_start_index\"), py::arg(\"sampling_locations\"),\n        py::arg(\"attention_weights\"), py::arg(\"im2col_step\"));\n  m.def(\"ms_deform_attn_backward\", &ms_deform_attn_backward,\n        \"backward function of multi-scale deformable attention\",\n        py::arg(\"value\"), py::arg(\"value_spatial_shapes\"),\n        py::arg(\"value_level_start_index\"), py::arg(\"sampling_locations\"),\n        py::arg(\"attention_weights\"), py::arg(\"grad_output\"),\n        py::arg(\"grad_value\"), py::arg(\"grad_sampling_loc\"),\n        py::arg(\"grad_attn_weight\"), 
py::arg(\"im2col_step\"));\n  m.def(\"border_align_forward\", &border_align_forward,\n        \"forward function of border_align\", py::arg(\"input\"), py::arg(\"boxes\"),\n        py::arg(\"output\"), py::arg(\"argmax_idx\"), py::arg(\"pool_size\"));\n  m.def(\"border_align_backward\", &border_align_backward,\n        \"backward function of border_align\", py::arg(\"grad_output\"),\n        py::arg(\"boxes\"), py::arg(\"argmax_idx\"), py::arg(\"grad_input\"),\n        py::arg(\"pool_size\"));\n  m.def(\"correlation_forward\", &correlation_forward, \"Correlation forward\",\n        py::arg(\"input1\"), py::arg(\"input2\"), py::arg(\"output\"), py::arg(\"kH\"),\n        py::arg(\"kW\"), py::arg(\"patchH\"), py::arg(\"patchW\"), py::arg(\"padH\"),\n        py::arg(\"padW\"), py::arg(\"dilationH\"), py::arg(\"dilationW\"),\n        py::arg(\"dilation_patchH\"), py::arg(\"dilation_patchW\"), py::arg(\"dH\"),\n        py::arg(\"dW\"));\n  m.def(\"correlation_backward\", &correlation_backward, \"Correlation backward\",\n        py::arg(\"grad_output\"), py::arg(\"input1\"), py::arg(\"input2\"),\n        py::arg(\"grad_input1\"), py::arg(\"grad_input2\"), py::arg(\"kH\"),\n        py::arg(\"kW\"), py::arg(\"patchH\"), py::arg(\"patchW\"), py::arg(\"padH\"),\n        py::arg(\"padW\"), py::arg(\"dilationH\"), py::arg(\"dilationW\"),\n        py::arg(\"dilation_patchH\"), py::arg(\"dilation_patchW\"), py::arg(\"dH\"),\n        py::arg(\"dW\"));\n  m.def(\"points_in_boxes_cpu_forward\", &points_in_boxes_cpu_forward,\n        \"points_in_boxes_cpu_forward\", py::arg(\"boxes_tensor\"),\n        py::arg(\"pts_tensor\"), py::arg(\"pts_indices_tensor\"));\n  m.def(\"points_in_boxes_part_forward\", &points_in_boxes_part_forward,\n        \"points_in_boxes_part_forward\", py::arg(\"boxes_tensor\"),\n        py::arg(\"pts_tensor\"), py::arg(\"box_idx_of_points_tensor\"));\n  m.def(\"points_in_boxes_all_forward\", &points_in_boxes_all_forward,\n        
\"points_in_boxes_all_forward\", py::arg(\"boxes_tensor\"),\n        py::arg(\"pts_tensor\"), py::arg(\"box_idx_of_points_tensor\"));\n  m.def(\"roiaware_pool3d_forward\", &roiaware_pool3d_forward,\n        \"roiaware_pool3d_forward\", py::arg(\"rois\"), py::arg(\"pts\"),\n        py::arg(\"pts_feature\"), py::arg(\"argmax\"), py::arg(\"pts_idx_of_voxels\"),\n        py::arg(\"pooled_features\"), py::arg(\"pool_method\"));\n  m.def(\"roiaware_pool3d_backward\", &roiaware_pool3d_backward,\n        \"roiaware_pool3d_backward\", py::arg(\"pts_idx_of_voxels\"),\n        py::arg(\"argmax\"), py::arg(\"grad_out\"), py::arg(\"grad_in\"),\n        py::arg(\"pool_method\"));\n  m.def(\"rotated_feature_align_forward\", &rotated_feature_align_forward,\n        \"Feature Refine forward (CUDA)\", py::arg(\"features\"),\n        py::arg(\"best_bboxes\"), py::arg(\"output\"), py::arg(\"spatial_scale\"),\n        py::arg(\"points\"));\n  m.def(\"rotated_feature_align_backward\", &rotated_feature_align_backward,\n        \"Feature Refine backward (CUDA)\", py::arg(\"top_grad\"),\n        py::arg(\"best_bboxes\"), py::arg(\"bottom_grad\"),\n        py::arg(\"spatial_scale\"), py::arg(\"points\"));\n  m.def(\"riroi_align_rotated_forward\", &riroi_align_rotated_forward,\n        \"riroi_align_rotated forward\", py::arg(\"features\"), py::arg(\"rois\"),\n        py::arg(\"output\"), py::arg(\"pooled_height\"), py::arg(\"pooled_width\"),\n        py::arg(\"spatial_scale\"), py::arg(\"num_samples\"),\n        py::arg(\"num_orientations\"), py::arg(\"clockwise\"));\n  m.def(\"riroi_align_rotated_backward\", &riroi_align_rotated_backward,\n        \"riroi_align_rotated backward\", py::arg(\"top_grad\"), py::arg(\"rois\"),\n        py::arg(\"bottom_grad\"), py::arg(\"pooled_height\"),\n        py::arg(\"pooled_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"num_samples\"), py::arg(\"num_orientations\"),\n        py::arg(\"clockwise\"));\n  m.def(\"points_in_polygons_forward\", 
&points_in_polygons_forward,\n        \"points_in_polygons_forward\", py::arg(\"points\"), py::arg(\"polygons\"),\n        py::arg(\"output\"));\n  m.def(\"min_area_polygons\", &min_area_polygons, \"min_area_polygons\",\n        py::arg(\"pointsets\"), py::arg(\"polygons\"));\n  m.def(\"active_rotated_filter_forward\", &active_rotated_filter_forward,\n        \"active_rotated_filter_forward\", py::arg(\"input\"), py::arg(\"indices\"),\n        py::arg(\"output\"));\n  m.def(\"active_rotated_filter_backward\", &active_rotated_filter_backward,\n        \"active_rotated_filter_backward\", py::arg(\"grad_out\"),\n        py::arg(\"indices\"), py::arg(\"grad_in\"));\n  m.def(\"convex_iou\", &convex_iou, \"convex_iou\", py::arg(\"pointsets\"),\n        py::arg(\"polygons\"), py::arg(\"ious\"));\n  m.def(\"convex_giou\", &convex_giou, \"convex_giou\", py::arg(\"pointsets\"),\n        py::arg(\"polygons\"), py::arg(\"output\"));\n  m.def(\"diff_iou_rotated_sort_vertices_forward\",\n        &diff_iou_rotated_sort_vertices_forward,\n        \"diff_iou_rotated_sort_vertices_forward\", py::arg(\"vertices\"),\n        py::arg(\"mask\"), py::arg(\"num_valid\"));\n  m.def(\"chamfer_distance_forward\", &chamfer_distance_forward,\n        \"chamfer_distance_forward\", py::arg(\"xyz1\"), py::arg(\"xyz2\"),\n        py::arg(\"dist1\"), py::arg(\"dist2\"), py::arg(\"idx1\"), py::arg(\"idx2\"));\n  m.def(\"chamfer_distance_backward\", &chamfer_distance_backward,\n        \"chamfer_distance_backward\", py::arg(\"xyz1\"), py::arg(\"xyz2\"),\n        py::arg(\"idx1\"), py::arg(\"idx2\"), py::arg(\"graddist1\"),\n        py::arg(\"graddist2\"), py::arg(\"gradxyz1\"), py::arg(\"gradxyz2\"));\n  m.def(\"prroi_pool_forward\", &prroi_pool_forward, \"prroi_pool forward\",\n        py::arg(\"input\"), py::arg(\"rois\"), py::arg(\"output\"),\n        py::arg(\"pooled_height\"), py::arg(\"pooled_width\"),\n        py::arg(\"spatial_scale\"));\n  m.def(\"prroi_pool_backward\", &prroi_pool_backward, 
\"prroi_pool_backward\",\n        py::arg(\"grad_output\"), py::arg(\"rois\"), py::arg(\"grad_input\"),\n        py::arg(\"pooled_height\"), py::arg(\"pooled_width\"),\n        py::arg(\"spatial_scale\"));\n  m.def(\"prroi_pool_coor_backward\", &prroi_pool_coor_backward,\n        \"prroi_pool_coor_backward\", py::arg(\"output\"), py::arg(\"grad_output\"),\n        py::arg(\"input\"), py::arg(\"rois\"), py::arg(\"grad_rois\"),\n        py::arg(\"pooled_height\"), py::arg(\"pooled_width\"),\n        py::arg(\"spatial_scale\"));\n  m.def(\"bias_act\", &bias_act, \"bias_act (CUDA)\", py::arg(\"input\"),\n        py::arg(\"bias\"), py::arg(\"xref\"), py::arg(\"yref\"), py::arg(\"dy\"),\n        py::arg(\"grad\"), py::arg(\"dim\"), py::arg(\"act\"), py::arg(\"alpha\"),\n        py::arg(\"gain\"), py::arg(\"clamp\"));\n  m.def(\"filtered_lrelu\", &filtered_lrelu, \"filtered_lrelu (CUDA)\",\n        py::arg(\"x\"), py::arg(\"fu\"), py::arg(\"fd\"), py::arg(\"b\"), py::arg(\"si\"),\n        py::arg(\"up\"), py::arg(\"down\"), py::arg(\"px0\"), py::arg(\"px1\"),\n        py::arg(\"py0\"), py::arg(\"py1\"), py::arg(\"sx\"), py::arg(\"sy\"),\n        py::arg(\"gain\"), py::arg(\"slope\"), py::arg(\"clamp\"),\n        py::arg(\"flip_filters\"), py::arg(\"writeSigns\"));\n  m.def(\"filtered_lrelu_act_\", &filtered_lrelu_act_,\n        \"filtered_lrelu_act_ (CUDA)\", py::arg(\"x\"), py::arg(\"si\"),\n        py::arg(\"sx\"), py::arg(\"sy\"), py::arg(\"gain\"), py::arg(\"slope\"),\n        py::arg(\"clamp\"), py::arg(\"writeSigns\"));\n  m.def(\"box_iou_quadri\", &box_iou_quadri, \"IoU for quadrilateral boxes\",\n        py::arg(\"boxes1\"), py::arg(\"boxes2\"), py::arg(\"ious\"),\n        py::arg(\"mode_flag\"), py::arg(\"aligned\"));\n  m.def(\"nms_quadri\", &nms_quadri, \"NMS for quadrilateral boxes\",\n        py::arg(\"dets\"), py::arg(\"scores\"), py::arg(\"order\"),\n        py::arg(\"dets_sorted\"), py::arg(\"iou_threshold\"),\n        py::arg(\"multi_label\"));\n  
m.def(\"bezier_align_forward\", &bezier_align_forward, \"bezier_align forward\",\n        py::arg(\"input\"), py::arg(\"rois\"), py::arg(\"output\"),\n        py::arg(\"aligned_height\"), py::arg(\"aligned_width\"),\n        py::arg(\"spatial_scale\"), py::arg(\"sampling_ratio\"),\n        py::arg(\"aligned\"));\n  m.def(\"bezier_align_backward\", &bezier_align_backward,\n        \"bezier_align backward\", py::arg(\"grad_output\"), py::arg(\"rois\"),\n        py::arg(\"grad_input\"), py::arg(\"aligned_height\"),\n        py::arg(\"aligned_width\"), py::arg(\"spatial_scale\"),\n        py::arg(\"sampling_ratio\"), py::arg(\"aligned\"));\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/riroi_align_rotated.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid riroi_align_rotated_forward_impl(Tensor features, Tensor rois,\n                                      Tensor output, int pooled_height,\n                                      int pooled_width, float spatial_scale,\n                                      int num_samples, int num_orientations,\n                                      bool clockwise) {\n  DISPATCH_DEVICE_IMPL(riroi_align_rotated_forward_impl, features, rois, output,\n                       pooled_height, pooled_width, spatial_scale, num_samples,\n                       num_orientations, clockwise);\n}\n\nvoid riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                       Tensor bottom_grad, int pooled_height,\n                                       int pooled_width, float spatial_scale,\n                                       int num_samples, int num_orientations,\n                                       bool clockwise) {\n  DISPATCH_DEVICE_IMPL(riroi_align_rotated_backward_impl, top_grad, rois,\n                       bottom_grad, pooled_height, pooled_width, spatial_scale,\n                       num_samples, num_orientations, clockwise);\n}\n\nvoid riroi_align_rotated_forward(Tensor features, Tensor rois, Tensor output,\n                                 int pooled_height, int pooled_width,\n                                 float spatial_scale, int num_samples,\n                                 int num_orientations, bool clockwise) {\n  riroi_align_rotated_forward_impl(features, rois, output, pooled_height,\n                                   pooled_width, spatial_scale, num_samples,\n                                   num_orientations, clockwise);\n}\n\nvoid riroi_align_rotated_backward(Tensor top_grad, Tensor rois,\n                                  Tensor bottom_grad, int pooled_height,\n                         
         int pooled_width, float spatial_scale,\n                                  int num_samples, int num_orientations,\n                                  bool clockwise) {\n  riroi_align_rotated_backward_impl(top_grad, rois, bottom_grad, pooled_height,\n                                    pooled_width, spatial_scale, num_samples,\n                                    num_orientations, clockwise);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/roi_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#ifdef MMCV_WITH_DIOPI\n#include <diopi/diopirt.h>\n#include <diopi/functions.h>\n#include <diopi/functions_mmcv.h>\n#include <torch/csrc/utils/pybind.h>\n\n#include \"csrc_dipu/base/basedef.h\"\n#include \"csrc_dipu/diopirt/diopirt_impl.h\"\n#include \"csrc_dipu/runtime/device/deviceapis.h\"\n#include \"csrc_dipu/utils/helpfunc.hpp\"\n\nusing dipu::VENDOR_TYPE;\nusing dipu::diopi_helper::toDiopiScalar;\nusing dipu::diopi_helper::toDiopiTensorHandle;\n#endif\n\nvoid roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,\n                            Tensor argmax_y, Tensor argmax_x,\n                            int aligned_height, int aligned_width,\n                            float spatial_scale, int sampling_ratio,\n                            int pool_mode, bool aligned) {\n  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,\n                       argmax_x, aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, pool_mode, aligned);\n}\n\nvoid roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                             Tensor argmax_x, Tensor grad_input,\n                             int aligned_height, int aligned_width,\n                             float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned) {\n  DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,\n                       argmax_x, grad_input, aligned_height, aligned_width,\n                       spatial_scale, sampling_ratio, pool_mode, aligned);\n}\n\n#ifdef MMCV_WITH_DIOPI\nvoid roi_align_forward_diopi(Tensor input, Tensor rois, Tensor output,\n                             Tensor argmax_y, Tensor argmax_x,\n                             int aligned_height, int aligned_width,\n                     
        float spatial_scale, int sampling_ratio,\n                             int pool_mode, bool aligned) {\n  auto input_p = toDiopiTensorHandle(input);\n  diopiDevice_t device;\n  diopiGetTensorDevice(input_p, &device);\n  if (device == diopi_host) {\n    roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,\n                           aligned_height, aligned_width, spatial_scale,\n                           sampling_ratio, pool_mode, aligned);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto rois_p = toDiopiTensorHandle(rois);\n  auto out_p = toDiopiTensorHandle(output);\n  auto argmax_y_p = toDiopiTensorHandle(argmax_y);\n  auto argmax_x_p = toDiopiTensorHandle(argmax_x);\n  bool is_mock_cuda = input.device().type() == dipu::DIPU_DEVICE_TYPE;\n  if (is_mock_cuda && reinterpret_cast<void *>(diopiRoiAlignMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiRoiAlignMmcv(\n          ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,\n          aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiRoiAlignMmcv(\n          ch, out_p, argmax_y_p, argmax_x_p, input_p, rois_p, aligned_height,\n          aligned_width, sampling_ratio, pool_mode, spatial_scale, aligned);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op roi_align_forward\";\n  auto input_cpu = input.cpu();\n  auto rois_cpu = rois.cpu();\n  auto out_cpu = output.cpu();\n  auto argmax_y_cpu = argmax_y.cpu();\n  auto argmax_x_cpu = argmax_x.cpu();\n  roi_align_forward_impl(input_cpu, rois_cpu, out_cpu, argmax_y_cpu,\n                         argmax_x_cpu, aligned_height, aligned_width,\n                         spatial_scale, sampling_ratio, pool_mode, aligned);\n  
output.copy_(out_cpu);\n}\n\nvoid roi_align_backward_diopi(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                              Tensor argmax_x, Tensor grad_input,\n                              int aligned_height, int aligned_width,\n                              float spatial_scale, int sampling_ratio,\n                              int pool_mode, bool aligned) {\n  auto grad_output_ = toDiopiTensorHandle(grad_output);\n  diopiDevice_t device;\n  diopiGetTensorDevice(grad_output_, &device);\n  if (device == diopi_host) {\n    roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,\n                            aligned_height, aligned_width, spatial_scale,\n                            sampling_ratio, pool_mode, aligned);\n    return;\n  }\n  auto rois_ = toDiopiTensorHandle(rois);\n  auto argmax_y_ = toDiopiTensorHandle(argmax_y);\n  auto argmax_x_ = toDiopiTensorHandle(argmax_x);\n  auto grad_input_ = toDiopiTensorHandle(grad_input);\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  bool is_mock_cuda = grad_output.device().type() == dipu::DIPU_DEVICE_TYPE;\n  if (is_mock_cuda &&\n      reinterpret_cast<void *>(diopiRoiAlignBackwardMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiRoiAlignBackwardMmcv(ch, grad_input_, grad_output_, rois_,\n                                           argmax_y_, argmax_x_, aligned_height,\n                                           aligned_width, sampling_ratio,\n                                           pool_mode, spatial_scale, aligned);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiRoiAlignBackwardMmcv(ch, grad_input_, grad_output_, rois_,\n                                           argmax_y_, argmax_x_, aligned_height,\n                                           aligned_width, sampling_ratio,\n                      
                     pool_mode, spatial_scale, aligned);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op roi_align_backward\";\n  auto grad_output_cpu = grad_output.cpu();\n  auto rois_cpu = rois.cpu();\n  auto argmax_y_cpu = argmax_y.cpu();\n  auto argmax_x_cpu = argmax_x.cpu();\n  auto grad_input_cpu = grad_input.cpu();\n  roi_align_backward_impl(grad_output_cpu, rois_cpu, argmax_y_cpu, argmax_x_cpu,\n                          grad_input_cpu, aligned_height, aligned_width,\n                          spatial_scale, sampling_ratio, pool_mode, aligned);\n  grad_input.copy_(grad_input_cpu);\n}\n#endif\n\nvoid roi_align_forward(Tensor input, Tensor rois, Tensor output,\n                       Tensor argmax_y, Tensor argmax_x, int aligned_height,\n                       int aligned_width, float spatial_scale,\n                       int sampling_ratio, int pool_mode, bool aligned) {\n#ifdef MMCV_WITH_DIOPI\n  roi_align_forward_diopi(input, rois, output, argmax_y, argmax_x,\n                          aligned_height, aligned_width, spatial_scale,\n                          sampling_ratio, pool_mode, aligned);\n#else\n  roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,\n                         aligned_height, aligned_width, spatial_scale,\n                         sampling_ratio, pool_mode, aligned);\n#endif\n}\n\nvoid roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,\n                        Tensor argmax_x, Tensor grad_input, int aligned_height,\n                        int aligned_width, float spatial_scale,\n                        int sampling_ratio, int pool_mode, bool aligned) {\n#ifdef MMCV_WITH_DIOPI\n  roi_align_backward_diopi(grad_output, rois, argmax_y, argmax_x, grad_input,\n                           aligned_height, aligned_width, spatial_scale,\n                           sampling_ratio, pool_mode, aligned);\n#else\n  roi_align_backward_impl(grad_output, rois, argmax_y, 
argmax_x, grad_input,\n                          aligned_height, aligned_width, spatial_scale,\n                          sampling_ratio, pool_mode, aligned);\n#endif\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/roi_align_rotated.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,\n                                    int aligned_height, int aligned_width,\n                                    float spatial_scale, int sampling_ratio,\n                                    bool aligned, bool clockwise) {\n  DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, input, rois, output,\n                       aligned_height, aligned_width, spatial_scale,\n                       sampling_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,\n                                     Tensor bottom_grad, int aligned_height,\n                                     int aligned_width, float spatial_scale,\n                                     int sampling_ratio, bool aligned,\n                                     bool clockwise) {\n  DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,\n                       bottom_grad, aligned_height, aligned_width,\n                       spatial_scale, sampling_ratio, aligned, clockwise);\n}\n\nvoid roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,\n                               int aligned_height, int aligned_width,\n                               float spatial_scale, int sampling_ratio,\n                               bool aligned, bool clockwise) {\n  roi_align_rotated_forward_impl(input, rois, output, aligned_height,\n                                 aligned_width, spatial_scale, sampling_ratio,\n                                 aligned, clockwise);\n}\n\nvoid roi_align_rotated_backward(Tensor top_grad, Tensor rois,\n                                Tensor bottom_grad, int aligned_height,\n                                int aligned_width, float spatial_scale,\n                                int sampling_ratio, bool 
aligned,\n                                bool clockwise) {\n  roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,\n                                  aligned_width, spatial_scale, sampling_ratio,\n                                  aligned, clockwise);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/roi_pool.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,\n                           Tensor argmax, int pooled_height, int pooled_width,\n                           float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,\n                       pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,\n                            Tensor grad_input, int pooled_height,\n                            int pooled_width, float spatial_scale) {\n  DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,\n                       grad_input, pooled_height, pooled_width, spatial_scale);\n}\n\nvoid roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,\n                      int pooled_height, int pooled_width,\n                      float spatial_scale) {\n  roi_pool_forward_impl(input, rois, output, argmax, pooled_height,\n                        pooled_width, spatial_scale);\n}\n\nvoid roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,\n                       Tensor grad_input, int pooled_height, int pooled_width,\n                       float spatial_scale) {\n  roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,\n                         pooled_width, spatial_scale);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/roiaware_pool3d.cpp",
    "content": "#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,\n                                  int max_pts_each_voxel, int out_x, int out_y,\n                                  int out_z, const Tensor rois,\n                                  const Tensor pts, const Tensor pts_feature,\n                                  Tensor argmax, Tensor pts_idx_of_voxels,\n                                  Tensor pooled_features, int pool_method) {\n  DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,\n                       channels, max_pts_each_voxel, out_x, out_y, out_z, rois,\n                       pts, pts_feature, argmax, pts_idx_of_voxels,\n                       pooled_features, pool_method);\n}\n\nvoid roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,\n                                   int out_z, int channels,\n                                   int max_pts_each_voxel,\n                                   const Tensor pts_idx_of_voxels,\n                                   const Tensor argmax, const Tensor grad_out,\n                                   Tensor grad_in, int pool_method) {\n  DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,\n                       out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,\n                       argmax, grad_out, grad_in, pool_method);\n}\n\nvoid roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,\n                             Tensor argmax, Tensor pts_idx_of_voxels,\n                             Tensor pooled_features, int pool_method) {\n  // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR\n  // coordinate\n  // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate\n  // params pts_feature: (npoints, C)\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, 
max_pts_each_voxel)\n  // params pooled_features: (N, out_x, out_y, out_z, C)\n  // params pool_method: 0: max_pool 1: avg_pool\n  int boxes_num = rois.size(0);\n  int pts_num = pts.size(0);\n  int channels = pts_feature.size(1);\n  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter\n  int out_x = pts_idx_of_voxels.size(1);\n  int out_y = pts_idx_of_voxels.size(2);\n  int out_z = pts_idx_of_voxels.size(3);\n  assert((out_x < 256) && (out_y < 256) &&\n         (out_z < 256));  // we encode index with 8bit\n\n  roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,\n                               out_x, out_y, out_z, rois, pts, pts_feature,\n                               argmax, pts_idx_of_voxels, pooled_features,\n                               pool_method);\n}\n\nvoid roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,\n                              Tensor grad_out, Tensor grad_in,\n                              int pool_method) {\n  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)\n  // params argmax: (N, out_x, out_y, out_z, C)\n  // params grad_out: (N, out_x, out_y, out_z, C)\n  // params grad_in: (npoints, C), return value\n  // params pool_method: 0: max_pool 1: avg_pool\n  int boxes_num = pts_idx_of_voxels.size(0);\n  int out_x = pts_idx_of_voxels.size(1);\n  int out_y = pts_idx_of_voxels.size(2);\n  int out_z = pts_idx_of_voxels.size(3);\n  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter\n  int channels = grad_out.size(4);\n\n  roiaware_pool3d_backward_impl(boxes_num, out_x, out_y, out_z, channels,\n                                max_pts_each_voxel, pts_idx_of_voxels, argmax,\n                                grad_out, grad_in, pool_method);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/roipoint_pool3d.cpp",
    "content": "/*\nModified from\nhttps://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp\nPoint cloud feature pooling\nWritten by Shaoshuai Shi\nAll Rights Reserved 2018.\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,\n                                  int feature_in_len, int sampled_pts_num,\n                                  const Tensor xyz, const Tensor boxes3d,\n                                  const Tensor pts_feature,\n                                  Tensor pooled_features,\n                                  Tensor pooled_empty_flag) {\n  DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,\n                       boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,\n                       pts_feature, pooled_features, pooled_empty_flag);\n}\n\nvoid roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,\n                             Tensor pooled_features, Tensor pooled_empty_flag) {\n  // params xyz: (B, N, 3)\n  // params boxes3d: (B, M, 7)\n  // params pts_feature: (B, N, C)\n  // params pooled_features: (B, M, 512, 3+C)\n  // params pooled_empty_flag: (B, M)\n  int batch_size = xyz.size(0);\n  int pts_num = xyz.size(1);\n  int boxes_num = boxes3d.size(1);\n  int feature_in_len = pts_feature.size(2);\n  int sampled_pts_num = pooled_features.size(2);\n\n  roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len,\n                               sampled_pts_num, xyz, boxes3d, pts_feature,\n                               pooled_features, pooled_empty_flag);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/rotated_feature_align.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n// Modified from\n// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_cuda.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid rotated_feature_align_forward_impl(const Tensor features,\n                                        const Tensor best_bboxes,\n                                        const float spatial_scale,\n                                        const int points, Tensor output) {\n  DISPATCH_DEVICE_IMPL(rotated_feature_align_forward_impl, features,\n                       best_bboxes, spatial_scale, points, output);\n}\n\nvoid rotated_feature_align_backward_impl(const Tensor top_grad,\n                                         const Tensor best_bboxes,\n                                         const float spatial_scale,\n                                         const int points, Tensor bottom_grad) {\n  DISPATCH_DEVICE_IMPL(rotated_feature_align_backward_impl, top_grad,\n                       best_bboxes, spatial_scale, points, bottom_grad);\n}\n\nvoid rotated_feature_align_forward(const Tensor features,\n                                   const Tensor best_bboxes, Tensor output,\n                                   const float spatial_scale,\n                                   const int points) {\n  rotated_feature_align_forward_impl(features, best_bboxes, spatial_scale,\n                                     points, output);\n}\n\nvoid rotated_feature_align_backward(const Tensor top_grad,\n                                    const Tensor best_bboxes,\n                                    Tensor bottom_grad,\n                                    const float spatial_scale,\n                                    const int points) {\n  rotated_feature_align_backward_impl(top_grad, best_bboxes, spatial_scale,\n                                      points, bottom_grad);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/scatter_points.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntypedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const reduce_t reduce_type) {\n  return DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, feats, coors,\n                              reduce_type);\n}\n\nvoid dynamic_point_to_voxel_backward_impl(\n    torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,\n    const torch::Tensor &feats, const torch::Tensor &reduced_feats,\n    const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,\n    const reduce_t reduce_type) {\n  DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, grad_feats,\n                       grad_reduced_feats, feats, reduced_feats, coors_idx,\n                       reduce_count, reduce_type);\n}\n\ninline reduce_t convert_reduce_type(const std::string &reduce_type) {\n  if (reduce_type == \"max\")\n    return reduce_t::MAX;\n  else if (reduce_type == \"sum\")\n    return reduce_t::SUM;\n  else if (reduce_type == \"mean\")\n    return reduce_t::MEAN;\n  else\n    TORCH_CHECK(false, \"do not support reduce type \" + reduce_type)\n  return reduce_t::SUM;\n}\n\nstd::vector<torch::Tensor> dynamic_point_to_voxel_forward(\n    const torch::Tensor &feats, const torch::Tensor &coors,\n    const std::string &reduce_type) {\n  return dynamic_point_to_voxel_forward_impl(feats, coors,\n                                             convert_reduce_type(reduce_type));\n}\n\nvoid dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,\n                                     const torch::Tensor &grad_reduced_feats,\n                                     const torch::Tensor &feats,\n                                     const torch::Tensor &reduced_feats,\n                                     const 
torch::Tensor &coors_idx,\n                                     const torch::Tensor &reduce_count,\n                                     const std::string &reduce_type) {\n  dynamic_point_to_voxel_backward_impl(grad_feats, grad_reduced_feats, feats,\n                                       reduced_feats, coors_idx, reduce_count,\n                                       convert_reduce_type(reduce_type));\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/sparse_pool_ops.cpp",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor indice_maxpool_forward_impl(torch::Tensor features,\n                                          torch::Tensor indicePairs,\n                                          torch::Tensor indiceNum,\n                                          int64_t numAct) {\n  return DISPATCH_DEVICE_IMPL(indice_maxpool_forward_impl, features,\n                              indicePairs, indiceNum, numAct);\n}\n\ntorch::Tensor indice_maxpool_forward(torch::Tensor features,\n                                     torch::Tensor indicePairs,\n                                     torch::Tensor indiceNum, int64_t numAct) {\n  return indice_maxpool_forward_impl(features, indicePairs, indiceNum, numAct);\n}\n\ntorch::Tensor indice_maxpool_backward_impl(torch::Tensor features,\n                                           torch::Tensor outFeatures,\n                                           torch::Tensor outGrad,\n                                           torch::Tensor indicePairs,\n                                           torch::Tensor indiceNum) {\n  return DISPATCH_DEVICE_IMPL(indice_maxpool_backward_impl, features,\n                              outFeatures, outGrad, indicePairs, indiceNum);\n}\n\ntorch::Tensor indice_maxpool_backward(torch::Tensor features,\n                                    
  torch::Tensor outFeatures,\n                                      torch::Tensor outGrad,\n                                      torch::Tensor indicePairs,\n                                      torch::Tensor indiceNum) {\n  return indice_maxpool_backward_impl(features, outFeatures, outGrad,\n                                      indicePairs, indiceNum);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/spconv_ops.cpp",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_forward_cuda(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  return GetIndicePairsForwardCUDAKernelLauncher<NDim>(\n      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,\n      padding, dilation, outPadding, _subM, _transpose);\n};\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsForwardMLUKernelLauncher(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> 
kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_forward_mlu(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  return GetIndicePairsForwardMLUKernelLauncher<NDim>(\n      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,\n      padding, dilation, outPadding, _subM, _transpose);\n}\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsForwardMUSAKernelLauncher(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_forward_musa(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  return GetIndicePairsForwardMUSAKernelLauncher<NDim>(\n      indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,\n      padding, dilation, outPadding, _subM, _transpose);\n};\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t 
batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_backward_cuda(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  return GetIndicePairsBackwardCUDAKernelLauncher<NDim>(\n      indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,\n      stride, padding, dilation, outPadding, _subM, _transpose);\n};\n\n#ifdef MMCV_WITH_MUSA\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> GetIndicePairsBackwardMUSAKernelLauncher(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_backward_musa(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  return GetIndicePairsBackwardMUSAKernelLauncher<NDim>(\n      indices, gridOut, batchSize, outSpatialShape, spatialShape, 
kernelSize,\n      stride, padding, dilation, outPadding, _subM, _transpose);\n};\n#endif\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_forward(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  if (indices.device().is_cuda()) {\n#ifdef MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(indices);\n\n    return get_indice_pairs_forward_cuda<NDim>(\n        indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,\n        padding, dilation, outPadding, _subM, _transpose);\n#else\n    AT_ERROR(\"get_indice_pairs is not compiled with GPU support\");\n#endif\n#ifdef MMCV_WITH_MLU\n  } else if (indices.device().type() == at::kMLU) {\n    return get_indice_pairs_forward_mlu<NDim>(\n        indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,\n        padding, dilation, outPadding, _subM, _transpose);\n#endif\n#ifdef MMCV_WITH_MUSA\n  } else if (indices.device().type() == at::kMUSA) {\n    return get_indice_pairs_forward_musa<NDim>(\n        indices, batchSize, outSpatialShape, spatialShape, kernelSize, stride,\n        padding, dilation, outPadding, _subM, _transpose);\n#endif\n  } else {\n    AT_ERROR(\"get_indice_pairs is not implemented on CPU\");\n  }\n}\n\ntemplate <unsigned NDim>\nstd::vector<torch::Tensor> get_indice_pairs_backward(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {\n  if (indices.device().is_cuda()) {\n#ifdef 
MMCV_WITH_CUDA\n    CHECK_CUDA_INPUT(indices);\n    CHECK_CUDA_INPUT(gridOut);\n\n    return get_indice_pairs_backward_cuda<NDim>(\n        indices, gridOut, batchSize, outSpatialShape, spatialShape, kernelSize,\n        stride, padding, dilation, outPadding, _subM, _transpose);\n#else\n    AT_ERROR(\"get_indice_pairs is not compiled with GPU support\");\n#endif\n  } else {\n#ifdef MMCV_WITH_MUSA\n    if (indices.device().type() == at::kMUSA) {\n      CHECK_MUSA_INPUT(indices);\n      CHECK_MUSA_INPUT(gridOut);\n      return get_indice_pairs_backward_musa<NDim>(\n          indices, gridOut, batchSize, outSpatialShape, spatialShape,\n          kernelSize, stride, padding, dilation, outPadding, _subM, _transpose);\n    }\n#endif\n    AT_ERROR(\"get_indice_pairs is not implemented on CPU\");\n  }\n}\n\ntorch::Tensor indice_conv_forward_impl(torch::Tensor features,\n                                       torch::Tensor filters,\n                                       torch::Tensor indicePairs,\n                                       torch::Tensor indiceNum,\n                                       int64_t numActOut, int64_t _inverse,\n                                       int64_t _subM) {\n  return DISPATCH_DEVICE_IMPL(indice_conv_forward_impl, features, filters,\n                              indicePairs, indiceNum, numActOut, _inverse,\n                              _subM);\n}\n\ntorch::Tensor indice_conv_forward(torch::Tensor features, torch::Tensor filters,\n                                  torch::Tensor indicePairs,\n                                  torch::Tensor indiceNum, int64_t numActOut,\n                                  int64_t _inverse, int64_t _subM) {\n  return indice_conv_forward_impl(features, filters, indicePairs, indiceNum,\n                                  numActOut, _inverse, _subM);\n}\n\nstd::vector<torch::Tensor> indice_conv_backward_impl(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor 
indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  return DISPATCH_DEVICE_IMPL(indice_conv_backward_impl, features, filters,\n                              outGrad, indicePairs, indiceNum, _inverse, _subM);\n}\n\nstd::vector<torch::Tensor> indice_conv_backward(\n    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,\n    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,\n    int64_t _subM) {\n  return indice_conv_backward_impl(features, filters, outGrad, indicePairs,\n                                   indiceNum, _inverse, _subM);\n}\n\ntemplate std::vector<torch::Tensor> get_indice_pairs_forward<2>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> get_indice_pairs_forward<3>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> get_indice_pairs_forward<4>(\n    torch::Tensor indices, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> get_indice_pairs_backward<2>(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, 
std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n\ntemplate std::vector<torch::Tensor> get_indice_pairs_backward<3>(\n    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,\n    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,\n    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,\n    std::vector<int64_t> padding, std::vector<int64_t> dilation,\n    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/spconv_utils.h",
    "content": "// Copyright 2019 Yan Yan\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#pragma once\n#include <ATen/ATen.h>\n#ifdef MMCV_WITH_MUSA\n#include \"pytorch_musa_helper.hpp\"\n#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"\n#else\n#include <ATen/cuda/CUDAContext.h>\n\n#include \"pytorch_cuda_helper.hpp\"\n#endif\n#include <torch/script.h>\n#include <utils/spconv/tensorview/tensorview.h>\n\nnamespace tv {\n#ifdef MMCV_WITH_MUSA\nstruct GPU {\n  GPU(musaStream_t s = 0) : mStream(s) {}\n  virtual musaStream_t getStream() const { return mStream; }\n  musaStream_t mStream = 0;\n};\n\nstruct TorchGPU : public tv::GPU {\n  virtual musaStream_t getStream() const override {\n    return at::musa::getCurrentMUSAStream();\n  }\n};\n\n#else\nstruct GPU {\n  GPU(cudaStream_t s = 0) : mStream(s) {}\n  virtual cudaStream_t getStream() const { return mStream; }\n  cudaStream_t mStream = 0;\n};\n\nstruct TorchGPU : public tv::GPU {\n  virtual cudaStream_t getStream() const override {\n    return at::cuda::getCurrentCUDAStream();\n  }\n};\n#endif\n\ntemplate <typename scalar_t>\nvoid check_torch_dtype(const torch::Tensor &tensor) {\n  switch (tensor.scalar_type()) {\n    case at::ScalarType::Double: {\n      auto val = std::is_same<std::remove_const_t<scalar_t>, double>::value;\n      TV_ASSERT_RT_ERR(val, \"error\");\n      break;\n    }\n    case at::ScalarType::Float: {\n      auto val = std::is_same<std::remove_const_t<scalar_t>, 
float>::value;\n      TV_ASSERT_RT_ERR(val, \"error\");\n      break;\n    }\n    case at::ScalarType::Int: {\n      auto val = std::is_same<std::remove_const_t<scalar_t>, int>::value;\n      TV_ASSERT_RT_ERR(val, \"error\");\n      break;\n    }\n    case at::ScalarType::Half: {\n      auto val = std::is_same<std::remove_const_t<scalar_t>, at::Half>::value;\n      TV_ASSERT_RT_ERR(val, \"error\");\n      break;\n    }\n    case at::ScalarType::Long: {\n      auto val = std::is_same<std::remove_const_t<scalar_t>, long>::value;\n      TV_ASSERT_RT_ERR(val, \"error\");\n      break;\n    }\n    default:\n      TV_ASSERT_RT_ERR(false, \"error\");\n  }\n}\n\ntemplate <typename scalar_t>\ntv::TensorView<scalar_t> torch2tv(const torch::Tensor &tensor) {\n  check_torch_dtype<scalar_t>(tensor);\n  tv::Shape shape;\n  for (auto i : tensor.sizes()) {\n    shape.push_back(i);\n  }\n  return tv::TensorView<scalar_t>(\n      tensor.data_ptr<std::remove_const_t<scalar_t>>(), shape);\n}\n}  // namespace tv\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/sync_bn.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {\n  DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);\n}\n\nvoid sync_bn_forward_var_impl(const Tensor input, const Tensor mean,\n                              Tensor var) {\n  DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);\n}\n\nvoid sync_bn_forward_output_impl(const Tensor input, const Tensor mean,\n                                 const Tensor var, Tensor running_mean,\n                                 Tensor running_var, const Tensor weight,\n                                 const Tensor bias, Tensor norm, Tensor std,\n                                 Tensor output, float eps, float momentum,\n                                 int group_size) {\n  DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,\n                       running_mean, running_var, weight, bias, norm, std,\n                       output, eps, momentum, group_size);\n}\n\nvoid sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,\n                                 Tensor grad_weight, Tensor grad_bias) {\n  DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,\n                       grad_weight, grad_bias);\n}\n\nvoid sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,\n                                const Tensor grad_weight,\n                                const Tensor grad_bias, const Tensor norm,\n                                const Tensor std, Tensor grad_input) {\n  DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,\n                       grad_weight, grad_bias, norm, std, grad_input);\n}\n\nvoid sync_bn_forward_mean(const Tensor input, Tensor mean) {\n  sync_bn_forward_mean_impl(input, mean);\n}\n\nvoid sync_bn_forward_var(const Tensor input, const Tensor 
mean, Tensor var) {\n  sync_bn_forward_var_impl(input, mean, var);\n}\n\nvoid sync_bn_forward_output(const Tensor input, const Tensor mean,\n                            const Tensor var, const Tensor weight,\n                            const Tensor bias, Tensor running_mean,\n                            Tensor running_var, Tensor norm, Tensor std,\n                            Tensor output, float eps, float momentum,\n                            int group_size) {\n  sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,\n                              weight, bias, norm, std, output, eps, momentum,\n                              group_size);\n}\n\nvoid sync_bn_backward_param(const Tensor grad_output, const Tensor norm,\n                            Tensor grad_weight, Tensor grad_bias) {\n  sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);\n}\n\nvoid sync_bn_backward_data(const Tensor grad_output, const Tensor weight,\n                           const Tensor grad_weight, const Tensor grad_bias,\n                           const Tensor norm, const Tensor std,\n                           Tensor grad_input) {\n  sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,\n                             std, grad_input);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/three_interpolate.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid three_interpolate_forward_impl(int b, int c, int m, int n,\n                                    const Tensor points, const Tensor idx,\n                                    const Tensor weight, Tensor out) {\n  DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,\n                       weight, out);\n}\n\nvoid three_interpolate_backward_impl(int b, int c, int n, int m,\n                                     const Tensor grad_out, const Tensor idx,\n                                     const Tensor weight, Tensor grad_points) {\n  DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,\n                       idx, weight, grad_points);\n}\n\nvoid three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,\n                               Tensor weight_tensor, Tensor out_tensor, int b,\n                               int c, int m, int n) {\n  three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,\n                                 weight_tensor, out_tensor);\n}\n\nvoid three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,\n                                Tensor weight_tensor, Tensor grad_points_tensor,\n                                int b, int c, int n, int m) {\n  three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,\n                                  weight_tensor, grad_points_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/three_nn.cpp",
    "content": "// Modified from\n// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid three_nn_forward_impl(int b, int n, int m, const Tensor unknown,\n                           const Tensor known, Tensor dist2, Tensor idx) {\n  DISPATCH_DEVICE_IMPL(three_nn_forward_impl, b, n, m, unknown, known, dist2,\n                       idx);\n}\n\nvoid three_nn_forward(Tensor unknown_tensor, Tensor known_tensor,\n                      Tensor dist2_tensor, Tensor idx_tensor, int b, int n,\n                      int m) {\n  three_nn_forward_impl(b, n, m, unknown_tensor, known_tensor, dist2_tensor,\n                        idx_tensor);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/tin_shift.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\nvoid tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output) {\n  DISPATCH_DEVICE_IMPL(tin_shift_forward_impl, input, shift, output);\n}\n\nvoid tin_shift_backward_impl(Tensor grad_output, Tensor shift,\n                             Tensor grad_input) {\n  DISPATCH_DEVICE_IMPL(tin_shift_backward_impl, grad_output, shift, grad_input);\n}\n\nvoid tin_shift_forward(Tensor input, Tensor shift, Tensor output) {\n  tin_shift_forward_impl(input, shift, output);\n}\n\nvoid tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) {\n  tin_shift_backward_impl(grad_output, shift, grad_input);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/upfirdn2d.cpp",
    "content": "// Modified from\n// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.cpp\n\n/*\nCopyright (c) 2021, NVIDIA Corporation. All rights reserved.\n\nNVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator\nAugmentation (ADA)\n=======================================================================\n\n1. Definitions\n\n\"Licensor\" means any person or entity that distributes its Work.\n\n\"Software\" means the original work of authorship made available under\nthis License.\n\n\"Work\" means the Software and any additions to or derivative works of\nthe Software that are made available under this License.\n\nThe terms \"reproduce,\" \"reproduction,\" \"derivative works,\" and\n\"distribution\" have the meaning as provided under U.S. copyright law;\nprovided, however, that for the purposes of this License, derivative\nworks shall not include works that remain separable from, or merely\nlink (or bind by name) to the interfaces of, the Work.\n\nWorks, including the Software, are \"made available\" under this License\nby including in or with the Work either (a) a copyright notice\nreferencing the applicability of this License to the Work, or (b) a\ncopy of this License.\n\n2. License Grants\n\n    2.1 Copyright Grant. Subject to the terms and conditions of this\n    License, each Licensor grants to you a perpetual, worldwide,\n    non-exclusive, royalty-free, copyright license to reproduce,\n    prepare derivative works of, publicly display, publicly perform,\n    sublicense and distribute its Work and any resulting derivative\n    works in any form.\n\n3. Limitations\n\n    3.1 Redistribution. You may reproduce or distribute the Work only\n    if (a) you do so under this License, (b) you include a complete\n    copy of this License with your distribution, and (c) you retain\n    without modification any copyright, patent, trademark, or\n    attribution notices that are present in the Work.\n\n    3.2 Derivative Works. 
You may specify that additional or different\n    terms apply to the use, reproduction, and distribution of your\n    derivative works of the Work (\"Your Terms\") only if (a) Your Terms\n    provide that the use limitation in Section 3.3 applies to your\n    derivative works, and (b) you identify the specific derivative\n    works that are subject to Your Terms. Notwithstanding Your Terms,\n    this License (including the redistribution requirements in Section\n    3.1) will continue to apply to the Work itself.\n\n    3.3 Use Limitation. The Work and any derivative works thereof only\n    may be used or intended for use non-commercially. Notwithstanding\n    the foregoing, NVIDIA and its affiliates may use the Work and any\n    derivative works commercially. As used herein, \"non-commercially\"\n    means for research or evaluation purposes only.\n\n    3.4 Patent Claims. If you bring or threaten to bring a patent claim\n    against any Licensor (including any claim, cross-claim or\n    counterclaim in a lawsuit) to enforce any patents that you allege\n    are infringed by any Work, then your rights under this License from\n    such Licensor (including the grant in Section 2.1) will terminate\n    immediately.\n\n    3.5 Trademarks. This License does not grant any rights to use any\n    Licensor’s or its affiliates’ names, logos, or trademarks, except\n    as necessary to reproduce the notices described in this License.\n\n    3.6 Termination. If you violate any term of this License, then your\n    rights under this License (including the grant in Section 2.1) will\n    terminate immediately.\n\n4. Disclaimer of Warranty.\n\nTHE WORK IS PROVIDED \"AS IS\" WITHOUT WARRANTIES OR CONDITIONS OF ANY\nKIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF\nMERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR\nNON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER\nTHIS LICENSE.\n\n5. 
Limitation of Liability.\n\nEXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL\nTHEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE\nSHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,\nINDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF\nOR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK\n(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,\nLOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER\nCOMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF\nTHE POSSIBILITY OF SUCH DAMAGES.\n\n=======================================================================\n*/\n\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n\ntorch::Tensor upfirdn2d_op_impl(torch::Tensor input, torch::Tensor filter,\n                                int upx, int upy, int downx, int downy,\n                                int padx0, int padx1, int pady0, int pady1,\n                                bool flip, float gain) {\n  return DISPATCH_DEVICE_IMPL(upfirdn2d_op_impl, input, filter, upx, upy, downx,\n                              downy, padx0, padx1, pady0, pady1, flip, gain);\n}\n\ntorch::Tensor upfirdn2d(torch::Tensor input, torch::Tensor filter, int upx,\n                        int upy, int downx, int downy, int padx0, int padx1,\n                        int pady0, int pady1, bool flip, float gain) {\n  return upfirdn2d_op_impl(input, filter, upx, upy, downx, downy, padx0, padx1,\n                           pady0, pady1, flip, gain);\n}\n"
  },
  {
    "path": "mmcv/ops/csrc/pytorch/voxelization.cpp",
    "content": "// Copyright (c) OpenMMLab. All rights reserved.\n#include \"pytorch_cpp_helper.hpp\"\n#include \"pytorch_device_registry.hpp\"\n#ifdef MMCV_WITH_DIOPI\n#include <diopi/diopirt.h>\n#include <diopi/functions.h>\n#include <diopi/functions_mmcv.h>\n#include <torch/csrc/utils/pybind.h>\n\n#include \"csrc_dipu/diopirt/diopirt_impl.h\"\n#include \"csrc_dipu/runtime/device/deviceapis.h\"\n#include \"csrc_dipu/utils/helpfunc.hpp\"\n\nusing dipu::VENDOR_TYPE;\nusing dipu::diopi_helper::toDiopiScalar;\nusing dipu::diopi_helper::toDiopiTensorHandle;\n#endif\n\nint hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,\n                               at::Tensor &coors,\n                               at::Tensor &num_points_per_voxel,\n                               const std::vector<float> voxel_size,\n                               const std::vector<float> coors_range,\n                               const int max_points, const int max_voxels,\n                               const int NDim = 3) {\n  return DISPATCH_DEVICE_IMPL(hard_voxelize_forward_impl, points, voxels, coors,\n                              num_points_per_voxel, voxel_size, coors_range,\n                              max_points, max_voxels, NDim);\n}\n\nint nondeterministic_hard_voxelize_forward_impl(\n    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,\n    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,\n    const std::vector<float> coors_range, const int max_points,\n    const int max_voxels, const int NDim = 3) {\n  return DISPATCH_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl,\n                              points, voxels, coors, num_points_per_voxel,\n                              voxel_size, coors_range, max_points, max_voxels,\n                              NDim);\n}\n\nvoid dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,\n                                   const std::vector<float> 
voxel_size,\n                                   const std::vector<float> coors_range,\n                                   const int NDim = 3) {\n  DISPATCH_DEVICE_IMPL(dynamic_voxelize_forward_impl, points, coors, voxel_size,\n                       coors_range, NDim);\n}\n\n#ifdef MMCV_WITH_DIOPI\nvoid hard_voxelize_forward_diopi(const at::Tensor &points,\n                                 const at::Tensor &voxel_size,\n                                 const at::Tensor &coors_range,\n                                 at::Tensor &voxels, at::Tensor &coors,\n                                 at::Tensor &num_points_per_voxel,\n                                 at::Tensor &voxel_num, const int max_points,\n                                 const int max_voxels, const int NDim = 3,\n                                 const bool deterministic = true) {\n  auto points_p = toDiopiTensorHandle(points);\n  diopiDevice_t device;\n  diopiGetTensorDevice(points_p, &device);\n  if (device == diopi_host) {\n    int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();\n    std::vector<float> voxel_size_v(\n        voxel_size.data_ptr<float>(),\n        voxel_size.data_ptr<float>() + voxel_size.numel());\n    std::vector<float> coors_range_v(\n        coors_range.data_ptr<float>(),\n        coors_range.data_ptr<float>() + coors_range.numel());\n\n    if (deterministic) {\n      *voxel_num_data = hard_voxelize_forward_impl(\n          points, voxels, coors, num_points_per_voxel, voxel_size_v,\n          coors_range_v, max_points, max_voxels, NDim);\n    } else {\n      TORCH_CHECK(\n          deterministic,\n          \"nondeterministic hard_voxelize_forward is not supported on host!\");\n    }\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto voxel_size_p = toDiopiTensorHandle(voxel_size);\n  auto coors_range_p = toDiopiTensorHandle(coors_range);\n  auto voxels_p = toDiopiTensorHandle(voxels);\n  auto coors_p = 
toDiopiTensorHandle(coors);\n  auto num_points_per_voxel_p = toDiopiTensorHandle(num_points_per_voxel);\n  auto voxel_num_p = toDiopiTensorHandle(voxel_num);\n  if (reinterpret_cast<void *>(diopiHardVoxelizeMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiHardVoxelizeMmcv(\n          ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,\n          voxel_size_p, coors_range_p, max_points, max_voxels, NDim,\n          deterministic);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiHardVoxelizeMmcv(\n          ch, voxels_p, coors_p, num_points_per_voxel_p, voxel_num_p, points_p,\n          voxel_size_p, coors_range_p, max_points, max_voxels, NDim,\n          deterministic);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op hard_voxelize_forward\";\n  auto points_cpu = points.cpu();\n  auto voxel_size_cpu = voxel_size.cpu();\n  auto coors_range_cpu = coors_range.cpu();\n  auto voxels_cpu = voxels.cpu();\n  auto coors_cpu = coors.cpu();\n  auto num_points_per_voxel_cpu = num_points_per_voxel.cpu();\n  auto voxel_num_cpu = voxel_num.cpu();\n\n  int64_t *voxel_num_data_cpu = voxel_num_cpu.data_ptr<int64_t>();\n  std::vector<float> voxel_size_v_cpu(\n      voxel_size_cpu.data_ptr<float>(),\n      voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());\n  std::vector<float> coors_range_v_cpu(\n      coors_range_cpu.data_ptr<float>(),\n      coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());\n\n  if (deterministic) {\n    *voxel_num_data_cpu = hard_voxelize_forward_impl(\n        points_cpu, voxels_cpu, coors_cpu, num_points_per_voxel_cpu,\n        voxel_size_v_cpu, coors_range_v_cpu, max_points, max_voxels, NDim);\n  } else {\n    puts(\"nondeterministic hard_voxelize_forward is not supported on host!\");\n    abort();\n  }\n  voxels.copy_(voxels_cpu);\n  
coors.copy_(coors_cpu);\n  num_points_per_voxel.copy_(num_points_per_voxel_cpu);\n  voxel_num.copy_(voxel_num_cpu);\n  return;\n}\n\nvoid dynamic_voxelize_forward_diopi(const at::Tensor &points,\n                                    const at::Tensor &voxel_size,\n                                    const at::Tensor &coors_range,\n                                    at::Tensor &coors, const int NDim = 3) {\n  auto points_p = toDiopiTensorHandle(points);\n  diopiDevice_t device;\n  diopiGetTensorDevice(points_p, &device);\n  if (device == diopi_host) {\n    std::vector<float> voxel_size_v(\n        voxel_size.data_ptr<float>(),\n        voxel_size.data_ptr<float>() + voxel_size.numel());\n    std::vector<float> coors_range_v(\n        coors_range.data_ptr<float>(),\n        coors_range.data_ptr<float>() + coors_range.numel());\n    dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,\n                                  NDim);\n    return;\n  }\n  diopiContext ctx(dipu::getCurrentDIPUStream().rawstream());\n  diopiContextHandle_t ch = &ctx;\n  auto voxel_size_p = toDiopiTensorHandle(voxel_size);\n  auto coors_range_p = toDiopiTensorHandle(coors_range);\n  auto coors_p = toDiopiTensorHandle(coors);\n  if (reinterpret_cast<void *>(diopiDynamicVoxelizeMmcv) != nullptr) {\n    if (strcmp(dipu::VendorTypeToStr(VENDOR_TYPE), \"NPU\") == 0) {\n      pybind11::gil_scoped_release no_gil;\n      auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,\n                                          coors_range_p, NDim);\n      if (ret == diopiSuccess) return;\n    } else {\n      auto ret = diopiDynamicVoxelizeMmcv(ch, coors_p, points_p, voxel_size_p,\n                                          coors_range_p, NDim);\n      if (ret == diopiSuccess) return;\n    }\n  }\n  LOG(WARNING) << \"Fallback to cpu: mmcv ext op dynamic_voxelize_forward\";\n  auto points_cpu = points.cpu();\n  auto voxel_size_cpu = voxel_size.cpu();\n  auto coors_range_cpu = 
coors_range.cpu();\n  auto coors_cpu = coors.cpu();\n\n  std::vector<float> voxel_size_v_cpu(\n      voxel_size_cpu.data_ptr<float>(),\n      voxel_size_cpu.data_ptr<float>() + voxel_size_cpu.numel());\n  std::vector<float> coors_range_v_cpu(\n      coors_range_cpu.data_ptr<float>(),\n      coors_range_cpu.data_ptr<float>() + coors_range_cpu.numel());\n  dynamic_voxelize_forward_impl(points_cpu, coors_cpu, voxel_size_v_cpu,\n                                coors_range_v_cpu, NDim);\n  coors.copy_(coors_cpu);\n  return;\n}\n#endif\n\nvoid hard_voxelize_forward(const at::Tensor &points,\n                           const at::Tensor &voxel_size,\n                           const at::Tensor &coors_range, at::Tensor &voxels,\n                           at::Tensor &coors, at::Tensor &num_points_per_voxel,\n                           at::Tensor &voxel_num, const int max_points,\n                           const int max_voxels, const int NDim = 3,\n                           const bool deterministic = true) {\n#ifdef MMCV_WITH_DIOPI\n  hard_voxelize_forward_diopi(points, voxel_size, coors_range, voxels, coors,\n                              num_points_per_voxel, voxel_num, max_points,\n                              max_voxels, NDim, deterministic);\n#else\n  int64_t *voxel_num_data = voxel_num.data_ptr<int64_t>();\n  std::vector<float> voxel_size_v(\n      voxel_size.data_ptr<float>(),\n      voxel_size.data_ptr<float>() + voxel_size.numel());\n  std::vector<float> coors_range_v(\n      coors_range.data_ptr<float>(),\n      coors_range.data_ptr<float>() + coors_range.numel());\n\n  if (deterministic) {\n    *voxel_num_data = hard_voxelize_forward_impl(\n        points, voxels, coors, num_points_per_voxel, voxel_size_v,\n        coors_range_v, max_points, max_voxels, NDim);\n  } else {\n    *voxel_num_data = nondeterministic_hard_voxelize_forward_impl(\n        points, voxels, coors, num_points_per_voxel, voxel_size_v,\n        coors_range_v, max_points, max_voxels, NDim);\n 
 }\n#endif\n}\n\nvoid dynamic_voxelize_forward(const at::Tensor &points,\n                              const at::Tensor &voxel_size,\n                              const at::Tensor &coors_range, at::Tensor &coors,\n                              const int NDim = 3) {\n#ifdef MMCV_WITH_DIOPI\n  dynamic_voxelize_forward_diopi(points, voxel_size, coors_range, coors, NDim);\n#else\n  std::vector<float> voxel_size_v(\n      voxel_size.data_ptr<float>(),\n      voxel_size.data_ptr<float>() + voxel_size.numel());\n  std::vector<float> coors_range_v(\n      coors_range.data_ptr<float>(),\n      coors_range.data_ptr<float>() + coors_range.numel());\n  dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v,\n                                NDim);\n#endif\n}\n"
  },
  {
    "path": "mmcv/ops/deform_conv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.logging import print_log\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import deprecated_api_warning\nfrom torch import Tensor\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair, _single\n\nfrom mmcv.utils import IS_MLU_AVAILABLE\nfrom ..utils import ext_loader\nfrom .modulated_deform_conv import ModulatedDeformConv2dFunction\n\next_module = ext_loader.load_ext('_ext', [\n    'deform_conv_forward', 'deform_conv_backward_input',\n    'deform_conv_backward_parameters'\n])\n\n\nclass DeformConv2dFunction(Function):\n\n    @staticmethod\n    def symbolic(g,\n                 input,\n                 offset,\n                 weight,\n                 stride,\n                 padding,\n                 dilation,\n                 groups,\n                 deform_groups,\n                 bias=False,\n                 im2col_step=32):\n        return g.op(\n            'mmcv::MMCVDeformConv2d',\n            input,\n            offset,\n            weight,\n            stride_i=stride,\n            padding_i=padding,\n            dilation_i=dilation,\n            groups_i=groups,\n            deform_groups_i=deform_groups,\n            bias_i=bias,\n            im2col_step_i=im2col_step)\n\n    @staticmethod\n    def _npu_backward(ctx, grad_output):\n        import torch_npu\n        input_tensor, weight, offset_out, offset_all, sort_index_for_npu_bp = \\\n            ctx.saved_tensors\n        grad_input, grad_weight, grad_offset_all, grad_bias = \\\n            torch_npu.npu_deformable_conv2dbk(\n                input_tensor, grad_output, offset_out, weight, offset_all,\n                kernel_size=[weight.shape[2], weight.shape[3]],\n                stride=[1, 1, ctx.stride[0], 
ctx.stride[1]],\n                padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1],\n                         ctx.padding[1]],\n                dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],\n                groups=ctx.groups, deformable_groups=ctx.deform_groups,\n                modulated=True)\n        grad_offset = grad_offset_all.index_select(1, sort_index_for_npu_bp)\n        return grad_input, grad_offset, grad_weight, \\\n            None, None, None, None, None, None, None\n\n    @staticmethod\n    def forward(ctx,\n                input: Tensor,\n                offset: Tensor,\n                weight: Tensor,\n                stride: Union[int, Tuple[int, ...]] = 1,\n                padding: Union[int, Tuple[int, ...]] = 0,\n                dilation: Union[int, Tuple[int, ...]] = 1,\n                groups: int = 1,\n                deform_groups: int = 1,\n                bias: bool = False,\n                im2col_step: int = 32) -> Tensor:\n        if input is not None and input.dim() != 4:\n            raise ValueError(\n                f'Expected 4D tensor as input, got {input.dim()}D tensor \\\n                  instead.')\n        assert bias is False, 'Only support bias is False.'\n        ctx.stride = _pair(stride)\n        ctx.padding = _pair(padding)\n        ctx.dilation = _pair(dilation)\n        ctx.groups = groups\n        ctx.deform_groups = deform_groups\n        ctx.im2col_step = im2col_step\n        ctx.device = input.device.type\n\n        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;\n        # amp won't cast the type of model (float32), but \"offset\" is cast\n        # to float16 by nn.Conv2d automatically, leading to the type\n        # mismatch with input (when it is float32) or weight.\n        # The flag for whether to use fp16 or amp is the type of \"offset\",\n        # we cast weight and input to temporarily support fp16 and amp\n        # whatever the pytorch version is.\n        input = 
input.type_as(offset)\n        weight = weight.type_as(input)\n        if ctx.device == 'npu':\n            mask_shape, _ = torch.chunk(offset, 2, dim=1)\n            mask = torch.ones_like(mask_shape).to(input.device)\n            bias = input.new_empty(0)\n            output = ModulatedDeformConv2dFunction._npu_forward(\n                ctx, input, offset, mask, weight, bias)\n            return output\n        ctx.save_for_backward(input, offset, weight)\n\n        output = input.new_empty([\n            int(i)\n            for i in DeformConv2dFunction._output_size(ctx, input, weight)\n        ])\n\n        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones\n\n        cur_im2col_step = min(ctx.im2col_step, input.size(0))\n        assert (input.size(0) % cur_im2col_step\n                ) == 0, 'batch size must be divisible by im2col_step'\n        ext_module.deform_conv_forward(\n            input,\n            weight,\n            offset,\n            output,\n            ctx.bufs_[0],\n            ctx.bufs_[1],\n            kW=weight.size(3),\n            kH=weight.size(2),\n            dW=ctx.stride[1],\n            dH=ctx.stride[0],\n            padW=ctx.padding[1],\n            padH=ctx.padding[0],\n            dilationW=ctx.dilation[1],\n            dilationH=ctx.dilation[0],\n            group=ctx.groups,\n            deformable_group=ctx.deform_groups,\n            im2col_step=cur_im2col_step)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(\n        ctx, grad_output: Tensor\n    ) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], None,\n               None, None, None, None, None, None]:\n        if ctx.device == 'npu':\n            return DeformConv2dFunction._npu_backward(ctx, grad_output)\n        input, offset, weight = ctx.saved_tensors\n\n        grad_input = grad_offset = grad_weight = None\n\n        cur_im2col_step = min(ctx.im2col_step, input.size(0))\n        assert 
(input.size(0) % cur_im2col_step\n                ) == 0, 'batch size must be divisible by im2col_step'\n\n        grad_output = grad_output.contiguous()\n        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:\n            grad_input = torch.zeros_like(input)\n            grad_offset = torch.zeros_like(offset)\n            ext_module.deform_conv_backward_input(\n                input,\n                offset,\n                grad_output,\n                grad_input,\n                grad_offset,\n                weight,\n                ctx.bufs_[0],\n                kW=weight.size(3),\n                kH=weight.size(2),\n                dW=ctx.stride[1],\n                dH=ctx.stride[0],\n                padW=ctx.padding[1],\n                padH=ctx.padding[0],\n                dilationW=ctx.dilation[1],\n                dilationH=ctx.dilation[0],\n                group=ctx.groups,\n                deformable_group=ctx.deform_groups,\n                im2col_step=cur_im2col_step)\n\n        if ctx.needs_input_grad[2]:\n            grad_weight = torch.zeros_like(weight)\n            ext_module.deform_conv_backward_parameters(\n                input,\n                offset,\n                grad_output,\n                grad_weight,\n                ctx.bufs_[0],\n                ctx.bufs_[1],\n                kW=weight.size(3),\n                kH=weight.size(2),\n                dW=ctx.stride[1],\n                dH=ctx.stride[0],\n                padW=ctx.padding[1],\n                padH=ctx.padding[0],\n                dilationW=ctx.dilation[1],\n                dilationH=ctx.dilation[0],\n                group=ctx.groups,\n                deformable_group=ctx.deform_groups,\n                scale=1,\n                im2col_step=cur_im2col_step)\n\n        return grad_input, grad_offset, grad_weight, \\\n            None, None, None, None, None, None, None\n\n    @staticmethod\n    def _output_size(ctx, input, weight):\n        channels = 
weight.size(0)\n        output_size = (input.size(0), channels)\n        for d in range(input.dim() - 2):\n            in_size = input.size(d + 2)\n            pad = ctx.padding[d]\n            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1\n            stride_ = ctx.stride[d]\n            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )\n        if not all(map(lambda s: s > 0, output_size)):\n            raise ValueError(\n                'convolution input is too small (output would be ' +\n                'x'.join(map(str, output_size)) + ')')\n        return output_size\n\n\ndeform_conv2d = DeformConv2dFunction.apply\n\n\nclass DeformConv2d(nn.Module):\n    r\"\"\"Deformable 2D convolution.\n\n    Applies a deformable 2D convolution over an input signal composed of\n    several input planes. DeformConv2d was described in the paper\n    `Deformable Convolutional Networks\n    <https://arxiv.org/pdf/1703.06211.pdf>`_\n\n    Note:\n        The argument ``im2col_step`` was added in version 1.3.17, which means\n        number of samples processed by the ``im2col_cuda_kernel`` per call.\n        It enables users to define ``batch_size`` and ``im2col_step`` more\n        flexibly and solved `issue mmcv#1440\n        <https://github.com/open-mmlab/mmcv/issues/1440>`_.\n\n    Args:\n        in_channels (int): Number of channels in the input image.\n        out_channels (int): Number of channels produced by the convolution.\n        kernel_size(int, tuple): Size of the convolving kernel.\n        stride(int, tuple): Stride of the convolution. Default: 1.\n        padding (int or tuple): Zero-padding added to both sides of the input.\n            Default: 0.\n        dilation (int or tuple): Spacing between kernel elements. Default: 1.\n        groups (int): Number of blocked connections from input.\n            channels to output channels. 
Default: 1.\n        deform_groups (int): Number of deformable group partitions.\n        bias (bool): If True, adds a learnable bias to the output.\n            Default: False.\n        im2col_step (int): Number of samples processed by im2col_cuda_kernel\n            per call. It will work when ``batch_size`` > ``im2col_step``, but\n            ``batch_size`` must be divisible by ``im2col_step``. Default: 32.\n            `New in version 1.3.17.`\n    \"\"\"\n\n    @deprecated_api_warning({'deformable_groups': 'deform_groups'},\n                            cls_name='DeformConv2d')\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, ...]],\n                 stride: Union[int, Tuple[int, ...]] = 1,\n                 padding: Union[int, Tuple[int, ...]] = 0,\n                 dilation: Union[int, Tuple[int, ...]] = 1,\n                 groups: int = 1,\n                 deform_groups: int = 1,\n                 bias: bool = False,\n                 im2col_step: int = 32) -> None:\n        super().__init__()\n\n        assert not bias, \\\n            f'bias={bias} is not supported in DeformConv2d.'\n        assert in_channels % groups == 0, \\\n            f'in_channels {in_channels} cannot be divisible by groups {groups}'\n        assert out_channels % groups == 0, \\\n            f'out_channels {out_channels} cannot be divisible by groups \\\n              {groups}'\n\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.kernel_size = _pair(kernel_size)\n        self.stride = _pair(stride)\n        self.padding = _pair(padding)\n        self.dilation = _pair(dilation)\n        self.groups = groups\n        self.deform_groups = deform_groups\n        self.im2col_step = im2col_step\n        # enable compatibility with nn.Conv2d\n        self.transposed = False\n        self.output_padding = _single(0)\n\n        # only weight, no 
bias\n        self.weight = nn.Parameter(\n            torch.Tensor(out_channels, in_channels // self.groups,\n                         *self.kernel_size))\n\n        self.reset_parameters()\n\n    def reset_parameters(self):\n        # switch the initialization of `self.weight` to the standard kaiming\n        # method described in `Delving deep into rectifiers: Surpassing\n        # human-level performance on ImageNet classification` - He, K. et al.\n        # (2015), using a uniform distribution\n        nn.init.kaiming_uniform_(self.weight, nonlinearity='relu')\n\n    def forward(self, x: Tensor, offset: Tensor) -> Tensor:\n        \"\"\"Deformable Convolutional forward function.\n\n        Args:\n            x (Tensor): Input feature, shape (B, C_in, H_in, W_in)\n            offset (Tensor): Offset for deformable convolution, shape\n                (B, deform_groups*kernel_size[0]*kernel_size[1]*2,\n                H_out, W_out), H_out, W_out are equal to the output's.\n\n                An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.\n                The spatial arrangement is like:\n\n                .. 
code:: text\n\n                    (x0, y0) (x1, y1) (x2, y2)\n                    (x3, y3) (x4, y4) (x5, y5)\n                    (x6, y6) (x7, y7) (x8, y8)\n\n        Returns:\n            Tensor: Output of the layer.\n        \"\"\"\n        # To fix an assert error in deform_conv_cuda.cpp:128\n        # input image is smaller than kernel\n        input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) <\n                                                          self.kernel_size[1])\n        if input_pad:\n            pad_h = max(self.kernel_size[0] - x.size(2), 0)\n            pad_w = max(self.kernel_size[1] - x.size(3), 0)\n            x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()\n            offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0)\n            offset = offset.contiguous()\n        out = deform_conv2d(x, offset, self.weight, self.stride, self.padding,\n                            self.dilation, self.groups, self.deform_groups,\n                            False, self.im2col_step)\n        if input_pad:\n            out = out[:, :, :out.size(2) - pad_h, :out.size(3) -\n                      pad_w].contiguous()\n        return out\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(in_channels={self.in_channels},\\n'\n        s += f'out_channels={self.out_channels},\\n'\n        s += f'kernel_size={self.kernel_size},\\n'\n        s += f'stride={self.stride},\\n'\n        s += f'padding={self.padding},\\n'\n        s += f'dilation={self.dilation},\\n'\n        s += f'groups={self.groups},\\n'\n        s += f'deform_groups={self.deform_groups},\\n'\n        # bias is not supported in DeformConv2d.\n        s += 'bias=False)'\n        return s\n\n\n@MODELS.register_module('DCN')\nclass DeformConv2dPack(DeformConv2d):\n    \"\"\"A Deformable Conv Encapsulation that acts as normal Conv layers.\n\n    The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.\n    The spatial arrangement is 
like:\n\n    .. code:: text\n\n        (x0, y0) (x1, y1) (x2, y2)\n        (x3, y3) (x4, y4) (x5, y5)\n        (x6, y6) (x7, y7) (x8, y8)\n\n    Args:\n        in_channels (int): Same as nn.Conv2d.\n        out_channels (int): Same as nn.Conv2d.\n        kernel_size (int or tuple[int]): Same as nn.Conv2d.\n        stride (int or tuple[int]): Same as nn.Conv2d.\n        padding (int or tuple[int]): Same as nn.Conv2d.\n        dilation (int or tuple[int]): Same as nn.Conv2d.\n        groups (int): Same as nn.Conv2d.\n        bias (bool or str): If specified as `auto`, it will be decided by the\n            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise\n            False.\n    \"\"\"\n\n    _version = 2\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.conv_offset = nn.Conv2d(\n            self.in_channels,\n            self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1],\n            kernel_size=self.kernel_size,\n            stride=_pair(self.stride),\n            padding=_pair(self.padding),\n            dilation=_pair(self.dilation),\n            bias=True)\n        self.init_offset()\n\n    def init_offset(self):\n        self.conv_offset.weight.data.zero_()\n        self.conv_offset.bias.data.zero_()\n\n    def forward(self, x: Tensor) -> Tensor:  # type: ignore\n        offset = self.conv_offset(x)\n        return deform_conv2d(x, offset, self.weight, self.stride, self.padding,\n                             self.dilation, self.groups, self.deform_groups,\n                             False, self.im2col_step)\n\n    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,\n                              missing_keys, unexpected_keys, error_msgs):\n        version = local_metadata.get('version', None)\n\n        if version is None or version < 2:\n            # the key is different in early versions\n            # In version < 2, DeformConvPack loads previous 
benchmark models.\n            if (prefix + 'conv_offset.weight' not in state_dict\n                    and prefix[:-1] + '_offset.weight' in state_dict):\n                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(\n                    prefix[:-1] + '_offset.weight')\n            if (prefix + 'conv_offset.bias' not in state_dict\n                    and prefix[:-1] + '_offset.bias' in state_dict):\n                state_dict[prefix +\n                           'conv_offset.bias'] = state_dict.pop(prefix[:-1] +\n                                                                '_offset.bias')\n\n        if version is not None and version > 1:\n            print_log(\n                f'DeformConv2dPack {prefix.rstrip(\".\")} is upgraded to '\n                'version 2.',\n                logger='current')\n\n        super()._load_from_state_dict(state_dict, prefix, local_metadata,\n                                      strict, missing_keys, unexpected_keys,\n                                      error_msgs)\n\n\nif IS_MLU_AVAILABLE:\n    import torchvision\n    from mmengine.utils import digit_version\n    from torchvision.ops import deform_conv2d as tv_deform_conv2d\n\n    @MODELS.register_module('DCN', force=True)\n    class DeformConv2dPack_MLU(DeformConv2d):\n        \"\"\"This class is the DCN implementation of the MLU device.\n\n        The MLU\n        backend support of the operator has been implemented in torchvision.\n        The mmcv registration mechanism is used for multiplexing here. 
The\n        torchvision implementation of DCN is called.\n        Args:\n            in_channels (int): Same as nn.Conv2d.\n            out_channels (int): Same as nn.Conv2d.\n            kernel_size (int or tuple[int]): Same as nn.Conv2d.\n            stride (int): Same as nn.Conv2d, while tuple is not supported.\n            padding (int): Same as nn.Conv2d, while tuple is not supported.\n            dilation (int): Same as nn.Conv2d, while tuple is not supported.\n            groups (int): Same as nn.Conv2d.\n            bias (bool or str): If specified as `auto`, it will be decided by\n                the norm_cfg. Bias will be set as True if norm_cfg is None,\n                otherwise False.\n            im2col_step (int): Number of samples processed by\n                im2col_cuda_kernel per call. It will work when ``batch_size``\n                > ``im2col_step``, but ``batch_size`` must be divisible by\n                ``im2col_step``. Default: 32. `New in version 1.7.2.\n                Currently not supported on MLU devices.`\n        \"\"\"\n\n        def __init__(self, *args, **kwargs):\n            assert digit_version(torchvision.__version__) >= digit_version(\n                '0.10.0a0'), 'the version of torchvision should be >= 0.10.0'\n            super().__init__(*args, **kwargs)\n\n            self.conv_offset = nn.Conv2d(\n                self.in_channels,\n                self.deform_groups * 2 * self.kernel_size[0] *\n                self.kernel_size[1],\n                kernel_size=self.kernel_size,\n                stride=_pair(self.stride),\n                padding=_pair(self.padding),\n                dilation=_pair(self.dilation),\n                bias=True)\n            self.init_offset()\n\n        def init_offset(self):\n            self.conv_offset.weight.data.zero_()\n            self.conv_offset.bias.data.zero_()\n\n        def forward(self, x: Tensor) -> Tensor:  # type: ignore\n            cur_im2col_step = min(self.im2col_step, 
x.size(0))\n            assert (x.size(0) % cur_im2col_step\n                    ) == 0, 'batch size must be divisible by im2col_step'\n            offset = self.conv_offset(x)\n            x = x.type_as(offset)\n            weight = self.weight.type_as(x)\n            return tv_deform_conv2d(x, offset, weight, None, self.stride,\n                                    self.padding, self.dilation)\n"
  },
  {
    "path": "mmcv/ops/deform_roi_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional, Tuple\n\nfrom torch import Tensor, nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward'])\n\n\nclass DeformRoIPoolFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, rois, offset, output_size, spatial_scale,\n                 sampling_ratio, gamma):\n        inputs = [input, rois]\n        if offset is not None:\n            inputs = [input, rois, offset]\n        return g.op(\n            'mmcv::MMCVDeformRoIPool',\n            *inputs,\n            pooled_height_i=output_size[0],\n            pooled_width_i=output_size[1],\n            spatial_scale_f=spatial_scale,\n            sampling_ratio_f=sampling_ratio,\n            gamma_f=gamma,\n        )\n\n    @staticmethod\n    def forward(ctx,\n                input: Tensor,\n                rois: Tensor,\n                offset: Optional[Tensor],\n                output_size: Tuple[int, ...],\n                spatial_scale: float = 1.0,\n                sampling_ratio: int = 0,\n                gamma: float = 0.1) -> Tensor:\n        if offset is None:\n            offset = input.new_zeros(0)\n        ctx.output_size = _pair(output_size)\n        ctx.spatial_scale = float(spatial_scale)\n        ctx.sampling_ratio = int(sampling_ratio)\n        ctx.gamma = float(gamma)\n\n        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'\n\n        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],\n                        ctx.output_size[1])\n        output = input.new_zeros(output_shape)\n\n        ext_module.deform_roi_pool_forward(\n            input,\n            rois,\n            offset,\n            output,\n            pooled_height=ctx.output_size[0],\n     
       pooled_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            gamma=ctx.gamma)\n\n        ctx.save_for_backward(input, rois, offset)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(\n        ctx, grad_output: Tensor\n    ) -> Tuple[Tensor, None, Tensor, None, None, None, None]:\n        input, rois, offset = ctx.saved_tensors\n        grad_input = grad_output.new_zeros(input.shape)\n        grad_offset = grad_output.new_zeros(offset.shape)\n\n        ext_module.deform_roi_pool_backward(\n            grad_output,\n            input,\n            rois,\n            offset,\n            grad_input,\n            grad_offset,\n            pooled_height=ctx.output_size[0],\n            pooled_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            gamma=ctx.gamma)\n        if grad_offset.numel() == 0:\n            grad_offset = None\n        return grad_input, None, grad_offset, None, None, None, None\n\n\ndeform_roi_pool = DeformRoIPoolFunction.apply\n\n\nclass DeformRoIPool(nn.Module):\n\n    def __init__(self,\n                 output_size: Tuple[int, ...],\n                 spatial_scale: float = 1.0,\n                 sampling_ratio: int = 0,\n                 gamma: float = 0.1):\n        super().__init__()\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n        self.sampling_ratio = int(sampling_ratio)\n        self.gamma = float(gamma)\n\n    def forward(self,\n                input: Tensor,\n                rois: Tensor,\n                offset: Optional[Tensor] = None) -> Tensor:\n        return deform_roi_pool(input, rois, offset, self.output_size,\n                               self.spatial_scale, self.sampling_ratio,\n                               self.gamma)\n\n\nclass DeformRoIPoolPack(DeformRoIPool):\n\n  
  def __init__(self,\n                 output_size: Tuple[int, ...],\n                 output_channels: int,\n                 deform_fc_channels: int = 1024,\n                 spatial_scale: float = 1.0,\n                 sampling_ratio: int = 0,\n                 gamma: float = 0.1):\n        super().__init__(output_size, spatial_scale, sampling_ratio, gamma)\n\n        self.output_channels = output_channels\n        self.deform_fc_channels = deform_fc_channels\n\n        self.offset_fc = nn.Sequential(\n            nn.Linear(\n                self.output_size[0] * self.output_size[1] *\n                self.output_channels, self.deform_fc_channels),\n            nn.ReLU(inplace=True),\n            nn.Linear(self.deform_fc_channels, self.deform_fc_channels),\n            nn.ReLU(inplace=True),\n            nn.Linear(self.deform_fc_channels,\n                      self.output_size[0] * self.output_size[1] * 2))\n        self.offset_fc[-1].weight.data.zero_()\n        self.offset_fc[-1].bias.data.zero_()\n\n    def forward(self, input: Tensor, rois: Tensor) -> Tensor:  # type: ignore\n        assert input.size(1) == self.output_channels\n        x = deform_roi_pool(input, rois, None, self.output_size,\n                            self.spatial_scale, self.sampling_ratio,\n                            self.gamma)\n        rois_num = rois.size(0)\n        offset = self.offset_fc(x.view(rois_num, -1))\n        offset = offset.view(rois_num, 2, self.output_size[0],\n                             self.output_size[1])\n        return deform_roi_pool(input, rois, offset, self.output_size,\n                               self.spatial_scale, self.sampling_ratio,\n                               self.gamma)\n\n\nclass ModulatedDeformRoIPoolPack(DeformRoIPool):\n\n    def __init__(self,\n                 output_size: Tuple[int, ...],\n                 output_channels: int,\n                 deform_fc_channels: int = 1024,\n                 spatial_scale: float = 1.0,\n            
     sampling_ratio: int = 0,\n                 gamma: float = 0.1):\n        super().__init__(output_size, spatial_scale, sampling_ratio, gamma)\n\n        self.output_channels = output_channels\n        self.deform_fc_channels = deform_fc_channels\n\n        self.offset_fc = nn.Sequential(\n            nn.Linear(\n                self.output_size[0] * self.output_size[1] *\n                self.output_channels, self.deform_fc_channels),\n            nn.ReLU(inplace=True),\n            nn.Linear(self.deform_fc_channels, self.deform_fc_channels),\n            nn.ReLU(inplace=True),\n            nn.Linear(self.deform_fc_channels,\n                      self.output_size[0] * self.output_size[1] * 2))\n        self.offset_fc[-1].weight.data.zero_()\n        self.offset_fc[-1].bias.data.zero_()\n\n        self.mask_fc = nn.Sequential(\n            nn.Linear(\n                self.output_size[0] * self.output_size[1] *\n                self.output_channels, self.deform_fc_channels),\n            nn.ReLU(inplace=True),\n            nn.Linear(self.deform_fc_channels,\n                      self.output_size[0] * self.output_size[1] * 1),\n            nn.Sigmoid())\n        self.mask_fc[2].weight.data.zero_()\n        self.mask_fc[2].bias.data.zero_()\n\n    def forward(self, input: Tensor, rois: Tensor) -> Tensor:  # type: ignore\n        assert input.size(1) == self.output_channels\n        x = deform_roi_pool(input, rois, None, self.output_size,\n                            self.spatial_scale, self.sampling_ratio,\n                            self.gamma)\n        rois_num = rois.size(0)\n        offset = self.offset_fc(x.view(rois_num, -1))\n        offset = offset.view(rois_num, 2, self.output_size[0],\n                             self.output_size[1])\n        mask = self.mask_fc(x.view(rois_num, -1))\n        mask = mask.view(rois_num, 1, self.output_size[0], self.output_size[1])\n        d = deform_roi_pool(input, rois, offset, self.output_size,\n                     
       self.spatial_scale, self.sampling_ratio,\n                            self.gamma)\n        return d * mask\n"
  },
  {
    "path": "mmcv/ops/deprecated_wrappers.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n# This file is for backward compatibility.\n# Module wrappers for empty tensor have been moved to mmcv.cnn.bricks.\nimport warnings\n\nfrom ..cnn.bricks.wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d\n\n\nclass Conv2d_deprecated(Conv2d):\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        warnings.warn(\n            'Importing Conv2d wrapper from \"mmcv.ops\" will be deprecated in'\n            ' the future. Please import them from \"mmcv.cnn\" instead',\n            DeprecationWarning)\n\n\nclass ConvTranspose2d_deprecated(ConvTranspose2d):\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        warnings.warn(\n            'Importing ConvTranspose2d wrapper from \"mmcv.ops\" will be '\n            'deprecated in the future. Please import them from \"mmcv.cnn\" '\n            'instead', DeprecationWarning)\n\n\nclass MaxPool2d_deprecated(MaxPool2d):\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        warnings.warn(\n            'Importing MaxPool2d wrapper from \"mmcv.ops\" will be deprecated in'\n            ' the future. Please import them from \"mmcv.cnn\" instead',\n            DeprecationWarning)\n\n\nclass Linear_deprecated(Linear):\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        warnings.warn(\n            'Importing Linear wrapper from \"mmcv.ops\" will be deprecated in'\n            ' the future. Please import them from \"mmcv.cnn\" instead',\n            DeprecationWarning)\n"
  },
  {
    "path": "mmcv/ops/diff_iou_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n# Adapted from https://github.com/lilanxiao/Rotated_IoU/blob/master/box_intersection_2d.py  # noqa\n# Adapted from https://github.com/lilanxiao/Rotated_IoU/blob/master/oriented_iou_loss.py  # noqa\nfrom typing import Tuple\n\nimport torch\nfrom torch import Tensor\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\nEPSILON = 1e-8\next_module = ext_loader.load_ext('_ext',\n                                 ['diff_iou_rotated_sort_vertices_forward'])\n\n\nclass SortVertices(Function):\n\n    @staticmethod\n    def forward(ctx, vertices, mask, num_valid):\n        idx = ext_module.diff_iou_rotated_sort_vertices_forward(\n            vertices, mask, num_valid)\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(idx)\n        return idx\n\n    @staticmethod\n    def backward(ctx, gradout):\n        return ()\n\n\ndef box_intersection(corners1: Tensor,\n                     corners2: Tensor) -> Tuple[Tensor, Tensor]:\n    \"\"\"Find intersection points of rectangles.\n    Convention: if two edges are collinear, there is no intersection point.\n\n    Args:\n        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.\n        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.\n\n    Returns:\n        Tuple:\n         - Tensor: (B, N, 4, 4, 2) Intersections.\n         - Tensor: (B, N, 4, 4) Valid intersections mask.\n    \"\"\"\n    # build edges from corners\n    # B, N, 4, 4: Batch, Box, edge, point\n    line1 = torch.cat([corners1, corners1[:, :, [1, 2, 3, 0], :]], dim=3)\n    line2 = torch.cat([corners2, corners2[:, :, [1, 2, 3, 0], :]], dim=3)\n    # duplicate data to pair each edges from the boxes\n    # (B, N, 4, 4) -> (B, N, 4, 4, 4) : Batch, Box, edge1, edge2, point\n    line1_ext = line1.unsqueeze(3)\n    line2_ext = line2.unsqueeze(2)\n    x1, y1, x2, y2 = line1_ext.split([1, 1, 1, 1], dim=-1)\n    x3, y3, x4, y4 = line2_ext.split([1, 1, 1, 
1], dim=-1)\n    # math: https://en.wikipedia.org/wiki/Line%E2%80%93line_intersection\n    numerator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)\n    denumerator_t = (x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4)\n    t = denumerator_t / numerator\n    t[numerator == .0] = -1.\n    mask_t = (t > 0) & (t < 1)  # intersection on line segment 1\n    denumerator_u = (x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)\n    u = -denumerator_u / numerator\n    u[numerator == .0] = -1.\n    mask_u = (u > 0) & (u < 1)  # intersection on line segment 2\n    mask = mask_t * mask_u\n    # overwrite with EPSILON. otherwise numerically unstable\n    t = denumerator_t / (numerator + EPSILON)\n    intersections = torch.stack([x1 + t * (x2 - x1), y1 + t * (y2 - y1)],\n                                dim=-1)\n    intersections = intersections * mask.float().unsqueeze(-1)\n    return intersections, mask\n\n\ndef box1_in_box2(corners1: Tensor, corners2: Tensor) -> Tensor:\n    \"\"\"Check if corners of box1 lie in box2.\n    Convention: if a corner is exactly on the edge of the other box,\n    it's also a valid point.\n\n    Args:\n        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.\n        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.\n\n    Returns:\n        Tensor: (B, N, 4) Intersection.\n    \"\"\"\n    # a, b, c, d - 4 vertices of box2\n    a = corners2[:, :, 0:1, :]  # (B, N, 1, 2)\n    b = corners2[:, :, 1:2, :]  # (B, N, 1, 2)\n    d = corners2[:, :, 3:4, :]  # (B, N, 1, 2)\n    # ab, am, ad - vectors between corresponding vertices\n    ab = b - a  # (B, N, 1, 2)\n    am = corners1 - a  # (B, N, 4, 2)\n    ad = d - a  # (B, N, 1, 2)\n    prod_ab = torch.sum(ab * am, dim=-1)  # (B, N, 4)\n    norm_ab = torch.sum(ab * ab, dim=-1)  # (B, N, 1)\n    prod_ad = torch.sum(ad * am, dim=-1)  # (B, N, 4)\n    norm_ad = torch.sum(ad * ad, dim=-1)  # (B, N, 1)\n    # NOTE: the expression looks ugly but is stable if the two boxes\n    # are exactly the same also stable 
with different scale of bboxes\n    cond1 = (prod_ab / norm_ab > -1e-6) * (prod_ab / norm_ab < 1 + 1e-6\n                                           )  # (B, N, 4)\n    cond2 = (prod_ad / norm_ad > -1e-6) * (prod_ad / norm_ad < 1 + 1e-6\n                                           )  # (B, N, 4)\n    return cond1 * cond2\n\n\ndef box_in_box(corners1: Tensor, corners2: Tensor) -> Tuple[Tensor, Tensor]:\n    \"\"\"Check if corners of two boxes lie in each other.\n\n    Args:\n        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.\n        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.\n\n    Returns:\n        Tuple:\n         - Tensor: (B, N, 4) True if i-th corner of box1 is in box2.\n         - Tensor: (B, N, 4) True if i-th corner of box2 is in box1.\n    \"\"\"\n    c1_in_2 = box1_in_box2(corners1, corners2)\n    c2_in_1 = box1_in_box2(corners2, corners1)\n    return c1_in_2, c2_in_1\n\n\ndef build_vertices(corners1: Tensor, corners2: Tensor, c1_in_2: Tensor,\n                   c2_in_1: Tensor, intersections: Tensor,\n                   valid_mask: Tensor) -> Tuple[Tensor, Tensor]:\n    \"\"\"Find vertices of intersection area.\n\n    Args:\n        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.\n        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.\n        c1_in_2 (Tensor): (B, N, 4) True if i-th corner of box1 is in box2.\n        c2_in_1 (Tensor): (B, N, 4) True if i-th corner of box2 is in box1.\n        intersections (Tensor): (B, N, 4, 4, 2) Intersections.\n        valid_mask (Tensor): (B, N, 4, 4) Valid intersections mask.\n\n    Returns:\n        Tuple:\n         - Tensor: (B, N, 24, 2) Vertices of intersection area;\n               only some elements are valid.\n         - Tensor: (B, N, 24) Mask of valid elements in vertices.\n    \"\"\"\n    # NOTE: inter has elements equals zero and has zeros gradient\n    # (masked by multiplying with 0); can be used as trick\n    B = corners1.size()[0]\n    N = corners1.size()[1]\n    
# (B, N, 4 + 4 + 16, 2)\n    vertices = torch.cat(\n        [corners1, corners2,\n         intersections.view([B, N, -1, 2])], dim=2)\n    # Bool (B, N, 4 + 4 + 16)\n    mask = torch.cat([c1_in_2, c2_in_1, valid_mask.view([B, N, -1])], dim=2)\n    return vertices, mask\n\n\ndef sort_indices(vertices: Tensor, mask: Tensor) -> Tensor:\n    \"\"\"Sort indices.\n    Note:\n        why 9? the polygon has maximal 8 vertices.\n        +1 to duplicate the first element.\n        the index should have following structure:\n            (A, B, C, ... , A, X, X, X)\n        and X indicates the index of arbitrary elements in the last\n        16 (intersections not corners) with value 0 and mask False.\n        (cause they have zero value and zero gradient)\n\n    Args:\n        vertices (Tensor): (B, N, 24, 2) Box vertices.\n        mask (Tensor): (B, N, 24) Mask.\n\n    Returns:\n        Tensor: (B, N, 9) Sorted indices.\n\n    \"\"\"\n    num_valid = torch.sum(mask.int(), dim=2).int()  # (B, N)\n    mean = torch.sum(\n        vertices * mask.float().unsqueeze(-1), dim=2,\n        keepdim=True) / num_valid.unsqueeze(-1).unsqueeze(-1)\n    vertices_normalized = vertices - mean  # normalization makes sorting easier\n    return SortVertices.apply(vertices_normalized, mask, num_valid).long()\n\n\ndef calculate_area(idx_sorted: Tensor,\n                   vertices: Tensor) -> Tuple[Tensor, Tensor]:\n    \"\"\"Calculate area of intersection.\n\n    Args:\n        idx_sorted (Tensor): (B, N, 9) Sorted vertex ids.\n        vertices (Tensor): (B, N, 24, 2) Vertices.\n\n    Returns:\n        Tuple:\n         - Tensor (B, N): Area of intersection.\n         - Tensor: (B, N, 9, 2) Vertices of polygon with zero padding.\n    \"\"\"\n    idx_ext = idx_sorted.unsqueeze(-1).repeat([1, 1, 1, 2])\n    selected = torch.gather(vertices, 2, idx_ext)\n    total = selected[:, :, 0:-1, 0] * selected[:, :, 1:, 1] \\\n        - selected[:, :, 0:-1, 1] * selected[:, :, 1:, 0]\n    total = 
torch.sum(total, dim=2)\n    area = torch.abs(total) / 2\n    return area, selected\n\n\ndef oriented_box_intersection_2d(corners1: Tensor,\n                                 corners2: Tensor) -> Tuple[Tensor, Tensor]:\n    \"\"\"Calculate intersection area of 2d rotated boxes.\n\n    Args:\n        corners1 (Tensor): (B, N, 4, 2) First batch of boxes.\n        corners2 (Tensor): (B, N, 4, 2) Second batch of boxes.\n\n    Returns:\n        Tuple:\n         - Tensor (B, N): Area of intersection.\n         - Tensor (B, N, 9, 2): Vertices of polygon with zero padding.\n    \"\"\"\n    intersections, valid_mask = box_intersection(corners1, corners2)\n    c12, c21 = box_in_box(corners1, corners2)\n    vertices, mask = build_vertices(corners1, corners2, c12, c21,\n                                    intersections, valid_mask)\n    sorted_indices = sort_indices(vertices, mask)\n    return calculate_area(sorted_indices, vertices)\n\n\ndef box2corners(box: Tensor) -> Tensor:\n    \"\"\"Convert rotated 2d box coordinate to corners.\n\n    Args:\n        box (Tensor): (B, N, 5) with x, y, w, h, alpha.\n\n    Returns:\n        Tensor: (B, N, 4, 2) Corners.\n    \"\"\"\n    B = box.size()[0]\n    x, y, w, h, alpha = box.split([1, 1, 1, 1, 1], dim=-1)\n    x4 = box.new_tensor([0.5, -0.5, -0.5, 0.5]).to(box.device)\n    x4 = x4 * w  # (B, N, 4)\n    y4 = box.new_tensor([0.5, 0.5, -0.5, -0.5]).to(box.device)\n    y4 = y4 * h  # (B, N, 4)\n    corners = torch.stack([x4, y4], dim=-1)  # (B, N, 4, 2)\n    sin = torch.sin(alpha)\n    cos = torch.cos(alpha)\n    row1 = torch.cat([cos, sin], dim=-1)\n    row2 = torch.cat([-sin, cos], dim=-1)  # (B, N, 2)\n    rot_T = torch.stack([row1, row2], dim=-2)  # (B, N, 2, 2)\n    rotated = torch.bmm(corners.view([-1, 4, 2]), rot_T.view([-1, 2, 2]))\n    rotated = rotated.view([B, -1, 4, 2])  # (B * N, 4, 2) -> (B, N, 4, 2)\n    rotated[..., 0] += x\n    rotated[..., 1] += y\n    return rotated\n\n\ndef diff_iou_rotated_2d(box1: Tensor, box2: 
Tensor) -> Tensor:\n    \"\"\"Calculate differentiable iou of rotated 2d boxes.\n\n    Args:\n        box1 (Tensor): (B, N, 5) First box.\n        box2 (Tensor): (B, N, 5) Second box.\n\n    Returns:\n        Tensor: (B, N) IoU.\n    \"\"\"\n    corners1 = box2corners(box1)\n    corners2 = box2corners(box2)\n    intersection, _ = oriented_box_intersection_2d(corners1,\n                                                   corners2)  # (B, N)\n    area1 = box1[:, :, 2] * box1[:, :, 3]\n    area2 = box2[:, :, 2] * box2[:, :, 3]\n    union = area1 + area2 - intersection\n    iou = intersection / union\n    return iou\n\n\ndef diff_iou_rotated_3d(box3d1: Tensor, box3d2: Tensor) -> Tensor:\n    \"\"\"Calculate differentiable iou of rotated 3d boxes.\n\n    Args:\n        box3d1 (Tensor): (B, N, 3+3+1) First box (x,y,z,w,h,l,alpha).\n        box3d2 (Tensor): (B, N, 3+3+1) Second box (x,y,z,w,h,l,alpha).\n\n    Returns:\n        Tensor: (B, N) IoU.\n    \"\"\"\n    box1 = box3d1[..., [0, 1, 3, 4, 6]]  # 2d box\n    box2 = box3d2[..., [0, 1, 3, 4, 6]]\n    corners1 = box2corners(box1)\n    corners2 = box2corners(box2)\n    intersection, _ = oriented_box_intersection_2d(corners1, corners2)\n    zmax1 = box3d1[..., 2] + box3d1[..., 5] * 0.5\n    zmin1 = box3d1[..., 2] - box3d1[..., 5] * 0.5\n    zmax2 = box3d2[..., 2] + box3d2[..., 5] * 0.5\n    zmin2 = box3d2[..., 2] - box3d2[..., 5] * 0.5\n    z_overlap = (torch.min(zmax1, zmax2) -\n                 torch.max(zmin1, zmin2)).clamp_(min=0.)\n    intersection_3d = intersection * z_overlap\n    volume1 = box3d1[..., 3] * box3d1[..., 4] * box3d1[..., 5]\n    volume2 = box3d2[..., 3] * box3d2[..., 4] * box3d2[..., 5]\n    union_3d = volume1 + volume2 - intersection_3d\n    return intersection_3d / union_3d\n"
  },
  {
    "path": "mmcv/ops/filtered_lrelu.py",
    "content": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors retain all intellectual property\n# and proprietary rights in and to this software, related documentation\n# and any modifications thereto.  Any use, reproduction, disclosure or\n# distribution of this software and related documentation without an express\n# license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/filtered_lrelu.py # noqa\nimport warnings\nfrom typing import Dict, Optional, Union\n\nimport numpy as np\nimport torch\n\nfrom ..utils import IS_MUSA_AVAILABLE, ext_loader\nfrom .bias_act import bias_act\nfrom .upfirdn2d import _get_filter_size, _parse_padding, upfirdn2d\n\next_module = ext_loader.load_ext('_ext',\n                                 ['filtered_lrelu', 'filtered_lrelu_act_'])\n\n_plugin = None\n\n\ndef filtered_lrelu(input: torch.Tensor,\n                   filter_up: Optional[torch.Tensor] = None,\n                   filter_down: Optional[torch.Tensor] = None,\n                   bias: Optional[torch.Tensor] = None,\n                   up: int = 1,\n                   down: int = 1,\n                   padding: int = 0,\n                   gain: float = np.sqrt(2),\n                   slope: float = 0.2,\n                   clamp: Optional[Union[float, int]] = None,\n                   flip_filter: bool = False,\n                   use_custom_op: bool = True):\n    \"\"\"Filtered leaky ReLU for a batch of 2D images.\n\n    Performs the following sequence of operations for each channel:\n\n    1. Add channel-specific bias if `bias` is provided.\n\n    2. Upsample the image by inserting N-1 zeros after each pixel (`up`).\n\n    3. Pad the image with the specified number of zeros on each side\n      (`padding`). Negative padding corresponds to cropping the image.\n\n    4. 
Convolve the image with the specified upsampling FIR filter\n        (`filter_up`), shrinking it so that the footprint of all output pixels\n        lies within the input image.\n\n    5. Multiply each value by the provided gain factor (`gain`).\n\n    6. Apply leaky ReLU activation function to each value.\n\n    7. Clamp each value between -clamp and +clamp, if `clamp` parameter is\n       provided.\n\n    8. Convolve the image with the specified downsampling FIR filter\n        (`filter_down`), shrinking it so that the footprint of all output\n        pixels lies within the input image.\n\n    9. Downsample the image by keeping every Nth pixel (`down`).\n\n    The fused op is considerably more efficient than performing the same\n    calculation using standard PyTorch ops. It supports gradients of arbitrary\n    order.\n\n    Args:\n        input (torch.Tensor): Float32/float16/float64 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter_up (torch.Tensor): Float32 upsampling FIR filter of the shape\n            `[filter_height, filter_width]` (non-separable), `[filter_taps]`\n            (separable), or `None` (identity). Defaults to None.\n        filter_down (torch.Tensor): Float32 downsampling FIR filter of the\n            shape `[filter_height, filter_width]` (non-separable),\n            `[filter_taps]` (separable), or `None` (identity).\n            Defaults to None.\n        bias (torch.Tensor): Bias vector, or `None` to disable. Must be\n            a 1D tensor of the same type as `input`. The length of vector must\n            match the channel dimension of `input`. Defaults to None.\n        up (int): Integer upsampling factor. Defaults to 1.\n        down (int): Integer downsampling factor. Defaults to 1.\n        padding (int): Padding with respect to the upsampled image. Can be a\n            single number or a list/tuple `[x, y]` or `[x_before, x_after,\n            y_before, y_after]`. 
Defaults to 0.\n        gain (float): Overall scaling factor for signal magnitude.\n            Defaults to np.sqrt(2).\n        slope (float): Slope on the negative side of leaky ReLU.\n            Defaults to 0.2.\n        clamp (Optional[Union[float, int]]): Maximum magnitude for leaky ReLU\n            output. Defaults to None.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n        use_custom_op (bool): Whether to use customized op.\n            Defaults to True.\n\n    Returns:\n        Tensor of the shape `[batch_size, num_channels, out_height,\n        out_width]`.\n    \"\"\"\n    assert isinstance(input, torch.Tensor)\n    if use_custom_op and input.is_cuda:\n        return _filtered_lrelu_cuda(\n            up=up,\n            down=down,\n            padding=padding,\n            gain=gain,\n            slope=slope,\n            clamp=clamp,\n            flip_filter=flip_filter).apply(input, filter_up, filter_down, bias,\n                                           None, 0, 0)\n    if use_custom_op and IS_MUSA_AVAILABLE and input.is_musa:\n        # @MTAI: there are some bugs here, so move the tensors to CPU and\n        # use the reference implementation instead.\n        input = input.cpu()\n        if bias is not None:\n            bias = bias.cpu()\n        if filter_up is not None:\n            filter_up = filter_up.cpu()\n        if filter_down is not None:\n            filter_down = filter_down.cpu()\n        return _filtered_lrelu_ref(\n            input,\n            filter_up=filter_up,\n            filter_down=filter_down,\n            bias=bias,\n            up=up,\n            down=down,\n            padding=padding,\n            gain=gain,\n            slope=slope,\n            clamp=clamp,\n            flip_filter=flip_filter)\n    return _filtered_lrelu_ref(\n        input,\n        filter_up=filter_up,\n        filter_down=filter_down,\n        bias=bias,\n        up=up,\n        down=down,\n        padding=padding,\n        gain=gain,\n        slope=slope,\n        
clamp=clamp,\n        flip_filter=flip_filter)\n\n\ndef _filtered_lrelu_ref(input: torch.Tensor,\n                        filter_up: Optional[torch.Tensor] = None,\n                        filter_down: Optional[torch.Tensor] = None,\n                        bias: Optional[torch.Tensor] = None,\n                        up: int = 1,\n                        down: int = 1,\n                        padding: int = 0,\n                        gain: float = np.sqrt(2),\n                        slope: float = 0.2,\n                        clamp: Optional[Union[float, int]] = None,\n                        flip_filter: bool = False):\n    \"\"\"Slow and memory-inefficient reference implementation of\n    `filtered_lrelu()` using existing `upfirdn2d()` and `bias_act()` ops.\n\n    Args:\n        input (torch.Tensor): Float32/float16/float64 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter_up (torch.Tensor): Float32 upsampling FIR filter of the shape\n            `[filter_height, filter_width]` (non-separable), `[filter_taps]`\n            (separable), or `None` (identity). Defaults to None.\n        filter_down (torch.Tensor): Float32 downsampling FIR filter of the\n            shape `[filter_height, filter_width]` (non-separable),\n            `[filter_taps]` (separable), or `None` (identity).\n            Defaults to None.\n        bias (torch.Tensor): Bias vector, or `None` to disable. Must be\n            a 1D tensor of the same type as `input`. The length of vector must\n            match the channel dimension of `input`. Defaults to None.\n        up (int): Integer upsampling factor. Defaults to 1.\n        down (int): Integer downsampling factor. Defaults to 1.\n        padding (int): Padding with respect to the upsampled image. Can be a\n            single number or a list/tuple `[x, y]` or `[x_before, x_after,\n            y_before, y_after]`. 
Defaults to 0.\n        gain (float): Overall scaling factor for signal magnitude.\n            Defaults to np.sqrt(2).\n        slope (float): Slope on the negative side of leaky ReLU.\n            Defaults to 0.2.\n        clamp (float or int): Maximum magnitude for leaky ReLU\n            output. Defaults to None.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n\n    Returns:\n        Tensor of the shape `[batch_size, num_channels, out_height,\n        out_width]`.\n    \"\"\"\n    assert isinstance(input, torch.Tensor) and input.ndim == 4\n    filter_up_w, filter_up_h = _get_filter_size(filter_up)\n    filter_down_w, filter_down_h = _get_filter_size(filter_down)\n    if bias is not None:\n        assert isinstance(bias, torch.Tensor) and bias.dtype == input.dtype\n    assert isinstance(up, int) and up >= 1\n    assert isinstance(down, int) and down >= 1\n    px0, px1, py0, py1 = _parse_padding(padding)\n    assert gain == float(gain) and gain > 0\n    assert slope == float(slope) and slope >= 0\n    assert clamp is None or (clamp == float(clamp) and clamp >= 0)\n\n    # Calculate output size.\n    batch_size, channels, in_h, in_w = input.shape\n    in_dtype = input.dtype\n    out_w = (in_w * up + (px0 + px1) - (filter_up_w - 1) -\n             (filter_down_w - 1) + (down - 1)) // down\n    out_h = (in_h * up + (py0 + py1) - (filter_up_h - 1) -\n             (filter_down_h - 1) + (down - 1)) // down\n\n    # Compute using existing ops.\n    output = bias_act(input=input, bias=bias)  # Apply bias.\n    output = upfirdn2d(\n        input=output,\n        filter=filter_up,\n        up=up,\n        padding=[px0, px1, py0, py1],\n        gain=up**2,\n        flip_filter=flip_filter)  # Upsample.\n    output = bias_act(\n        input=output, act='lrelu', alpha=slope, gain=gain,\n        clamp=clamp)  # Bias, leaky ReLU, clamp.\n    output = upfirdn2d(\n        input=output, filter=filter_down, down=down,\n        
flip_filter=flip_filter)  # Downsample.\n\n    assert output.shape == (batch_size, channels, out_h, out_w)\n    assert output.dtype == in_dtype\n    return output\n\n\n_filtered_lrelu_cuda_cache: Dict = dict()\n\n\ndef _filtered_lrelu_cuda(up: int = 1,\n                         down: int = 1,\n                         padding: int = 0,\n                         gain: float = np.sqrt(2),\n                         slope: float = 0.2,\n                         clamp: Optional[Union[float, int]] = None,\n                         flip_filter: bool = False):\n    \"\"\"Fast CUDA implementation of `filtered_lrelu()` using custom ops.\n\n    Args:\n        up (int): Integer upsampling factor. Defaults to 1.\n        down (int): Integer downsampling factor. Defaults to 1.\n        padding (int): Padding with respect to the upsampled image. Can be a\n            single number or a list/tuple `[x, y]` or `[x_before, x_after,\n            y_before, y_after]`. Defaults to 0.\n        gain (float): Overall scaling factor for signal magnitude.\n            Defaults to np.sqrt(2).\n        slope (float): Slope on the negative side of leaky ReLU.\n            Defaults to 0.2.\n        clamp (float or int): Maximum magnitude for leaky ReLU\n            output. 
Defaults to None.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n\n    Returns:\n        A cached `torch.autograd.Function` subclass specialized for the\n        given parameters; call its `apply()` to get a tensor of the shape\n        `[batch_size, num_channels, out_height, out_width]`.\n    \"\"\"\n    assert isinstance(up, int) and up >= 1\n    assert isinstance(down, int) and down >= 1\n    px0, px1, py0, py1 = _parse_padding(padding)\n    assert gain == float(gain) and gain > 0\n    gain = float(gain)\n    assert slope == float(slope) and slope >= 0\n    slope = float(slope)\n    assert clamp is None or (clamp == float(clamp) and clamp >= 0)\n    clamp = float(clamp if clamp is not None else 'inf')\n\n    # Lookup from cache.\n    key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)\n    if key in _filtered_lrelu_cuda_cache:\n        return _filtered_lrelu_cuda_cache[key]\n\n    # Forward op.\n    class FilteredLReluCuda(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, input, filter_up, filter_down, bias, si, sx, sy):\n            # pylint: disable=arguments-differ\n            assert isinstance(input, torch.Tensor) and input.ndim == 4\n\n            # Replace empty up/downsample kernels with full 1x1 kernels\n            # (faster than separable).\n            if filter_up is None:\n                filter_up = torch.ones([1, 1],\n                                       dtype=torch.float32,\n                                       device=input.device)\n            if filter_down is None:\n                filter_down = torch.ones([1, 1],\n                                         dtype=torch.float32,\n                                         device=input.device)\n            assert 1 <= filter_up.ndim <= 2\n            assert 1 <= filter_down.ndim <= 2\n\n            # Replace separable 1x1 kernels with full 1x1 kernels when scale\n            # factor is 1.\n            if up == 1 and filter_up.ndim == 1 and filter_up.shape[0] == 1:\n                filter_up = 
filter_up.square()[None]\n            if down == 1 and filter_down.ndim == 1 and filter_down.shape[\n                    0] == 1:\n                filter_down = filter_down.square()[None]\n\n            # Missing sign input tensor.\n            if si is None:\n                si = torch.empty([0])\n\n            # Missing bias tensor.\n            if bias is None:\n                bias = torch.zeros([input.shape[1]],\n                                   dtype=input.dtype,\n                                   device=input.device)\n\n            # Construct internal sign tensor only if gradients are needed.\n            write_signs = (si.numel() == 0) and (input.requires_grad\n                                                 or bias.requires_grad)\n\n            # Warn if input storage strides are not in decreasing order due to\n            # e.g. channels-last layout.\n            strides = [\n                input.stride(i) for i in range(input.ndim) if input.size(i) > 1\n            ]\n            if any(a < b for a, b in zip(strides[:-1], strides[1:])):\n                warnings.warn(\n                    'low-performance memory layout detected in filtered_lrelu '\n                    'input', RuntimeWarning)\n\n            # Call C++/Cuda plugin if datatype is supported.\n            if input.dtype in [torch.float16, torch.float32]:\n                if torch.cuda.current_stream(\n                        input.device) != torch.cuda.default_stream(\n                            input.device):\n                    warnings.warn(\n                        'filtered_lrelu called with non-default cuda stream '\n                        'but concurrent execution is not supported',\n                        RuntimeWarning)\n                y, so, return_code = ext_module.filtered_lrelu(\n                    input, filter_up, filter_down, bias, si.to(input.device),\n                    up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp,\n                    flip_filter, 
write_signs)\n            else:\n                return_code = -1\n\n            # No Cuda kernel found? Fall back to generic implementation.\n            # Still more memory efficient than the reference implementation\n            # because only the bit-packed sign tensor is retained for gradient\n            # computation.\n            if return_code < 0:\n                warnings.warn(\n                    'filtered_lrelu called with parameters that have no '\n                    'optimized CUDA kernel, using generic fallback',\n                    RuntimeWarning)\n\n                y = input.add(bias.unsqueeze(-1).unsqueeze(-1))  # Add bias.\n                y = upfirdn2d(\n                    input=y,\n                    filter=filter_up,\n                    up=up,\n                    padding=[px0, px1, py0, py1],\n                    gain=float(up**2),\n                    flip_filter=flip_filter)  # Upsample.\n                # Activation function and sign handling. Modifies y in-place.\n                so = ext_module.filtered_lrelu_act_(y, si.to(y.device), sx, sy,\n                                                    gain, slope, clamp,\n                                                    write_signs)\n                y = upfirdn2d(\n                    input=y,\n                    filter=filter_down,\n                    down=down,\n                    flip_filter=flip_filter)  # Downsample.\n\n            # Prepare for gradient computation.\n            ctx.save_for_backward(filter_up, filter_down,\n                                  (si if si.numel() else so))\n            ctx.x_shape = input.shape\n            ctx.y_shape = y.shape\n            ctx.s_ofs = sx, sy\n            return y\n\n        @staticmethod\n        def backward(ctx, dy):  # pylint: disable=arguments-differ\n            filter_up, filter_down, si = ctx.saved_tensors\n            _, _, xh, xw = ctx.x_shape\n            _, _, yh, yw = ctx.y_shape\n            sx, sy = ctx.s_ofs\n     
       dx = None  # 0\n            dfu = None\n            assert not ctx.needs_input_grad[1]\n            dfd = None\n            assert not ctx.needs_input_grad[2]\n            db = None  # 3\n            dsi = None\n            assert not ctx.needs_input_grad[4]\n            dsx = None\n            assert not ctx.needs_input_grad[5]\n            dsy = None\n            assert not ctx.needs_input_grad[6]\n\n            if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:\n                pp = [\n                    (filter_up.shape[-1] - 1) + (filter_down.shape[-1] - 1) -\n                    px0,\n                    xw * up - yw * down + px0 - (up - 1),\n                    (filter_up.shape[0] - 1) + (filter_down.shape[0] - 1) -\n                    py0,\n                    xh * up - yh * down + py0 - (up - 1),\n                ]\n                gg = gain * (up**2) / (down**2)\n                ff = (not flip_filter)\n                sx = sx - (filter_up.shape[-1] - 1) + px0\n                sy = sy - (filter_up.shape[0] - 1) + py0\n                dx = _filtered_lrelu_cuda(\n                    up=down,\n                    down=up,\n                    padding=pp,\n                    gain=gg,\n                    slope=slope,\n                    clamp=None,\n                    flip_filter=ff).apply(dy, filter_down, filter_up, None, si,\n                                          sx, sy)\n\n            if ctx.needs_input_grad[3]:\n                db = dx.sum([0, 2, 3])\n\n            return dx, dfu, dfd, db, dsi, dsx, dsy\n\n    # Add to cache.\n    _filtered_lrelu_cuda_cache[key] = FilteredLReluCuda\n    return FilteredLReluCuda\n\n\n_filtered_lrelu_musa_cache: Dict = dict()\n\n\ndef _filtered_lrelu_musa(up: int = 1,\n                         down: int = 1,\n                         padding: int = 0,\n                         gain: float = np.sqrt(2),\n                         slope: float = 0.2,\n                         clamp: Optional[Union[float, 
int]] = None,\n                         flip_filter: bool = False):\n    \"\"\"Fast MUSA implementation of `filtered_lrelu()` using custom ops.\n\n    Args:\n        up (int): Integer upsampling factor. Defaults to 1.\n        down (int): Integer downsampling factor. Defaults to 1.\n        padding (int): Padding with respect to the upsampled image. Can be a\n            single number or a list/tuple `[x, y]` or `[x_before, x_after,\n            y_before, y_after]`. Defaults to 0.\n        gain (float): Overall scaling factor for signal magnitude.\n            Defaults to np.sqrt(2).\n        slope (float): Slope on the negative side of leaky ReLU.\n            Defaults to 0.2.\n        clamp (float or int): Maximum magnitude for leaky ReLU\n            output. Defaults to None.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n\n    Returns:\n        A cached `torch.autograd.Function` subclass specialized for the\n        given parameters; call its `apply()` to get a tensor of the shape\n        `[batch_size, num_channels, out_height, out_width]`.\n    \"\"\"\n    assert isinstance(up, int) and up >= 1\n    assert isinstance(down, int) and down >= 1\n    px0, px1, py0, py1 = _parse_padding(padding)\n    assert gain == float(gain) and gain > 0\n    gain = float(gain)\n    assert slope == float(slope) and slope >= 0\n    slope = float(slope)\n    assert clamp is None or (clamp == float(clamp) and clamp >= 0)\n    clamp = float(clamp if clamp is not None else 'inf')\n\n    # Lookup from cache.\n    key = (up, down, px0, px1, py0, py1, gain, slope, clamp, flip_filter)\n    if key in _filtered_lrelu_musa_cache:\n        return _filtered_lrelu_musa_cache[key]\n\n    # Forward op.\n    class FilteredLReluMusa(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, input, filter_up, filter_down, bias, si, sx, sy):\n            # pylint: disable=arguments-differ\n            assert isinstance(input, torch.Tensor) and input.ndim == 4\n\n            # Replace empty up/downsample kernels with full 1x1 kernels\n          
  # (faster than separable).\n            if filter_up is None:\n                filter_up = torch.ones([1, 1],\n                                       dtype=torch.float32,\n                                       device=input.device)\n            if filter_down is None:\n                filter_down = torch.ones([1, 1],\n                                         dtype=torch.float32,\n                                         device=input.device)\n            assert 1 <= filter_up.ndim <= 2\n            assert 1 <= filter_down.ndim <= 2\n\n            # Replace separable 1x1 kernels with full 1x1 kernels when scale\n            # factor is 1.\n            if up == 1 and filter_up.ndim == 1 and filter_up.shape[0] == 1:\n                filter_up = filter_up.square()[None]\n            if down == 1 and filter_down.ndim == 1 and filter_down.shape[\n                    0] == 1:\n                filter_down = filter_down.square()[None]\n\n            # Missing sign input tensor.\n            if si is None:\n                si = torch.empty([0])\n\n            # Missing bias tensor.\n            if bias is None:\n                bias = torch.zeros([input.shape[1]],\n                                   dtype=input.dtype,\n                                   device=input.device)\n\n            # Construct internal sign tensor only if gradients are needed.\n            write_signs = (si.numel() == 0) and (input.requires_grad\n                                                 or bias.requires_grad)\n\n            # Warn if input storage strides are not in decreasing order due to\n            # e.g. 
channels-last layout.\n            strides = [\n                input.stride(i) for i in range(input.ndim) if input.size(i) > 1\n            ]\n            if any(a < b for a, b in zip(strides[:-1], strides[1:])):\n                warnings.warn(\n                    'low-performance memory layout detected in filtered_lrelu '\n                    'input', RuntimeWarning)\n\n            # Call C++/MUSA plugin if datatype is supported.\n            if input.dtype in [torch.float16, torch.float32]:\n                if torch.musa.current_stream(\n                        input.device) != torch.musa.default_stream(\n                            input.device):\n                    warnings.warn(\n                        'filtered_lrelu called with non-default musa stream '\n                        'but concurrent execution is not supported',\n                        RuntimeWarning)\n                y, so, return_code = ext_module.filtered_lrelu(\n                    input, filter_up, filter_down, bias, si.to(input.device),\n                    up, down, px0, px1, py0, py1, sx, sy, gain, slope, clamp,\n                    flip_filter, write_signs)\n            else:\n                return_code = -1\n\n            # No Musa kernel found? 
Fall back to generic implementation.\n            # Still more memory efficient than the reference implementation\n            # because only the bit-packed sign tensor is retained for gradient\n            # computation.\n            if return_code < 0:\n                warnings.warn(\n                    'filtered_lrelu called with parameters that have no '\n                    'optimized MUSA kernel, using generic fallback',\n                    RuntimeWarning)\n\n                y = input.add(bias.unsqueeze(-1).unsqueeze(-1))  # Add bias.\n                y = upfirdn2d(\n                    input=y,\n                    filter=filter_up,\n                    up=up,\n                    padding=[px0, px1, py0, py1],\n                    gain=float(up**2),\n                    flip_filter=flip_filter)  # Upsample.\n                # Activation function and sign handling. Modifies y in-place.\n                so = ext_module.filtered_lrelu_act_(y, si.to(y.device), sx, sy,\n                                                    gain, slope, clamp,\n                                                    write_signs)\n                y = upfirdn2d(\n                    input=y,\n                    filter=filter_down,\n                    down=down,\n                    flip_filter=flip_filter)  # Downsample.\n\n            # Prepare for gradient computation.\n            ctx.save_for_backward(filter_up, filter_down,\n                                  (si if si.numel() else so))\n            ctx.x_shape = input.shape\n            ctx.y_shape = y.shape\n            ctx.s_ofs = sx, sy\n            return y\n\n        @staticmethod\n        def backward(ctx, dy):  # pylint: disable=arguments-differ\n            filter_up, filter_down, si = ctx.saved_tensors\n            _, _, xh, xw = ctx.x_shape\n            _, _, yh, yw = ctx.y_shape\n            sx, sy = ctx.s_ofs\n            dx = None  # 0\n            dfu = None\n            assert not ctx.needs_input_grad[1]\n          
  dfd = None\n            assert not ctx.needs_input_grad[2]\n            db = None  # 3\n            dsi = None\n            assert not ctx.needs_input_grad[4]\n            dsx = None\n            assert not ctx.needs_input_grad[5]\n            dsy = None\n            assert not ctx.needs_input_grad[6]\n\n            if ctx.needs_input_grad[0] or ctx.needs_input_grad[3]:\n                pp = [\n                    (filter_up.shape[-1] - 1) + (filter_down.shape[-1] - 1) -\n                    px0,\n                    xw * up - yw * down + px0 - (up - 1),\n                    (filter_up.shape[0] - 1) + (filter_down.shape[0] - 1) -\n                    py0,\n                    xh * up - yh * down + py0 - (up - 1),\n                ]\n                gg = gain * (up**2) / (down**2)\n                ff = (not flip_filter)\n                sx = sx - (filter_up.shape[-1] - 1) + px0\n                sy = sy - (filter_up.shape[0] - 1) + py0\n                dx = _filtered_lrelu_musa(\n                    up=down,\n                    down=up,\n                    padding=pp,\n                    gain=gg,\n                    slope=slope,\n                    clamp=None,\n                    flip_filter=ff).apply(dy, filter_down, filter_up, None, si,\n                                          sx, sy)\n\n            if ctx.needs_input_grad[3]:\n                db = dx.sum([0, 2, 3])\n\n            return dx, dfu, dfd, db, dsi, dsx, dsy\n\n    # Add to cache.\n    _filtered_lrelu_musa_cache[key] = FilteredLReluMusa\n    return FilteredLReluMusa\n"
  },
  {
    "path": "mmcv/ops/focal_loss.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional, Union\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward',\n    'softmax_focal_loss_forward', 'softmax_focal_loss_backward'\n])\n\n\nclass SigmoidFocalLossFunction(Function):\n\n    @staticmethod\n    def forward(ctx,\n                input: torch.Tensor,\n                target: Union[torch.LongTensor, torch.cuda.LongTensor],\n                gamma: float = 2.0,\n                alpha: float = 0.25,\n                weight: Optional[torch.Tensor] = None,\n                reduction: str = 'mean') -> torch.Tensor:\n\n        assert target.dtype == torch.long\n        assert input.dim() == 2\n        assert target.dim() == 1\n        assert input.size(0) == target.size(0)\n        if weight is None:\n            weight = input.new_empty(0)\n        else:\n            assert weight.dim() == 1\n            assert input.size(1) == weight.size(0)\n        ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}\n        assert reduction in ctx.reduction_dict.keys()\n\n        ctx.gamma = float(gamma)\n        ctx.alpha = float(alpha)\n        ctx.reduction = ctx.reduction_dict[reduction]\n\n        output = input.new_zeros(input.size())\n\n        ext_module.sigmoid_focal_loss_forward(\n            input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha)\n        if ctx.reduction == ctx.reduction_dict['mean']:\n            output = output.sum() / input.size(0)\n        elif ctx.reduction == ctx.reduction_dict['sum']:\n            output = output.sum()\n        ctx.save_for_backward(input, target, weight)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output: torch.Tensor) -> tuple:\n        input, 
target, weight = ctx.saved_tensors\n\n        grad_input = input.new_zeros(input.size())\n\n        ext_module.sigmoid_focal_loss_backward(\n            input,\n            target,\n            weight,\n            grad_input,\n            gamma=ctx.gamma,\n            alpha=ctx.alpha)\n\n        grad_input *= grad_output\n        if ctx.reduction == ctx.reduction_dict['mean']:\n            grad_input /= input.size(0)\n        return grad_input, None, None, None, None, None\n\n\nsigmoid_focal_loss = SigmoidFocalLossFunction.apply\n\n\nclass SigmoidFocalLoss(nn.Module):\n\n    def __init__(self,\n                 gamma: float,\n                 alpha: float,\n                 weight: Optional[torch.Tensor] = None,\n                 reduction: str = 'mean'):\n        super().__init__()\n        self.gamma = gamma\n        self.alpha = alpha\n        self.register_buffer('weight', weight)\n        self.reduction = reduction\n\n    def forward(\n        self,\n        input: torch.Tensor,\n        target: Union[torch.LongTensor, torch.cuda.LongTensor],\n    ) -> torch.Tensor:\n        return sigmoid_focal_loss(input, target, self.gamma, self.alpha,\n                                  self.weight, self.reduction)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(gamma={self.gamma}, '\n        s += f'alpha={self.alpha}, '\n        s += f'reduction={self.reduction})'\n        return s\n\n\nclass SoftmaxFocalLossFunction(Function):\n\n    @staticmethod\n    def forward(ctx,\n                input: torch.Tensor,\n                target: Union[torch.LongTensor, torch.cuda.LongTensor],\n                gamma: float = 2.0,\n                alpha: float = 0.25,\n                weight: Optional[torch.Tensor] = None,\n                reduction='mean') -> torch.Tensor:\n\n        assert target.dtype == torch.long\n        assert input.dim() == 2\n        assert target.dim() == 1\n        assert input.size(0) == target.size(0)\n        if weight is 
None:\n            weight = input.new_empty(0)\n        else:\n            assert weight.dim() == 1\n            assert input.size(1) == weight.size(0)\n        ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}\n        assert reduction in ctx.reduction_dict.keys()\n\n        ctx.gamma = float(gamma)\n        ctx.alpha = float(alpha)\n        ctx.reduction = ctx.reduction_dict[reduction]\n\n        channel_stats, _ = torch.max(input, dim=1)\n        input_softmax = input - channel_stats.unsqueeze(1).expand_as(input)\n        input_softmax.exp_()\n\n        channel_stats = input_softmax.sum(dim=1)\n        input_softmax /= channel_stats.unsqueeze(1).expand_as(input)\n\n        output = input.new_zeros(input.size(0))\n        ext_module.softmax_focal_loss_forward(\n            input_softmax,\n            target,\n            weight,\n            output,\n            gamma=ctx.gamma,\n            alpha=ctx.alpha)\n\n        if ctx.reduction == ctx.reduction_dict['mean']:\n            output = output.sum() / input.size(0)\n        elif ctx.reduction == ctx.reduction_dict['sum']:\n            output = output.sum()\n        ctx.save_for_backward(input_softmax, target, weight)\n        return output\n\n    @staticmethod\n    def backward(ctx, grad_output: torch.Tensor) -> tuple:\n        input_softmax, target, weight = ctx.saved_tensors\n        buff = input_softmax.new_zeros(input_softmax.size(0))\n        grad_input = input_softmax.new_zeros(input_softmax.size())\n\n        ext_module.softmax_focal_loss_backward(\n            input_softmax,\n            target,\n            weight,\n            buff,\n            grad_input,\n            gamma=ctx.gamma,\n            alpha=ctx.alpha)\n\n        grad_input *= grad_output\n        if ctx.reduction == ctx.reduction_dict['mean']:\n            grad_input /= input_softmax.size(0)\n        return grad_input, None, None, None, None, None\n\n\nsoftmax_focal_loss = SoftmaxFocalLossFunction.apply\n\n\nclass 
SoftmaxFocalLoss(nn.Module):\n\n    def __init__(self,\n                 gamma: float,\n                 alpha: float,\n                 weight: Optional[torch.Tensor] = None,\n                 reduction: str = 'mean'):\n        super().__init__()\n        self.gamma = gamma\n        self.alpha = alpha\n        self.register_buffer('weight', weight)\n        self.reduction = reduction\n\n    def forward(\n        self,\n        input: torch.Tensor,\n        target: Union[torch.LongTensor, torch.cuda.LongTensor],\n    ) -> torch.Tensor:\n        return softmax_focal_loss(input, target, self.gamma, self.alpha,\n                                  self.weight, self.reduction)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(gamma={self.gamma}, '\n        s += f'alpha={self.alpha}, '\n        s += f'reduction={self.reduction})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/furthest_point_sample.py",
    "content": "import torch\nfrom mmengine.device import is_cuda_available, is_musa_available\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'furthest_point_sampling_forward',\n    'furthest_point_sampling_with_dist_forward'\n])\n\n\nclass FurthestPointSampling(Function):\n    \"\"\"Uses iterative furthest point sampling to select a set of features whose\n    corresponding points have the furthest distance.\"\"\"\n\n    @staticmethod\n    def forward(ctx, points_xyz: torch.Tensor,\n                num_points: int) -> torch.Tensor:\n        \"\"\"\n        Args:\n            points_xyz (torch.Tensor): (B, N, 3) where N > num_points.\n            num_points (int): Number of points in the sampled set.\n\n        Returns:\n            torch.Tensor: (B, num_points) indices of the sampled points.\n        \"\"\"\n        assert points_xyz.is_contiguous()\n\n        B, N = points_xyz.size()[:2]\n        if points_xyz.device.type == 'npu':\n            output = torch.IntTensor(B, num_points).npu()\n            temp = torch.FloatTensor(B, N).fill_(1e10).npu()\n        elif is_cuda_available():\n            output = torch.cuda.IntTensor(B, num_points)\n            temp = torch.cuda.FloatTensor(B, N).fill_(1e10)\n        elif is_musa_available():\n            output = torch.musa.IntTensor(B, num_points)\n            temp = torch.musa.FloatTensor(B, N).fill_(1e10)\n\n        ext_module.furthest_point_sampling_forward(\n            points_xyz,\n            temp,\n            output,\n            b=B,\n            n=N,\n            m=num_points,\n        )\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(output)\n        return output\n\n    @staticmethod\n    def backward(xyz, a=None):\n        return None, None\n\n\nclass FurthestPointSamplingWithDist(Function):\n    \"\"\"Uses iterative furthest point sampling to select a set of features whose\n    corresponding 
points have the furthest distance.\"\"\"\n\n    @staticmethod\n    def forward(ctx, points_dist: torch.Tensor,\n                num_points: int) -> torch.Tensor:\n        \"\"\"\n        Args:\n            points_dist (torch.Tensor): (B, N, N) Distance between each point\n                pair.\n            num_points (int): Number of points in the sampled set.\n\n        Returns:\n            torch.Tensor: (B, num_points) indices of the sampled points.\n        \"\"\"\n        assert points_dist.is_contiguous()\n\n        B, N, _ = points_dist.size()\n        output = points_dist.new_zeros([B, num_points], dtype=torch.int32)\n        temp = points_dist.new_zeros([B, N]).fill_(1e10)\n\n        ext_module.furthest_point_sampling_with_dist_forward(\n            points_dist, temp, output, b=B, n=N, m=num_points)\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(output)\n        return output\n\n    @staticmethod\n    def backward(xyz, a=None):\n        return None, None\n\n\nfurthest_point_sample = FurthestPointSampling.apply\nfurthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply\n"
  },
  {
    "path": "mmcv/ops/fused_bias_leakyrelu.py",
    "content": "# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py # noqa:E501\n\n# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.\n# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator\n# Augmentation (ADA)\n# =======================================================================\n\n# 1. Definitions\n\n# \"Licensor\" means any person or entity that distributes its Work.\n\n# \"Software\" means the original work of authorship made available under\n# this License.\n\n# \"Work\" means the Software and any additions to or derivative works of\n# the Software that are made available under this License.\n\n# The terms \"reproduce,\" \"reproduction,\" \"derivative works,\" and\n# \"distribution\" have the meaning as provided under U.S. copyright law;\n# provided, however, that for the purposes of this License, derivative\n# works shall not include works that remain separable from, or merely\n# link (or bind by name) to the interfaces of, the Work.\n\n# Works, including the Software, are \"made available\" under this License\n# by including in or with the Work either (a) a copyright notice\n# referencing the applicability of this License to the Work, or (b) a\n# copy of this License.\n\n# 2. License Grants\n\n#     2.1 Copyright Grant. Subject to the terms and conditions of this\n#     License, each Licensor grants to you a perpetual, worldwide,\n#     non-exclusive, royalty-free, copyright license to reproduce,\n#     prepare derivative works of, publicly display, publicly perform,\n#     sublicense and distribute its Work and any resulting derivative\n#     works in any form.\n\n# 3. Limitations\n\n#     3.1 Redistribution. 
You may reproduce or distribute the Work only\n#     if (a) you do so under this License, (b) you include a complete\n#     copy of this License with your distribution, and (c) you retain\n#     without modification any copyright, patent, trademark, or\n#     attribution notices that are present in the Work.\n\n#     3.2 Derivative Works. You may specify that additional or different\n#     terms apply to the use, reproduction, and distribution of your\n#     derivative works of the Work (\"Your Terms\") only if (a) Your Terms\n#     provide that the use limitation in Section 3.3 applies to your\n#     derivative works, and (b) you identify the specific derivative\n#     works that are subject to Your Terms. Notwithstanding Your Terms,\n#     this License (including the redistribution requirements in Section\n#     3.1) will continue to apply to the Work itself.\n\n#     3.3 Use Limitation. The Work and any derivative works thereof only\n#     may be used or intended for use non-commercially. Notwithstanding\n#     the foregoing, NVIDIA and its affiliates may use the Work and any\n#     derivative works commercially. As used herein, \"non-commercially\"\n#     means for research or evaluation purposes only.\n\n#     3.4 Patent Claims. If you bring or threaten to bring a patent claim\n#     against any Licensor (including any claim, cross-claim or\n#     counterclaim in a lawsuit) to enforce any patents that you allege\n#     are infringed by any Work, then your rights under this License from\n#     such Licensor (including the grant in Section 2.1) will terminate\n#     immediately.\n\n#     3.5 Trademarks. This License does not grant any rights to use any\n#     Licensor’s or its affiliates’ names, logos, or trademarks, except\n#     as necessary to reproduce the notices described in this License.\n\n#     3.6 Termination. 
If you violate any term of this License, then your\n#     rights under this License (including the grant in Section 2.1) will\n#     terminate immediately.\n\n# 4. Disclaimer of Warranty.\n\n# THE WORK IS PROVIDED \"AS IS\" WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF\n# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR\n# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER\n# THIS LICENSE.\n\n# 5. Limitation of Liability.\n\n# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL\n# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE\n# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,\n# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF\n# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK\n# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,\n# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER\n# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF\n# THE POSSIBILITY OF SUCH DAMAGES.\n\n# =======================================================================\n\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu'])\n\n\nclass FusedBiasLeakyReLUFunctionBackward(Function):\n    \"\"\"Calculate second order deviation.\n\n    This function is to compute the second order deviation for the fused leaky\n    relu operation.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx, grad_output: torch.Tensor, out: torch.Tensor,\n                negative_slope: float, scale: float) -> tuple:\n        ctx.save_for_backward(out)\n        ctx.negative_slope = negative_slope\n        ctx.scale = scale\n\n        empty = grad_output.new_empty(0)\n\n        grad_input = 
ext_module.fused_bias_leakyrelu(\n            grad_output,\n            empty,\n            out,\n            act=3,\n            grad=1,\n            alpha=negative_slope,\n            scale=scale)\n\n        dim = [0]\n\n        if grad_input.ndim > 2:\n            dim += list(range(2, grad_input.ndim))\n\n        grad_bias = grad_input.sum(dim).detach()\n\n        return grad_input, grad_bias\n\n    @staticmethod\n    def backward(ctx, gradgrad_input: torch.Tensor,\n                 gradgrad_bias: nn.Parameter) -> tuple:\n        out, = ctx.saved_tensors\n\n        # The second order deviation, in fact, contains two parts, while the\n        # the first part is zero. Thus, we direct consider the second part\n        # which is similar with the first order deviation in implementation.\n        gradgrad_out = ext_module.fused_bias_leakyrelu(\n            gradgrad_input,\n            gradgrad_bias.to(out.dtype),\n            out,\n            act=3,\n            grad=1,\n            alpha=ctx.negative_slope,\n            scale=ctx.scale)\n\n        return gradgrad_out, None, None, None\n\n\nclass FusedBiasLeakyReLUFunction(Function):\n\n    @staticmethod\n    def forward(ctx, input: torch.Tensor, bias: nn.Parameter,\n                negative_slope: float, scale: float) -> torch.Tensor:\n        empty = input.new_empty(0)\n\n        out = ext_module.fused_bias_leakyrelu(\n            input,\n            bias,\n            empty,\n            act=3,\n            grad=0,\n            alpha=negative_slope,\n            scale=scale)\n        ctx.save_for_backward(out)\n        ctx.negative_slope = negative_slope\n        ctx.scale = scale\n\n        return out\n\n    @staticmethod\n    def backward(ctx, grad_output: torch.Tensor) -> tuple:\n        out, = ctx.saved_tensors\n\n        grad_input, grad_bias = FusedBiasLeakyReLUFunctionBackward.apply(\n            grad_output, out, ctx.negative_slope, ctx.scale)\n\n        return grad_input, grad_bias, None, 
None\n\n\nclass FusedBiasLeakyReLU(nn.Module):\n    r\"\"\"Fused bias leaky ReLU.\n\n    This function is introduced in the StyleGAN2:\n    `Analyzing and Improving the Image Quality of StyleGAN\n    <http://arxiv.org/abs/1912.04958>`_\n\n    The bias term comes from the convolution operation. In addition, to keep\n    the variance of the feature map or gradients unchanged, they also adopt a\n    scale similarly with Kaiming initialization. However, since the\n    :math:`1+{alpha}^2` is too small, we can just ignore it. Therefore, the\n    final scale is just :math:`\\sqrt{2}`. Of course, you may change it with\n    your own scale.\n\n    TODO: Implement the CPU version.\n\n    Args:\n        num_channels (int): The channel number of the feature map.\n        negative_slope (float, optional): Same as nn.LeakyRelu.\n            Defaults to 0.2.\n        scale (float, optional): A scalar to adjust the variance of the feature\n            map. Defaults to 2**0.5.\n    \"\"\"\n\n    def __init__(self,\n                 num_channels: int,\n                 negative_slope: float = 0.2,\n                 scale: float = 2**0.5):\n        super().__init__()\n\n        self.bias = nn.Parameter(torch.zeros(num_channels))\n        self.negative_slope = negative_slope\n        self.scale = scale\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        return fused_bias_leakyrelu(input, self.bias, self.negative_slope,\n                                    self.scale)\n\n\ndef fused_bias_leakyrelu(input: torch.Tensor,\n                         bias: nn.Parameter,\n                         negative_slope: float = 0.2,\n                         scale: float = 2**0.5) -> torch.Tensor:\n    r\"\"\"Fused bias leaky ReLU function.\n\n    This function is introduced in the StyleGAN2:\n    `Analyzing and Improving the Image Quality of StyleGAN\n    <http://arxiv.org/abs/1912.04958>`_\n\n    The bias term comes from the convolution operation. 
In addition, to keep\n    the variance of the feature map or gradients unchanged, they also adopt a\n    scale similarly with Kaiming initialization. However, since the\n    :math:`1+{alpha}^2` is too small, we can just ignore it. Therefore, the\n    final scale is just :math:`\\sqrt{2}`. Of course, you may change it with\n    your own scale.\n\n    Args:\n        input (torch.Tensor): Input feature map.\n        bias (nn.Parameter): The bias from convolution operation.\n        negative_slope (float, optional): Same as nn.LeakyRelu.\n            Defaults to 0.2.\n        scale (float, optional): A scalar to adjust the variance of the feature\n            map. Defaults to 2**0.5.\n\n    Returns:\n        torch.Tensor: Feature map after non-linear activation.\n    \"\"\"\n\n    if (not input.is_cuda) and (not input.is_musa) and \\\n            input.device.type != 'npu':\n        return bias_leakyrelu_ref(input, bias, negative_slope, scale)\n\n    return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype),\n                                            negative_slope, scale)\n\n\ndef bias_leakyrelu_ref(x: torch.Tensor,\n                       bias: nn.Parameter,\n                       negative_slope: float = 0.2,\n                       scale: float = 2**0.5) -> torch.Tensor:\n\n    if bias is not None:\n        assert bias.ndim == 1\n        assert bias.shape[0] == x.shape[1]\n        x = x + bias.reshape([-1 if i == 1 else 1 for i in range(x.ndim)])\n\n    x = F.leaky_relu(x, negative_slope)\n    if scale != 1:\n        x = x * scale\n\n    return x\n"
  },
  {
    "path": "mmcv/ops/gather_points.py",
    "content": "from typing import Tuple\n\nimport torch\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['gather_points_forward', 'gather_points_backward'])\n\n\nclass GatherPoints(Function):\n    \"\"\"Gather points with given index.\"\"\"\n\n    @staticmethod\n    def forward(ctx, features: torch.Tensor,\n                indices: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (torch.Tensor): (B, C, N) features to gather.\n            indices (torch.Tensor): (B, M) where M is the number of points.\n\n        Returns:\n            torch.Tensor: (B, C, M) where M is the number of points.\n        \"\"\"\n        assert features.is_contiguous()\n        assert indices.is_contiguous()\n\n        B, npoint = indices.size()\n        _, C, N = features.size()\n        output = features.new_zeros((B, C, npoint))\n\n        ext_module.gather_points_forward(\n            features, indices, output, b=B, c=C, n=N, npoints=npoint)\n\n        ctx.for_backwards = (indices, C, N)\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(indices)\n        return output\n\n    @staticmethod\n    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:\n        idx, C, N = ctx.for_backwards\n        B, npoint = idx.size()\n\n        grad_features = grad_out.new_zeros((B, C, N))\n        grad_out_data = grad_out.data.contiguous()\n        ext_module.gather_points_backward(\n            grad_out_data,\n            idx,\n            grad_features.data,\n            b=B,\n            c=C,\n            n=N,\n            npoints=npoint)\n        return grad_features, None\n\n\ngather_points = GatherPoints.apply\n"
  },
  {
    "path": "mmcv/ops/group_points.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional, Tuple, Union\n\nimport torch\nfrom torch import nn as nn\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\nfrom .ball_query import ball_query\nfrom .knn import knn\n\next_module = ext_loader.load_ext('_ext', [\n    'group_points_forward', 'group_points_backward',\n    'stack_group_points_forward', 'stack_group_points_backward'\n])\n\n\nclass QueryAndGroup(nn.Module):\n    \"\"\"Groups points with a ball query of radius.\n\n    Args:\n        max_radius (float): The maximum radius of the balls.\n            If None is given, we will use kNN sampling instead of ball query.\n        sample_num (int): Maximum number of features to gather in the ball.\n        min_radius (float, optional): The minimum radius of the balls.\n            Default: 0.\n        use_xyz (bool, optional): Whether to use xyz.\n            Default: True.\n        return_grouped_xyz (bool, optional): Whether to return grouped xyz.\n            Default: False.\n        normalize_xyz (bool, optional): Whether to normalize xyz.\n            Default: False.\n        uniform_sample (bool, optional): Whether to sample uniformly.\n            Default: False\n        return_unique_cnt (bool, optional): Whether to return the count of\n            unique samples. 
Default: False.\n        return_grouped_idx (bool, optional): Whether to return grouped idx.\n            Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 max_radius: float,\n                 sample_num: int,\n                 min_radius: float = 0.,\n                 use_xyz: bool = True,\n                 return_grouped_xyz: bool = False,\n                 normalize_xyz: bool = False,\n                 uniform_sample: bool = False,\n                 return_unique_cnt: bool = False,\n                 return_grouped_idx: bool = False):\n        super().__init__()\n        self.max_radius = max_radius\n        self.min_radius = min_radius\n        self.sample_num = sample_num\n        self.use_xyz = use_xyz\n        self.return_grouped_xyz = return_grouped_xyz\n        self.normalize_xyz = normalize_xyz\n        self.uniform_sample = uniform_sample\n        self.return_unique_cnt = return_unique_cnt\n        self.return_grouped_idx = return_grouped_idx\n        if self.return_unique_cnt:\n            assert self.uniform_sample, \\\n                'uniform_sample should be True when ' \\\n                'returning the count of unique samples'\n        if self.max_radius is None:\n            assert not self.normalize_xyz, \\\n                'can not normalize grouped xyz when max_radius is None'\n\n    def forward(\n        self,\n        points_xyz: torch.Tensor,\n        center_xyz: torch.Tensor,\n        features: Optional[torch.Tensor] = None,\n    ) -> Union[torch.Tensor, Tuple]:\n        \"\"\"\n        Args:\n            points_xyz (torch.Tensor): (B, N, 3) xyz coordinates of the\n                points.\n            center_xyz (torch.Tensor): (B, npoint, 3) coordinates of the\n                centriods.\n            features (torch.Tensor): (B, C, N) The features of grouped\n                points.\n\n        Returns:\n            Tuple | torch.Tensor: (B, 3 + C, npoint, sample_num) Grouped\n            concatenated coordinates and 
features of points.\n        \"\"\"\n        # if self.max_radius is None, we will perform kNN instead of ball query\n        # idx is of shape [B, npoint, sample_num]\n        if self.max_radius is None:\n            idx = knn(self.sample_num, points_xyz, center_xyz, False)\n            idx = idx.transpose(1, 2).contiguous()\n        else:\n            idx = ball_query(self.min_radius, self.max_radius, self.sample_num,\n                             points_xyz, center_xyz)\n\n        if self.uniform_sample:\n            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))\n            for i_batch in range(idx.shape[0]):\n                for i_region in range(idx.shape[1]):\n                    unique_ind = torch.unique(idx[i_batch, i_region, :])\n                    num_unique = unique_ind.shape[0]\n                    unique_cnt[i_batch, i_region] = num_unique\n                    sample_ind = torch.randint(\n                        0,\n                        num_unique, (self.sample_num - num_unique, ),\n                        dtype=torch.long)\n                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))\n                    idx[i_batch, i_region, :] = all_ind\n\n        xyz_trans = points_xyz.transpose(1, 2).contiguous()\n        # (B, 3, npoint, sample_num)\n        grouped_xyz = grouping_operation(xyz_trans, idx)\n        grouped_xyz_diff = grouped_xyz - \\\n            center_xyz.transpose(1, 2).unsqueeze(-1)  # relative offsets\n        if self.normalize_xyz:\n            grouped_xyz_diff /= self.max_radius\n\n        if features is not None:\n            grouped_features = grouping_operation(features, idx)\n            if self.use_xyz:\n                # (B, C + 3, npoint, sample_num)\n                new_features = torch.cat([grouped_xyz_diff, grouped_features],\n                                         dim=1)\n            else:\n                new_features = grouped_features\n        else:\n            assert (self.use_xyz\n         
           ), 'Cannot have not features and not use xyz as a feature!'\n            new_features = grouped_xyz_diff\n\n        ret = [new_features]\n        if self.return_grouped_xyz:\n            ret.append(grouped_xyz)\n        if self.return_unique_cnt:\n            ret.append(unique_cnt)\n        if self.return_grouped_idx:\n            ret.append(idx)\n        if len(ret) == 1:\n            return ret[0]\n        else:\n            return tuple(ret)\n\n\nclass GroupAll(nn.Module):\n    \"\"\"Group xyz with feature.\n\n    Args:\n        use_xyz (bool): Whether to use xyz.\n    \"\"\"\n\n    def __init__(self, use_xyz: bool = True):\n        super().__init__()\n        self.use_xyz = use_xyz\n\n    def forward(self,\n                xyz: torch.Tensor,\n                new_xyz: torch.Tensor,\n                features: Optional[torch.Tensor] = None) -> torch.Tensor:\n        \"\"\"\n        Args:\n            xyz (Tensor): (B, N, 3) xyz coordinates of the features.\n            new_xyz (Tensor): new xyz coordinates of the features.\n            features (Tensor): (B, C, N) features to group.\n\n        Returns:\n            Tensor: (B, C + 3, 1, N) Grouped feature.\n        \"\"\"\n        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)\n        if features is not None:\n            grouped_features = features.unsqueeze(2)\n            if self.use_xyz:\n                # (B, 3 + C, 1, N)\n                new_features = torch.cat([grouped_xyz, grouped_features],\n                                         dim=1)\n            else:\n                new_features = grouped_features\n        else:\n            new_features = grouped_xyz\n\n        return new_features\n\n\nclass GroupingOperation(Function):\n    \"\"\"Group feature with given index.\"\"\"\n\n    @staticmethod\n    def forward(\n            ctx,\n            features: torch.Tensor,\n            indices: torch.Tensor,\n            features_batch_cnt: Optional[torch.Tensor] = None,\n            
indices_batch_cnt: Optional[torch.Tensor] = None) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (Tensor): Tensor of features to group, input shape is\n                (B, C, N) or stacked inputs (N1 + N2 ..., C).\n            indices (Tensor):  The indices of features to group with, input\n                shape is (B, npoint, nsample) or stacked inputs\n                (M1 + M2 ..., nsample).\n            features_batch_cnt (Tensor, optional): Input features nums in\n                each batch, just like (N1, N2, ...). Defaults to None.\n                New in version 1.7.0.\n            indices_batch_cnt (Tensor, optional): Input indices nums in\n                each batch, just like (M1, M2, ...). Defaults to None.\n                New in version 1.7.0.\n\n        Returns:\n            Tensor: Grouped features, the shape is (B, C, npoint, nsample)\n            or (M1 + M2 ..., C, nsample).\n        \"\"\"\n        features = features.contiguous()\n        indices = indices.contiguous()\n        if features_batch_cnt is not None and indices_batch_cnt is not None:\n            assert features_batch_cnt.dtype == torch.int\n            assert indices_batch_cnt.dtype == torch.int\n            M, nsample = indices.size()\n            N, C = features.size()\n            B = indices_batch_cnt.shape[0]\n            output = features.new_zeros((M, C, nsample))\n            ext_module.stack_group_points_forward(\n                features,\n                features_batch_cnt,\n                indices,\n                indices_batch_cnt,\n                output,\n                b=B,\n                m=M,\n                c=C,\n                nsample=nsample)\n            ctx.for_backwards = (B, N, indices, features_batch_cnt,\n                                 indices_batch_cnt)\n        else:\n            B, nfeatures, nsample = indices.size()\n            _, C, N = features.size()\n            output = features.new_zeros(B, C, nfeatures, 
nsample)\n\n            ext_module.group_points_forward(\n                features,\n                indices,\n                output,\n                b=B,\n                c=C,\n                n=N,\n                npoints=nfeatures,\n                nsample=nsample)\n\n            ctx.for_backwards = (indices, N)\n        return output\n\n    @staticmethod\n    def backward(ctx, grad_out: torch.Tensor) -> Tuple:\n        \"\"\"\n        Args:\n            grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients\n                of the output from forward.\n\n        Returns:\n            Tensor: (B, C, N) gradient of the features.\n        \"\"\"\n        if len(ctx.for_backwards) != 5:\n            idx, N = ctx.for_backwards\n\n            B, C, npoint, nsample = grad_out.size()\n            grad_features = grad_out.new_zeros(B, C, N)\n\n            grad_out_data = grad_out.data.contiguous()\n            ext_module.group_points_backward(\n                grad_out_data,\n                idx,\n                grad_features.data,\n                b=B,\n                c=C,\n                n=N,\n                npoints=npoint,\n                nsample=nsample)\n            return grad_features, None\n        else:\n            B, N, idx, features_batch_cnt, idx_batch_cnt = ctx.for_backwards\n\n            M, C, nsample = grad_out.size()\n            grad_features = grad_out.new_zeros(N, C)\n\n            grad_out_data = grad_out.data.contiguous()\n            ext_module.stack_group_points_backward(\n                grad_out_data,\n                idx,\n                idx_batch_cnt,\n                features_batch_cnt,\n                grad_features.data,\n                b=B,\n                c=C,\n                m=M,\n                n=N,\n                nsample=nsample)\n            return grad_features, None, None, None\n\n\ngrouping_operation = GroupingOperation.apply\n"
  },
  {
    "path": "mmcv/ops/info.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nif torch.__version__ == 'parrots':\n    import parrots\n\n    def get_compiler_version():\n        return 'GCC ' + parrots.version.compiler\n\n    def get_compiling_cuda_version():\n        return parrots.version.cuda\nelse:\n    from ..utils import ext_loader\n    ext_module = ext_loader.load_ext(\n        '_ext', ['get_compiler_version', 'get_compiling_cuda_version'])\n\n    def get_compiler_version():\n        return ext_module.get_compiler_version()\n\n    def get_compiling_cuda_version():\n        return ext_module.get_compiling_cuda_version()\n"
  },
  {
    "path": "mmcv/ops/iou3d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nfrom typing import Optional\n\nimport torch\nfrom torch import Tensor\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'iou3d_boxes_overlap_bev_forward', 'iou3d_nms3d_forward',\n    'iou3d_nms3d_normal_forward'\n])\n\n\ndef boxes_overlap_bev(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:\n    \"\"\"Calculate boxes BEV overlap.\n\n    Args:\n        boxes_a (torch.Tensor): Input boxes a with shape (M, 7).\n        boxes_b (torch.Tensor): Input boxes b with shape (N, 7).\n\n    Returns:\n        torch.Tensor: BEV overlap result with shape (M, N).\n    \"\"\"\n    ans_overlap = boxes_a.new_zeros(\n        torch.Size((boxes_a.shape[0], boxes_b.shape[0])))\n    ext_module.iou3d_boxes_overlap_bev_forward(boxes_a.contiguous(),\n                                               boxes_b.contiguous(),\n                                               ans_overlap)\n\n    return ans_overlap\n\n\ndef boxes_iou3d(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:\n    \"\"\"Calculate boxes 3D IoU.\n\n    Args:\n        boxes_a (torch.Tensor): Input boxes a with shape (M, 7).\n        boxes_b (torch.Tensor): Input boxes b with shape (N, 7).\n\n    Returns:\n        torch.Tensor: 3D IoU result with shape (M, N).\n    \"\"\"\n    assert boxes_a.shape[1] == boxes_b.shape[1] == 7, \\\n        'Input boxes shape should be (N, 7)'\n\n    boxes_a_height_max = (boxes_a[:, 2] + boxes_a[:, 5] / 2).view(-1, 1)\n    boxes_a_height_min = (boxes_a[:, 2] - boxes_a[:, 5] / 2).view(-1, 1)\n    boxes_b_height_max = (boxes_b[:, 2] + boxes_b[:, 5] / 2).view(1, -1)\n    boxes_b_height_min = (boxes_b[:, 2] - boxes_b[:, 5] / 2).view(1, -1)\n\n    overlaps_bev = boxes_a.new_zeros(\n        torch.Size((boxes_a.shape[0], boxes_b.shape[0])))\n    ext_module.iou3d_boxes_overlap_bev_forward(boxes_a.contiguous(),\n                                               boxes_b.contiguous(),\n                        
                       overlaps_bev)\n\n    max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min)\n    min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max)\n    overlaps_h = torch.clamp(min_of_max - max_of_min, min=0)\n    overlaps_3d = overlaps_bev * overlaps_h\n    vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)\n    vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)\n    iou3d = overlaps_3d / torch.clamp(vol_a + vol_b - overlaps_3d, min=1e-6)\n    return iou3d\n\n\ndef nms3d(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:\n    \"\"\"3D NMS function GPU implementation (for BEV boxes).\n\n    Args:\n        boxes (torch.Tensor): Input boxes with the shape of (N, 7)\n            ([x, y, z, dx, dy, dz, heading]).\n        scores (torch.Tensor): Scores of boxes with the shape of (N).\n        iou_threshold (float): Overlap threshold of NMS.\n\n    Returns:\n        torch.Tensor: Indexes after NMS.\n    \"\"\"\n    assert boxes.size(1) == 7, 'Input boxes shape should be (N, 7)'\n    order = scores.sort(0, descending=True)[1]\n    boxes = boxes[order].contiguous()\n\n    keep = boxes.new_zeros(boxes.size(0), dtype=torch.long)\n    num_out = boxes.new_zeros(size=(), dtype=torch.long)\n    ext_module.iou3d_nms3d_forward(\n        boxes, keep, num_out, nms_overlap_thresh=iou_threshold)\n    keep = order[keep[:num_out].to(boxes.device)].contiguous()\n    return keep\n\n\ndef nms3d_normal(boxes: Tensor, scores: Tensor,\n                 iou_threshold: float) -> Tensor:\n    \"\"\"Normal 3D NMS function GPU implementation. 
The overlap of two boxes for\n    IoU calculation is defined as the exact overlapping area of the two boxes\n    WITH their yaw angle set to 0.\n\n    Args:\n        boxes (torch.Tensor): Input boxes with shape (N, 7).\n            ([x, y, z, dx, dy, dz, heading]).\n        scores (torch.Tensor): Scores of predicted boxes with shape (N).\n        iou_threshold (float): Overlap threshold of NMS.\n\n    Returns:\n        torch.Tensor: Remaining indices with scores in descending order.\n    \"\"\"\n    assert boxes.shape[1] == 7, 'Input boxes shape should be (N, 7)'\n    order = scores.sort(0, descending=True)[1]\n    boxes = boxes[order].contiguous()\n\n    keep = boxes.new_zeros(boxes.size(0), dtype=torch.long)\n    num_out = boxes.new_zeros(size=(), dtype=torch.long)\n    ext_module.iou3d_nms3d_normal_forward(\n        boxes, keep, num_out, nms_overlap_thresh=iou_threshold)\n    return order[keep[:num_out].to(boxes.device)].contiguous()\n\n\ndef _xyxyr2xywhr(boxes: Tensor) -> Tensor:\n    \"\"\"Convert [x1, y1, x2, y2, heading] box to [x, y, dx, dy, heading] box.\n\n    Args:\n        box (torch.Tensor): Input boxes with shape (N, 5).\n\n    Returns:\n        torch.Tensor: Converted boxes with shape (N, 7).\n    \"\"\"\n    warnings.warn(\n        'This function is deprecated and will be removed in the future.',\n        DeprecationWarning)\n    return torch.stack(\n        ((boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2,\n         boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1], boxes[:, 4]),\n        dim=-1)\n\n\ndef boxes_iou_bev(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:\n    \"\"\"Calculate boxes IoU in the Bird's Eye View.\n\n    Args:\n        boxes_a (torch.Tensor): Input boxes a with shape (M, 5)\n            ([x1, y1, x2, y2, ry]).\n        boxes_b (torch.Tensor): Input boxes b with shape (N, 5)\n            ([x1, y1, x2, y2, ry]).\n\n    Returns:\n        torch.Tensor: IoU result with shape (M, N).\n    \"\"\"\n    from 
.box_iou_rotated import box_iou_rotated\n\n    warnings.warn(\n        '`iou3d.boxes_iou_bev` is deprecated and will be removed in'\n        ' the future. Please, use `box_iou_rotated.box_iou_rotated`.',\n        DeprecationWarning)\n\n    return box_iou_rotated(_xyxyr2xywhr(boxes_a), _xyxyr2xywhr(boxes_b))\n\n\ndef nms_bev(boxes: Tensor,\n            scores: Tensor,\n            thresh: float,\n            pre_max_size: Optional[int] = None,\n            post_max_size: Optional[int] = None) -> Tensor:\n    \"\"\"NMS function GPU implementation (for BEV boxes).\n\n    The overlap of two boxes for IoU calculation is defined as the exact\n    overlapping area of the two boxes. In this function, one can also\n    set ``pre_max_size`` and ``post_max_size``.\n\n    Args:\n        boxes (torch.Tensor): Input boxes with the shape of (N, 5)\n            ([x1, y1, x2, y2, ry]).\n        scores (torch.Tensor): Scores of boxes with the shape of (N,).\n        thresh (float): Overlap threshold of NMS.\n        pre_max_size (int, optional): Max size of boxes before NMS.\n            Default: None.\n        post_max_size (int, optional): Max size of boxes after NMS.\n            Default: None.\n\n    Returns:\n        torch.Tensor: Indexes after NMS.\n    \"\"\"\n    from .nms import nms_rotated\n\n    warnings.warn(\n        '`iou3d.nms_bev` is deprecated and will be removed in'\n        ' the future. 
Please, use `nms.nms_rotated`.', DeprecationWarning)\n    assert boxes.size(1) == 5, 'Input boxes shape should be (N, 5)'\n    order = scores.sort(0, descending=True)[1]\n\n    if pre_max_size is not None:\n        order = order[:pre_max_size]\n    boxes = _xyxyr2xywhr(boxes)[order]\n    scores = scores[order]\n\n    keep = nms_rotated(boxes, scores, thresh)[1]\n    keep = order[keep]\n\n    if post_max_size is not None:\n        keep = keep[:post_max_size]\n    return keep\n\n\ndef nms_normal_bev(boxes: Tensor, scores: Tensor, thresh: float) -> Tensor:\n    \"\"\"Normal NMS function GPU implementation (for BEV boxes).\n\n    The overlap of two boxes for IoU calculation is defined as the exact\n    overlapping area of the two boxes WITH their yaw angle set to 0.\n\n    Args:\n        boxes (torch.Tensor): Input boxes with shape (N, 5)\n            ([x1, y1, x2, y2, ry]).\n        scores (torch.Tensor): Scores of predicted boxes with shape (N,).\n        thresh (float): Overlap threshold of NMS.\n\n    Returns:\n        torch.Tensor: Remaining indices with scores in descending order.\n    \"\"\"\n    from .nms import nms\n\n    warnings.warn(\n        '`iou3d.nms_normal_bev` is deprecated and will be removed in'\n        ' the future. Please, use `nms.nms`.', DeprecationWarning)\n    assert boxes.shape[1] == 5, 'Input boxes shape should be (N, 5)'\n\n    return nms(boxes[:, :-1], scores, thresh)[1]\n"
  },
  {
    "path": "mmcv/ops/knn.py",
    "content": "from typing import Optional\n\nimport torch\nfrom mmengine.device import is_cuda_available, is_musa_available\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['knn_forward'])\n\n\nclass KNN(Function):\n    r\"\"\"KNN (CUDA/MUSA) based on heap data structure.\n\n    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/\n    scene_seg/lib/pointops/src/knnquery_heap>`_.\n\n    Find k-nearest points.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx,\n                k: int,\n                xyz: torch.Tensor,\n                center_xyz: Optional[torch.Tensor] = None,\n                transposed: bool = False) -> torch.Tensor:\n        \"\"\"\n        Args:\n            k (int): number of nearest neighbors.\n            xyz (torch.Tensor): (B, N, 3) if transposed == False, else\n                (B, 3, N). xyz coordinates of the features.\n            center_xyz (torch.Tensor, optional): (B, npoint, 3) if transposed\n                is False, else (B, 3, npoint). centers of the knn query.\n                Default: None.\n            transposed (bool, optional): whether the input tensors are\n                transposed. 
Should not explicitly use this keyword when\n                calling knn (=KNN.apply), just add the fourth param.\n                Default: False.\n\n        Returns:\n            torch.Tensor: (B, k, npoint) tensor with the indices of the\n            features that form k-nearest neighbours.\n        \"\"\"\n        assert (k > 0) & (k < 100), 'k should be in range(0, 100)'\n\n        if center_xyz is None:\n            center_xyz = xyz\n\n        if transposed:\n            xyz = xyz.transpose(2, 1).contiguous()\n            center_xyz = center_xyz.transpose(2, 1).contiguous()\n\n        assert xyz.is_contiguous()  # [B, N, 3]\n        assert center_xyz.is_contiguous()  # [B, npoint, 3]\n\n        center_xyz_device = center_xyz.get_device()\n        assert center_xyz_device == xyz.get_device(), \\\n            'center_xyz and xyz should be put on the same device'\n        if xyz.device.type != 'npu' and is_cuda_available():\n            if torch.cuda.current_device() != center_xyz_device:\n                torch.cuda.set_device(center_xyz_device)\n        if xyz.device.type != 'npu' and is_musa_available():\n            if torch.musa.current_device() != center_xyz_device:\n                torch.musa.set_device(center_xyz_device)\n\n        B, npoint, _ = center_xyz.shape\n        N = xyz.shape[1]\n\n        if xyz.device.type == 'npu':\n            dist2 = center_xyz.new_zeros((B, npoint, k)).float()\n            idx = center_xyz.new_zeros((B, npoint, k)).int()\n            ext_module.knn_forward(\n                xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)\n            zeros_idx = torch.zeros(\n                xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu()\n            idx.where(dist2 >= 1e10, zeros_idx)\n            idx = idx.transpose(2, 1).contiguous()  # [B, k, npoint]\n            return idx.int()\n\n        idx = center_xyz.new_zeros((B, npoint, k)).int()\n        dist2 = center_xyz.new_zeros((B, npoint, k)).float()\n\n       
 ext_module.knn_forward(\n            xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)\n        # idx shape to [B, k, npoint]\n        idx = idx.transpose(2, 1).contiguous()\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(idx)\n        return idx\n\n    @staticmethod\n    def backward(ctx, a=None):\n        return None, None, None\n\n\nknn = KNN.apply\n"
  },
  {
    "path": "mmcv/ops/masked_conv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\nfrom typing import Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['masked_im2col_forward', 'masked_col2im_forward'])\n\n\nclass MaskedConv2dFunction(Function):\n\n    @staticmethod\n    def symbolic(g, features, mask, weight, bias, padding, stride=1):\n        return g.op(\n            'mmcv::MMCVMaskedConv2d',\n            features,\n            mask,\n            weight,\n            bias,\n            padding_i=padding,\n            stride_i=stride)\n\n    @staticmethod\n    def forward(ctx,\n                features: torch.Tensor,\n                mask: torch.Tensor,\n                weight: torch.nn.Parameter,\n                bias: torch.nn.Parameter,\n                padding: int = 0,\n                stride: int = 1) -> torch.Tensor:\n        assert mask.dim() == 3 and mask.size(0) == 1\n        assert features.dim() == 4 and features.size(0) == 1\n        assert features.size()[2:] == mask.size()[1:]\n        pad_h, pad_w = _pair(padding)\n        stride_h, stride_w = _pair(stride)\n        if stride_h != 1 or stride_w != 1:\n            raise ValueError(\n                'Stride could not only be 1 in masked_conv2d currently.')\n        out_channel, in_channel, kernel_h, kernel_w = weight.size()\n\n        if features.device.type == 'npu':\n            import torch_npu\n            output = torch_npu.npu_conv2d(\n                features,\n                weight,\n                bias,\n                stride=(stride_h, stride_w),\n                padding=(pad_h, pad_w),\n                dilation=(1, 1),\n                groups=1)\n            if mask.size()[1:] != output.size()[2:]:\n                raise ValueError(\n                    'The 
mask is inconsistent with the shape of output_conv.')\n            mask = mask > 0\n            mask = mask.type(output.dtype)\n            output = output * mask\n            return output\n\n        batch_size = features.size(0)\n        out_h = int(\n            math.floor(\n                torch.true_divide((features.size(2) + 2 * pad_h -\n                                   (kernel_h - 1) - 1), stride_h) + 1))\n        out_w = int(\n            math.floor(\n                torch.true_divide((features.size(3) + 2 * pad_w -\n                                   (kernel_w - 1) - 1), stride_w) + 1))\n        mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False)\n        output = features.new_zeros(batch_size, out_channel, out_h, out_w)\n        if mask_inds.numel() > 0:\n            mask_h_idx = mask_inds[:, 0].contiguous()\n            mask_w_idx = mask_inds[:, 1].contiguous()\n            data_col = features.new_zeros(in_channel * kernel_h * kernel_w,\n                                          mask_inds.size(0))\n            ext_module.masked_im2col_forward(\n                features,\n                mask_h_idx,\n                mask_w_idx,\n                data_col,\n                kernel_h=kernel_h,\n                kernel_w=kernel_w,\n                pad_h=pad_h,\n                pad_w=pad_w)\n            masked_output = torch.addmm(1, bias[:, None], 1,\n                                        weight.view(out_channel, -1), data_col)\n            ext_module.masked_col2im_forward(\n                masked_output,\n                mask_h_idx,\n                mask_w_idx,\n                output,\n                height=out_h,\n                width=out_w,\n                channels=out_channel)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output: torch.Tensor) -> tuple:\n        return (None, ) * 5\n\n\nmasked_conv2d = MaskedConv2dFunction.apply\n\n\nclass MaskedConv2d(nn.Conv2d):\n    \"\"\"A MaskedConv2d which 
inherits the official Conv2d.\n\n    The masked forward doesn't implement the backward function and only\n    supports the stride parameter to be 1 currently.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int, ...]],\n                 stride: int = 1,\n                 padding: int = 0,\n                 dilation: int = 1,\n                 groups: int = 1,\n                 bias: bool = True):\n        super().__init__(in_channels, out_channels, kernel_size, stride,\n                         padding, dilation, groups, bias)\n\n    def forward(self,\n                input: torch.Tensor,\n                mask: Optional[torch.Tensor] = None) -> torch.Tensor:\n        if mask is None:  # fallback to the normal Conv2d\n            return super().forward(input)\n        else:\n            return masked_conv2d(input, mask, self.weight, self.bias,\n                                 self.padding)\n"
  },
  {
    "path": "mmcv/ops/merge_cells.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\nfrom abc import abstractmethod\nfrom typing import Optional\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ..cnn import ConvModule\n\n\nclass BaseMergeCell(nn.Module):\n    \"\"\"The basic class for cells used in NAS-FPN and NAS-FCOS.\n\n    BaseMergeCell takes 2 inputs. After applying convolution\n    on them, they are resized to the target size. Then,\n    they go through binary_op, which depends on the type of cell.\n    If with_out_conv is True, the result of output will go through\n    another convolution layer.\n\n    Args:\n        fused_channels (int): number of input channels in out_conv layer.\n        out_channels (int): number of output channels in out_conv layer.\n        with_out_conv (bool): Whether to use out_conv layer\n        out_conv_cfg (dict): Config dict for convolution layer, which should\n            contain \"groups\", \"kernel_size\", \"padding\", \"bias\" to build\n            out_conv layer.\n        out_norm_cfg (dict): Config dict for normalization layer in out_conv.\n        out_conv_order (tuple): The order of conv/norm/activation layers in\n            out_conv.\n        with_input1_conv (bool): Whether to use convolution on input1.\n        with_input2_conv (bool): Whether to use convolution on input2.\n        input_conv_cfg (dict): Config dict for building input1_conv layer and\n            input2_conv layer, which is expected to contain the type of\n            convolution.\n            Default: None, which means using conv2d.\n        input_norm_cfg (dict): Config dict for normalization layer in\n            input1_conv and input2_conv layer. Default: None.\n        upsample_mode (str): Interpolation method used to resize the output\n            of input1_conv and input2_conv to target size. Currently, we\n            support ['nearest', 'bilinear']. 
Default: 'nearest'.\n    \"\"\"\n\n    def __init__(self,\n                 fused_channels: Optional[int] = 256,\n                 out_channels: Optional[int] = 256,\n                 with_out_conv: bool = True,\n                 out_conv_cfg: dict = dict(\n                     groups=1, kernel_size=3, padding=1, bias=True),\n                 out_norm_cfg: Optional[dict] = None,\n                 out_conv_order: tuple = ('act', 'conv', 'norm'),\n                 with_input1_conv: bool = False,\n                 with_input2_conv: bool = False,\n                 input_conv_cfg: Optional[dict] = None,\n                 input_norm_cfg: Optional[dict] = None,\n                 upsample_mode: str = 'nearest'):\n        super().__init__()\n        assert upsample_mode in ['nearest', 'bilinear']\n        self.with_out_conv = with_out_conv\n        self.with_input1_conv = with_input1_conv\n        self.with_input2_conv = with_input2_conv\n        self.upsample_mode = upsample_mode\n\n        if self.with_out_conv:\n            self.out_conv = ConvModule(\n                fused_channels,  # type: ignore\n                out_channels,  # type: ignore\n                **out_conv_cfg,\n                norm_cfg=out_norm_cfg,\n                order=out_conv_order)\n\n        self.input1_conv = self._build_input_conv(\n            out_channels, input_conv_cfg,\n            input_norm_cfg) if with_input1_conv else nn.Sequential()\n        self.input2_conv = self._build_input_conv(\n            out_channels, input_conv_cfg,\n            input_norm_cfg) if with_input2_conv else nn.Sequential()\n\n    def _build_input_conv(self, channel, conv_cfg, norm_cfg):\n        return ConvModule(\n            channel,\n            channel,\n            3,\n            padding=1,\n            conv_cfg=conv_cfg,\n            norm_cfg=norm_cfg,\n            bias=True)\n\n    @abstractmethod\n    def _binary_op(self, x1, x2):\n        pass\n\n    def _resize(self, x, size):\n        if x.shape[-2:] 
== size:\n            return x\n        elif x.shape[-2:] < size:\n            return F.interpolate(x, size=size, mode=self.upsample_mode)\n        else:\n            if x.shape[-2] % size[-2] != 0 or x.shape[-1] % size[-1] != 0:\n                h, w = x.shape[-2:]\n                target_h, target_w = size\n                pad_h = math.ceil(h / target_h) * target_h - h\n                pad_w = math.ceil(w / target_w) * target_w - w\n                pad_l = pad_w // 2\n                pad_r = pad_w - pad_l\n                pad_t = pad_h // 2\n                pad_b = pad_h - pad_t\n                pad = (pad_l, pad_r, pad_t, pad_b)\n                x = F.pad(x, pad, mode='constant', value=0.0)\n            kernel_size = (x.shape[-2] // size[-2], x.shape[-1] // size[-1])\n            x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size)\n            return x\n\n    def forward(self,\n                x1: torch.Tensor,\n                x2: torch.Tensor,\n                out_size: Optional[tuple] = None) -> torch.Tensor:\n        assert x1.shape[:2] == x2.shape[:2]\n        assert out_size is None or len(out_size) == 2\n        if out_size is None:  # resize to larger one\n            out_size = max(x1.size()[2:], x2.size()[2:])\n\n        x1 = self.input1_conv(x1)\n        x2 = self.input2_conv(x2)\n\n        x1 = self._resize(x1, out_size)\n        x2 = self._resize(x2, out_size)\n\n        x = self._binary_op(x1, x2)\n        if self.with_out_conv:\n            x = self.out_conv(x)\n        return x\n\n\nclass SumCell(BaseMergeCell):\n\n    def __init__(self, in_channels: int, out_channels: int, **kwargs):\n        super().__init__(in_channels, out_channels, **kwargs)\n\n    def _binary_op(self, x1, x2):\n        return x1 + x2\n\n\nclass ConcatCell(BaseMergeCell):\n\n    def __init__(self, in_channels: int, out_channels: int, **kwargs):\n        super().__init__(in_channels * 2, out_channels, **kwargs)\n\n    def _binary_op(self, x1, x2):\n        ret = 
torch.cat([x1, x2], dim=1)\n        return ret\n\n\nclass GlobalPoolingCell(BaseMergeCell):\n\n    def __init__(self,\n                 in_channels: Optional[int] = None,\n                 out_channels: Optional[int] = None,\n                 **kwargs):\n        super().__init__(in_channels, out_channels, **kwargs)\n        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))\n\n    def _binary_op(self, x1, x2):\n        x2_att = self.global_pool(x2).sigmoid()\n        return x2 + x2_att * x1\n"
  },
  {
    "path": "mmcv/ops/min_area_polygons.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['min_area_polygons'])\n\n\ndef min_area_polygons(pointsets: torch.Tensor) -> torch.Tensor:\n    \"\"\"Find the smallest polygons that surrounds all points in the point sets.\n\n    Args:\n        pointsets (Tensor): point sets with shape  (N, 18).\n\n    Returns:\n        torch.Tensor: Return the smallest polygons with shape (N, 8).\n    \"\"\"\n    polygons = pointsets.new_zeros((pointsets.size(0), 8))\n    ext_module.min_area_polygons(pointsets, polygons)\n    return polygons\n"
  },
  {
    "path": "mmcv/ops/modulated_deform_conv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\nfrom typing import Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.logging import print_log\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import deprecated_api_warning\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair, _single\n\nfrom mmcv.utils import IS_MLU_AVAILABLE\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext',\n    ['modulated_deform_conv_forward', 'modulated_deform_conv_backward'])\n\n\nclass ModulatedDeformConv2dFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, offset, mask, weight, bias, stride, padding,\n                 dilation, groups, deform_groups):\n        input_tensors = [input, offset, mask, weight]\n        if bias is not None:\n            input_tensors.append(bias)\n        return g.op(\n            'mmcv::MMCVModulatedDeformConv2d',\n            *input_tensors,\n            stride_i=stride,\n            padding_i=padding,\n            dilation_i=dilation,\n            groups_i=groups,\n            deform_groups_i=deform_groups)\n\n    @staticmethod\n    def _calculate_sort_index(kernel_h, kernel_w, deformable_group):\n        split_num = deformable_group * 2 * kernel_h * kernel_w\n        sort_index = list(range(split_num))\n        sort_index_fp = (sort_index[1::2] + sort_index[::2])\n        sort_index_bp_dict = {i: idx for idx, i in enumerate(sort_index_fp)}\n        sort_index_bp = [sort_index_bp_dict[i] for i in sort_index]\n        sort_index_fp = torch.IntTensor(sort_index_fp)\n        sort_index_bp = torch.IntTensor(sort_index_bp)\n        sort_index_fp = sort_index_fp.npu()\n        sort_index_bp = sort_index_bp.npu()\n        return sort_index_fp, sort_index_bp\n\n    @staticmethod\n    def _npu_forward(ctx, input_tensor, offset, mask, weight, bias):\n        _, _, kernel_h, 
kernel_w = weight.shape\n        conv2d_bias = bias if len(bias) > 0 else None\n        sort_index_fp, sort_index_bp = \\\n            ModulatedDeformConv2dFunction._calculate_sort_index(\n                kernel_h, kernel_w, ctx.deform_groups)\n        select_offset = offset.index_select(1, sort_index_fp)\n        offset_all = torch.cat([select_offset, mask], dim=1)\n        import torch_npu\n        output, offset_out = torch_npu.npu_deformable_conv2d(\n            input_tensor,\n            weight,\n            offset_all,\n            conv2d_bias,\n            kernel_size=[kernel_h, kernel_w],\n            stride=[1, 1, ctx.stride[0], ctx.stride[1]],\n            padding=[\n                ctx.padding[0], ctx.padding[0], ctx.padding[1], ctx.padding[1]\n            ],\n            dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],\n            groups=ctx.groups,\n            deformable_groups=ctx.deform_groups,\n            modulated=True)\n        if weight.requires_grad or mask.requires_grad or offset.requires_grad \\\n                or input_tensor.requires_grad:\n            ctx.save_for_backward(input_tensor, weight, offset_out, offset_all,\n                                  sort_index_bp)\n        return output\n\n    @staticmethod\n    def _npu_backward(ctx, grad_output):\n        input_tensor, weight, offset_out, offset_all, sort_index_bp = \\\n            ctx.saved_tensors\n        import torch_npu\n        grad_input, grad_weight, grad_offset_all, grad_bias = \\\n            torch_npu.npu_deformable_conv2dbk(\n                input_tensor, grad_output, offset_out, weight, offset_all,\n                kernel_size=[weight.shape[2], weight.shape[3]],\n                stride=[1, 1, ctx.stride[0], ctx.stride[1]],\n                padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1],\n                         ctx.padding[1]],\n                dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],\n                groups=ctx.groups, 
deformable_groups=ctx.deform_groups,\n                modulated=True)\n        grad_offset = grad_offset_all.index_select(1, sort_index_bp)\n        grad_mask = grad_offset_all[:, grad_offset.shape[1]:, :, :]\n        if not ctx.with_bias:\n            grad_bias = None\n        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\n                None, None, None, None, None, None, None, None)\n\n    @staticmethod\n    def forward(ctx,\n                input: torch.Tensor,\n                offset: torch.Tensor,\n                mask: torch.Tensor,\n                weight: nn.Parameter,\n                bias: Optional[nn.Parameter] = None,\n                stride: int = 1,\n                padding: int = 0,\n                dilation: int = 1,\n                groups: int = 1,\n                deform_groups: int = 1) -> torch.Tensor:\n        if input is not None and input.dim() != 4:\n            raise ValueError(\n                f'Expected 4D tensor as input, got {input.dim()}D tensor \\\n                  instead.')\n        ctx.stride = _pair(stride)\n        ctx.padding = _pair(padding)\n        ctx.dilation = _pair(dilation)\n        ctx.groups = groups\n        ctx.deform_groups = deform_groups\n        ctx.with_bias = bias is not None\n        ctx.device = input.device.type\n        if not ctx.with_bias:\n            bias = input.new_empty(0)  # fake tensor\n        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;\n        # amp won't cast the type of model (float32), but \"offset\" is cast\n        # to float16 by nn.Conv2d automatically, leading to the type\n        # mismatch with input (when it is float32) or weight.\n        # The flag for whether to use fp16 or amp is the type of \"offset\",\n        # we cast weight and input to temporarily support fp16 and amp\n        # whatever the pytorch version is.\n        input = input.type_as(offset)\n        weight = weight.type_as(input)\n        bias = bias.type_as(input)  # 
type: ignore\n        mask = mask.type_as(input)\n        if ctx.device == 'npu':\n            output = ModulatedDeformConv2dFunction._npu_forward(\n                ctx, input, offset, mask, weight, bias)\n            return output\n        ctx.save_for_backward(input, offset, mask, weight, bias)\n        output = input.new_empty([\n            int(i) for i in ModulatedDeformConv2dFunction._output_size(\n                ctx, input, weight)\n        ])\n        ctx._bufs = [input.new_empty(0), input.new_empty(0)]\n        ext_module.modulated_deform_conv_forward(\n            input,\n            weight,\n            bias,\n            ctx._bufs[0],\n            offset,\n            mask,\n            output,\n            ctx._bufs[1],\n            kernel_h=weight.size(2),\n            kernel_w=weight.size(3),\n            stride_h=ctx.stride[0],\n            stride_w=ctx.stride[1],\n            pad_h=ctx.padding[0],\n            pad_w=ctx.padding[1],\n            dilation_h=ctx.dilation[0],\n            dilation_w=ctx.dilation[1],\n            group=ctx.groups,\n            deformable_group=ctx.deform_groups,\n            with_bias=ctx.with_bias)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output: torch.Tensor) -> tuple:\n        if ctx.device == 'npu':\n            return ModulatedDeformConv2dFunction._npu_backward(\n                ctx, grad_output)\n        input, offset, mask, weight, bias = ctx.saved_tensors\n        grad_input = torch.zeros_like(input)\n        grad_offset = torch.zeros_like(offset)\n        grad_mask = torch.zeros_like(mask)\n        grad_weight = torch.zeros_like(weight)\n        grad_bias = torch.zeros_like(bias)\n        grad_output = grad_output.contiguous()\n        ext_module.modulated_deform_conv_backward(\n            input,\n            weight,\n            bias,\n            ctx._bufs[0],\n            offset,\n            mask,\n            ctx._bufs[1],\n            
grad_input,\n            grad_weight,\n            grad_bias,\n            grad_offset,\n            grad_mask,\n            grad_output,\n            kernel_h=weight.size(2),\n            kernel_w=weight.size(3),\n            stride_h=ctx.stride[0],\n            stride_w=ctx.stride[1],\n            pad_h=ctx.padding[0],\n            pad_w=ctx.padding[1],\n            dilation_h=ctx.dilation[0],\n            dilation_w=ctx.dilation[1],\n            group=ctx.groups,\n            deformable_group=ctx.deform_groups,\n            with_bias=ctx.with_bias)\n        if not ctx.with_bias:\n            grad_bias = None\n\n        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\n                None, None, None, None, None)\n\n    @staticmethod\n    def _output_size(ctx, input, weight):\n        channels = weight.size(0)\n        output_size = (input.size(0), channels)\n        for d in range(input.dim() - 2):\n            in_size = input.size(d + 2)\n            pad = ctx.padding[d]\n            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1\n            stride_ = ctx.stride[d]\n            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )\n        if not all(map(lambda s: s > 0, output_size)):\n            raise ValueError(\n                'convolution input is too small (output would be ' +\n                'x'.join(map(str, output_size)) + ')')\n        return output_size\n\n\nmodulated_deform_conv2d = ModulatedDeformConv2dFunction.apply\n\n\nclass ModulatedDeformConv2d(nn.Module):\n\n    @deprecated_api_warning({'deformable_groups': 'deform_groups'},\n                            cls_name='ModulatedDeformConv2d')\n    def __init__(self,\n                 in_channels: int,\n                 out_channels: int,\n                 kernel_size: Union[int, Tuple[int]],\n                 stride: int = 1,\n                 padding: int = 0,\n                 dilation: int = 1,\n                 groups: int = 1,\n                 
deform_groups: int = 1,\n                 bias: Union[bool, str] = True):\n        super().__init__()\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.kernel_size = _pair(kernel_size)\n        self.stride = _pair(stride)\n        self.padding = _pair(padding)\n        self.dilation = _pair(dilation)\n        self.groups = groups\n        self.deform_groups = deform_groups\n        # enable compatibility with nn.Conv2d\n        self.transposed = False\n        self.output_padding = _single(0)\n\n        self.weight = nn.Parameter(\n            torch.Tensor(out_channels, in_channels // groups,\n                         *self.kernel_size))\n        if bias:\n            self.bias = nn.Parameter(torch.Tensor(out_channels))\n        else:\n            self.register_parameter('bias', None)\n        self.init_weights()\n\n    def init_weights(self):\n        n = self.in_channels\n        for k in self.kernel_size:\n            n *= k\n        stdv = 1. 
/ math.sqrt(n)\n        self.weight.data.uniform_(-stdv, stdv)\n        if self.bias is not None:\n            self.bias.data.zero_()\n\n    def forward(self, x: torch.Tensor, offset: torch.Tensor,\n                mask: torch.Tensor) -> torch.Tensor:\n        return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias,\n                                       self.stride, self.padding,\n                                       self.dilation, self.groups,\n                                       self.deform_groups)\n\n\n@MODELS.register_module('DCNv2')\nclass ModulatedDeformConv2dPack(ModulatedDeformConv2d):\n    \"\"\"A ModulatedDeformable Conv Encapsulation that acts as normal Conv\n    layers.\n\n    Args:\n        in_channels (int): Same as nn.Conv2d.\n        out_channels (int): Same as nn.Conv2d.\n        kernel_size (int or tuple[int]): Same as nn.Conv2d.\n        stride (int): Same as nn.Conv2d, while tuple is not supported.\n        padding (int): Same as nn.Conv2d, while tuple is not supported.\n        dilation (int): Same as nn.Conv2d, while tuple is not supported.\n        groups (int): Same as nn.Conv2d.\n        bias (bool or str): If specified as `auto`, it will be decided by the\n            norm_cfg. 
Bias will be set as True if norm_cfg is None, otherwise\n            False.\n    \"\"\"\n\n    _version = 2\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.conv_offset = nn.Conv2d(\n            self.in_channels,\n            self.deform_groups * 3 * self.kernel_size[0] * self.kernel_size[1],\n            kernel_size=self.kernel_size,\n            stride=self.stride,\n            padding=self.padding,\n            dilation=self.dilation,\n            bias=True)\n        self.init_weights()\n\n    def init_weights(self) -> None:\n        super().init_weights()\n        if hasattr(self, 'conv_offset'):\n            self.conv_offset.weight.data.zero_()\n            self.conv_offset.bias.data.zero_()\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore\n        out = self.conv_offset(x)\n        o1, o2, mask = torch.chunk(out, 3, dim=1)\n        offset = torch.cat((o1, o2), dim=1)\n        mask = torch.sigmoid(mask)\n        return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias,\n                                       self.stride, self.padding,\n                                       self.dilation, self.groups,\n                                       self.deform_groups)\n\n    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,\n                              missing_keys, unexpected_keys, error_msgs):\n        version = local_metadata.get('version', None)\n\n        if version is None or version < 2:\n            # the key is different in early versions\n            # In version < 2, ModulatedDeformConvPack\n            # loads previous benchmark models.\n            if (prefix + 'conv_offset.weight' not in state_dict\n                    and prefix[:-1] + '_offset.weight' in state_dict):\n                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(\n                    prefix[:-1] + '_offset.weight')\n            if (prefix + 
'conv_offset.bias' not in state_dict\n                    and prefix[:-1] + '_offset.bias' in state_dict):\n                state_dict[prefix +\n                           'conv_offset.bias'] = state_dict.pop(prefix[:-1] +\n                                                                '_offset.bias')\n\n        if version is not None and version > 1:\n            print_log(\n                f'ModulatedDeformConvPack {prefix.rstrip(\".\")} is upgraded to '\n                'version 2.',\n                logger='current')\n\n        super()._load_from_state_dict(state_dict, prefix, local_metadata,\n                                      strict, missing_keys, unexpected_keys,\n                                      error_msgs)\n\n\nif IS_MLU_AVAILABLE:\n    import torchvision\n    from mmengine.utils import digit_version\n    from torchvision.ops import deform_conv2d as tv_deform_conv2d\n\n    @MODELS.register_module('DCNv2', force=True)\n    class ModulatedDeformConv2dPack_MLU(ModulatedDeformConv2d):\n        \"\"\"This class is the DCNv2 implementation of the MLU device.\n\n        The MLU backend support of the operator has been implemented\n        in torchvision. The mmcv registration mechanism is used for\n        multiplexing here. The torchvision implementation of DCNv2 is called.\n        Args:\n            in_channels (int): Same as nn.Conv2d.\n            out_channels (int): Same as nn.Conv2d.\n            kernel_size (int or tuple[int]): Same as nn.Conv2d.\n            stride (int): Same as nn.Conv2d, while tuple is not supported.\n            padding (int): Same as nn.Conv2d, while tuple is not supported.\n            dilation (int): Same as nn.Conv2d, while tuple is not supported.\n            groups (int): Same as nn.Conv2d.\n            bias (bool or str): If specified as `auto`, it will be decided by\n                the norm_cfg. 
Bias will be set as True if norm_cfg is None,\n                otherwise False.\n        \"\"\"\n\n        def __init__(self, *args, **kwargs):\n            assert digit_version(torchvision.__version__) >= digit_version(\n                '0.10.0a0'), 'the version of torchvision should be >= 0.10.0'\n            super().__init__(*args, **kwargs)\n            self.conv_offset = nn.Conv2d(\n                self.in_channels,\n                self.deform_groups * 3 * self.kernel_size[0] *\n                self.kernel_size[1],\n                kernel_size=self.kernel_size,\n                stride=self.stride,\n                padding=self.padding,\n                dilation=self.dilation,\n                bias=True)\n            self.init_weights()\n\n        def init_weights(self):\n            super().init_weights()\n            if hasattr(self, 'conv_offset'):\n                self.conv_offset.weight.data.zero_()\n                self.conv_offset.bias.data.zero_()\n\n        def forward(self, x):\n            out = self.conv_offset(x)\n            o1, o2, mask = torch.chunk(out, 3, dim=1)\n            offset = torch.cat((o1, o2), dim=1)\n            mask = torch.sigmoid(mask)\n            x = x.type_as(offset)\n            weight = self.weight.type_as(x)\n            mask = mask.type_as(x)\n            return tv_deform_conv2d(\n                x,\n                offset,\n                weight,\n                bias=self.bias,\n                stride=self.stride,\n                padding=self.padding,\n                dilation=self.dilation,\n                mask=mask)\n"
  },
  {
    "path": "mmcv/ops/multi_scale_deform_attn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport math\nimport warnings\nfrom typing import Optional, no_type_check\n\nimport mmengine\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.model import BaseModule, constant_init, xavier_init\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import deprecated_api_warning\nfrom torch.autograd.function import Function, once_differentiable\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])\n\n\nclass MultiScaleDeformableAttnFunction(Function):\n\n    @staticmethod\n    def forward(ctx, value: torch.Tensor, value_spatial_shapes: torch.Tensor,\n                value_level_start_index: torch.Tensor,\n                sampling_locations: torch.Tensor,\n                attention_weights: torch.Tensor,\n                im2col_step: torch.Tensor) -> torch.Tensor:\n        \"\"\"GPU/MLU version of multi-scale deformable attention.\n\n        Args:\n            value (torch.Tensor): The value has shape\n                (bs, num_keys, mum_heads, embed_dims//num_heads)\n            value_spatial_shapes (torch.Tensor): Spatial shape of\n                each feature map, has shape (num_levels, 2),\n                last dimension 2 represent (h, w)\n            sampling_locations (torch.Tensor): The location of sampling points,\n                has shape\n                (bs ,num_queries, num_heads, num_levels, num_points, 2),\n                the last dimension 2 represent (x, y).\n            attention_weights (torch.Tensor): The weight of sampling points\n                used when calculate the attention, has shape\n                (bs ,num_queries, num_heads, num_levels, num_points),\n            im2col_step (torch.Tensor): The step used in image to column.\n\n  
      Returns:\n            torch.Tensor: has shape (bs, num_queries, embed_dims)\n        \"\"\"\n\n        ctx.im2col_step = im2col_step\n\n        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;\n        # amp won't cast the type of sampling_locations, attention_weights\n        # (float32), but \"value\" is cast to float16, leading to the type\n        # mismatch with input (when it is float32) or weight.\n        # The flag for whether to use fp16 or amp is the type of \"value\",\n        # we cast sampling_locations and attention_weights to\n        # temporarily support fp16 and amp whatever the\n        # pytorch version is.\n        sampling_locations = sampling_locations.type_as(value)\n        attention_weights = attention_weights.type_as(value)\n\n        output = ext_module.ms_deform_attn_forward(\n            value,\n            value_spatial_shapes,\n            value_level_start_index,\n            sampling_locations,\n            attention_weights,\n            im2col_step=ctx.im2col_step)\n        ctx.save_for_backward(value, value_spatial_shapes,\n                              value_level_start_index, sampling_locations,\n                              attention_weights)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output: torch.Tensor) -> tuple:\n        \"\"\"GPU/MLU version of backward function.\n\n        Args:\n            grad_output (torch.Tensor): Gradient of output tensor of forward.\n\n        Returns:\n            tuple[Tensor]: Gradient of input tensors in forward.\n        \"\"\"\n        value, value_spatial_shapes, value_level_start_index, \\\n            sampling_locations, attention_weights = ctx.saved_tensors\n        grad_value = torch.zeros_like(value)\n        grad_sampling_loc = torch.zeros_like(sampling_locations)\n        grad_attn_weight = torch.zeros_like(attention_weights)\n\n        ext_module.ms_deform_attn_backward(\n            value,\n            
value_spatial_shapes,\n            value_level_start_index,\n            sampling_locations,\n            attention_weights,\n            grad_output.contiguous(),\n            grad_value,\n            grad_sampling_loc,\n            grad_attn_weight,\n            im2col_step=ctx.im2col_step)\n\n        return grad_value, None, None, \\\n            grad_sampling_loc, grad_attn_weight, None\n\n\ndef multi_scale_deformable_attn_pytorch(\n        value: torch.Tensor, value_spatial_shapes: torch.Tensor,\n        sampling_locations: torch.Tensor,\n        attention_weights: torch.Tensor) -> torch.Tensor:\n    \"\"\"CPU version of multi-scale deformable attention.\n\n    Args:\n        value (torch.Tensor): The value has shape\n            (bs, num_keys, num_heads, embed_dims//num_heads)\n        value_spatial_shapes (torch.Tensor): Spatial shape of\n            each feature map, has shape (num_levels, 2),\n            last dimension 2 represent (h, w)\n        sampling_locations (torch.Tensor): The location of sampling points,\n            has shape\n            (bs ,num_queries, num_heads, num_levels, num_points, 2),\n            the last dimension 2 represent (x, y).\n        attention_weights (torch.Tensor): The weight of sampling points used\n            when calculate the attention, has shape\n            (bs ,num_queries, num_heads, num_levels, num_points),\n\n    Returns:\n        torch.Tensor: has shape (bs, num_queries, embed_dims)\n    \"\"\"\n\n    bs, _, num_heads, embed_dims = value.shape\n    _, num_queries, num_heads, num_levels, num_points, _ =\\\n        sampling_locations.shape\n    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes],\n                             dim=1)\n    sampling_grids = 2 * sampling_locations - 1\n    sampling_value_list = []\n    for level, (H_, W_) in enumerate(value_spatial_shapes):\n        # bs, H_*W_, num_heads, embed_dims ->\n        # bs, H_*W_, num_heads*embed_dims ->\n        # bs, 
num_heads*embed_dims, H_*W_ ->\n        # bs*num_heads, embed_dims, H_, W_\n        value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(\n            bs * num_heads, embed_dims, H_, W_)\n        # bs, num_queries, num_heads, num_points, 2 ->\n        # bs, num_heads, num_queries, num_points, 2 ->\n        # bs*num_heads, num_queries, num_points, 2\n        sampling_grid_l_ = sampling_grids[:, :, :,\n                                          level].transpose(1, 2).flatten(0, 1)\n        # bs*num_heads, embed_dims, num_queries, num_points\n        sampling_value_l_ = F.grid_sample(\n            value_l_,\n            sampling_grid_l_,\n            mode='bilinear',\n            padding_mode='zeros',\n            align_corners=False)\n        sampling_value_list.append(sampling_value_l_)\n    # (bs, num_queries, num_heads, num_levels, num_points) ->\n    # (bs, num_heads, num_queries, num_levels, num_points) ->\n    # (bs*num_heads, 1, num_queries, num_levels*num_points)\n    attention_weights = attention_weights.transpose(1, 2).reshape(\n        bs * num_heads, 1, num_queries, num_levels * num_points)\n    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) *\n              attention_weights).sum(-1).view(bs, num_heads * embed_dims,\n                                              num_queries)\n    return output.transpose(1, 2).contiguous()\n\n\n@MODELS.register_module()\nclass MultiScaleDeformableAttention(BaseModule):\n    \"\"\"An attention module used in Deformable-Detr.\n\n    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.\n    <https://arxiv.org/pdf/2010.04159.pdf>`_.\n\n    Args:\n        embed_dims (int): The embedding dimension of Attention.\n            Default: 256.\n        num_heads (int): Parallel attention heads. Default: 8.\n        num_levels (int): The number of feature maps used in\n            Attention. 
Default: 4.\n        num_points (int): The number of sampling points for\n            each query in each head. Default: 4.\n        im2col_step (int): The step used in image_to_column.\n            Default: 64.\n        dropout (float): A Dropout layer on `inp_identity`.\n            Default: 0.1.\n        batch_first (bool): Key, Query and Value are shape of\n            (batch, n, embed_dim)\n            or (n, batch, embed_dim). Default to False.\n        norm_cfg (dict): Config dict for normalization layer.\n            Default: None.\n        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.\n            Default: None.\n        value_proj_ratio (float): The expansion ratio of value_proj.\n            Default: 1.0.\n    \"\"\"\n\n    def __init__(self,\n                 embed_dims: int = 256,\n                 num_heads: int = 8,\n                 num_levels: int = 4,\n                 num_points: int = 4,\n                 im2col_step: int = 64,\n                 dropout: float = 0.1,\n                 batch_first: bool = False,\n                 norm_cfg: Optional[dict] = None,\n                 init_cfg: Optional[mmengine.ConfigDict] = None,\n                 value_proj_ratio: float = 1.0):\n        super().__init__(init_cfg)\n        if embed_dims % num_heads != 0:\n            raise ValueError(f'embed_dims must be divisible by num_heads, '\n                             f'but got {embed_dims} and {num_heads}')\n        dim_per_head = embed_dims // num_heads\n        self.norm_cfg = norm_cfg\n        self.dropout = nn.Dropout(dropout)\n        self.batch_first = batch_first\n\n        # you'd better set dim_per_head to a power of 2\n        # which is more efficient in the CUDA/MUSA implementation\n        def _is_power_of_2(n):\n            if (not isinstance(n, int)) or (n < 0):\n                raise ValueError(\n                    'invalid input for _is_power_of_2: {} (type: {})'.format(\n                        n, type(n)))\n            
return (n & (n - 1) == 0) and n != 0\n\n        if not _is_power_of_2(dim_per_head):\n            warnings.warn(\n                \"You'd better set embed_dims in \"\n                'MultiScaleDeformAttention to make '\n                'the dimension of each attention head a power of 2 '\n                'which is more efficient in our CUDA/MUSA implementation.')\n\n        self.im2col_step = im2col_step\n        self.embed_dims = embed_dims\n        self.num_levels = num_levels\n        self.num_heads = num_heads\n        self.num_points = num_points\n        self.sampling_offsets = nn.Linear(\n            embed_dims, num_heads * num_levels * num_points * 2)\n        self.attention_weights = nn.Linear(embed_dims,\n                                           num_heads * num_levels * num_points)\n        value_proj_size = int(embed_dims * value_proj_ratio)\n        self.value_proj = nn.Linear(embed_dims, value_proj_size)\n        self.output_proj = nn.Linear(value_proj_size, embed_dims)\n        self.init_weights()\n\n    def init_weights(self) -> None:\n        \"\"\"Default initialization for Parameters of Module.\"\"\"\n        constant_init(self.sampling_offsets, 0.)\n        device = next(self.parameters()).device\n        thetas = torch.arange(\n            self.num_heads, dtype=torch.float32,\n            device=device) * (2.0 * math.pi / self.num_heads)\n        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)\n        grid_init = (grid_init /\n                     grid_init.abs().max(-1, keepdim=True)[0]).view(\n                         self.num_heads, 1, 1,\n                         2).repeat(1, self.num_levels, self.num_points, 1)\n        for i in range(self.num_points):\n            grid_init[:, :, i, :] *= i + 1\n\n        self.sampling_offsets.bias.data = grid_init.view(-1)\n        constant_init(self.attention_weights, val=0., bias=0.)\n        xavier_init(self.value_proj, distribution='uniform', bias=0.)\n        
xavier_init(self.output_proj, distribution='uniform', bias=0.)\n        self._is_init = True\n\n    @no_type_check\n    @deprecated_api_warning({'residual': 'identity'},\n                            cls_name='MultiScaleDeformableAttention')\n    def forward(self,\n                query: torch.Tensor,\n                key: Optional[torch.Tensor] = None,\n                value: Optional[torch.Tensor] = None,\n                identity: Optional[torch.Tensor] = None,\n                query_pos: Optional[torch.Tensor] = None,\n                key_padding_mask: Optional[torch.Tensor] = None,\n                reference_points: Optional[torch.Tensor] = None,\n                spatial_shapes: Optional[torch.Tensor] = None,\n                level_start_index: Optional[torch.Tensor] = None,\n                **kwargs) -> torch.Tensor:\n        \"\"\"Forward Function of MultiScaleDeformAttention.\n\n        Args:\n            query (torch.Tensor): Query of Transformer with shape\n                (num_query, bs, embed_dims).\n            key (torch.Tensor): The key tensor with shape\n                `(num_key, bs, embed_dims)`.\n            value (torch.Tensor): The value tensor with shape\n                `(num_key, bs, embed_dims)`.\n            identity (torch.Tensor): The tensor used for addition, with the\n                same shape as `query`. Default None. 
If None,\n                `query` will be used.\n            query_pos (torch.Tensor): The positional encoding for `query`.\n                Default: None.\n            key_padding_mask (torch.Tensor): ByteTensor for `value`, with\n                shape [bs, num_key].\n            reference_points (torch.Tensor):  The normalized reference\n                points with shape (bs, num_query, num_levels, 2),\n                all elements are in range [0, 1], top-left (0,0),\n                bottom-right (1, 1), including padding area,\n                or (N, Length_{query}, num_levels, 4), where the\n                additional two dimensions (w, h)\n                form reference boxes.\n            spatial_shapes (torch.Tensor): Spatial shape of features in\n                different levels. With shape (num_levels, 2),\n                last dimension represents (h, w).\n            level_start_index (torch.Tensor): The start index of each level.\n                A tensor with shape ``(num_levels, )`` and can be represented\n                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].\n\n        Returns:\n            torch.Tensor: forwarded results with shape\n            [num_query, bs, embed_dims].\n        \"\"\"\n\n        if value is None:\n            value = query\n\n        if identity is None:\n            identity = query\n        if query_pos is not None:\n            query = query + query_pos\n        if not self.batch_first:\n            # change to (bs, num_query ,embed_dims)\n            query = query.permute(1, 0, 2)\n            value = value.permute(1, 0, 2)\n\n        bs, num_query, _ = query.shape\n        bs, num_value, _ = value.shape\n        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value\n\n        value = self.value_proj(value)\n        if key_padding_mask is not None:\n            value = value.masked_fill(key_padding_mask[..., None], 0.0)\n        value = value.view(bs, num_value, self.num_heads, -1)\n        sampling_offsets 
= self.sampling_offsets(query).view(\n            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)\n        attention_weights = self.attention_weights(query).view(\n            bs, num_query, self.num_heads, self.num_levels * self.num_points)\n        attention_weights = attention_weights.softmax(-1)\n\n        attention_weights = attention_weights.view(bs, num_query,\n                                                   self.num_heads,\n                                                   self.num_levels,\n                                                   self.num_points)\n        if reference_points.shape[-1] == 2:\n            offset_normalizer = torch.stack(\n                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)\n            sampling_locations = reference_points[:, :, None, :, None, :] \\\n                + sampling_offsets \\\n                / offset_normalizer[None, None, None, :, None, :]\n        elif reference_points.shape[-1] == 4:\n            sampling_locations = reference_points[:, :, None, :, None, :2] \\\n                + sampling_offsets / self.num_points \\\n                * reference_points[:, :, None, :, None, 2:] \\\n                * 0.5\n        else:\n            raise ValueError(\n                f'Last dim of reference_points must be'\n                f' 2 or 4, but get {reference_points.shape[-1]} instead.')\n        if ((IS_CUDA_AVAILABLE and value.is_cuda)\n                or (IS_MLU_AVAILABLE and value.is_mlu)\n                or (IS_MUSA_AVAILABLE and value.is_musa)\n                or (IS_NPU_AVAILABLE and value.device.type == 'npu')):\n            output = MultiScaleDeformableAttnFunction.apply(\n                value, spatial_shapes, level_start_index, sampling_locations,\n                attention_weights, self.im2col_step)\n        else:\n            output = multi_scale_deformable_attn_pytorch(\n                value, spatial_shapes, sampling_locations, attention_weights)\n\n        output = 
self.output_proj(output)\n\n        if not self.batch_first:\n            # (num_query, bs ,embed_dims)\n            output = output.permute(1, 0, 2)\n\n        return self.dropout(output) + identity\n"
  },
  {
    "path": "mmcv/ops/nms.py",
    "content": "from typing import Any, Dict, List, Optional, Tuple, Union\n\nimport numpy as np\nimport torch\nfrom mmengine.utils import deprecated_api_warning\nfrom torch import Tensor\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated', 'nms_quadri'])\n\n\n# This function is modified from: https://github.com/pytorch/vision/\nclass NMSop(torch.autograd.Function):\n\n    @staticmethod\n    def forward(ctx: Any, bboxes: Tensor, scores: Tensor, iou_threshold: float,\n                offset: int, score_threshold: float, max_num: int) -> Tensor:\n        is_filtering_by_score = score_threshold > 0\n        if is_filtering_by_score:\n            valid_mask = scores > score_threshold\n            bboxes, scores = bboxes[valid_mask], scores[valid_mask]\n            valid_inds = torch.nonzero(\n                valid_mask, as_tuple=False).squeeze(dim=1)\n\n        inds = ext_module.nms(\n            bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)\n\n        if max_num > 0:\n            inds = inds[:max_num]\n        if is_filtering_by_score:\n            inds = valid_inds[inds]\n        return inds\n\n\nclass SoftNMSop(torch.autograd.Function):\n\n    @staticmethod\n    def forward(ctx: Any, boxes: Tensor, scores: Tensor, iou_threshold: float,\n                sigma: float, min_score: float, method: int,\n                offset: int) -> Tuple[Tensor, Tensor]:\n        dets = boxes.new_empty((boxes.size(0), 5), device='cpu')\n        inds = ext_module.softnms(\n            boxes.cpu(),\n            scores.cpu(),\n            dets.cpu(),\n            iou_threshold=float(iou_threshold),\n            sigma=float(sigma),\n            min_score=float(min_score),\n            method=int(method),\n            offset=int(offset))\n        return dets, inds\n\n    @staticmethod\n    def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method,\n                 offset):\n      
  from packaging import version\n        assert version.parse(torch.__version__) >= version.parse('1.7.0')\n        nms_out = g.op(\n            'mmcv::SoftNonMaxSuppression',\n            boxes,\n            scores,\n            iou_threshold_f=float(iou_threshold),\n            sigma_f=float(sigma),\n            min_score_f=float(min_score),\n            method_i=int(method),\n            offset_i=int(offset),\n            outputs=2)\n        return nms_out\n\n\narray_like_type = Union[Tensor, np.ndarray]\n\n\n@deprecated_api_warning({'iou_thr': 'iou_threshold'})\ndef nms(boxes: array_like_type,\n        scores: array_like_type,\n        iou_threshold: float,\n        offset: int = 0,\n        score_threshold: float = 0,\n        max_num: int = -1) -> Tuple[array_like_type, array_like_type]:\n    \"\"\"Dispatch to either CPU or GPU NMS implementations.\n\n    The input can be either torch tensor or numpy array. GPU NMS will be used\n    if the input is gpu tensor, otherwise CPU NMS\n    will be used. 
The returned type will always be the same as inputs.\n\n    Arguments:\n        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).\n        scores (torch.Tensor or np.ndarray): scores in shape (N, ).\n        iou_threshold (float): IoU threshold for NMS.\n        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).\n        score_threshold (float): score threshold for NMS.\n        max_num (int): maximum number of boxes after NMS.\n\n    Returns:\n        tuple: kept dets (boxes and scores) and indice, which always have\n        the same data type as the input.\n\n    Example:\n        >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9],\n        >>>                   [49.3, 32.9, 51.0, 35.3],\n        >>>                   [49.2, 31.8, 51.0, 35.4],\n        >>>                   [35.1, 11.5, 39.1, 15.7],\n        >>>                   [35.6, 11.8, 39.3, 14.2],\n        >>>                   [35.3, 11.5, 39.9, 14.5],\n        >>>                   [35.2, 11.7, 39.7, 15.7]], dtype=np.float32)\n        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\\\n               dtype=np.float32)\n        >>> iou_threshold = 0.6\n        >>> dets, inds = nms(boxes, scores, iou_threshold)\n        >>> assert len(inds) == len(dets) == 3\n    \"\"\"\n    assert isinstance(boxes, (Tensor, np.ndarray))\n    assert isinstance(scores, (Tensor, np.ndarray))\n    is_numpy = False\n    if isinstance(boxes, np.ndarray):\n        is_numpy = True\n        boxes = torch.from_numpy(boxes)\n    if isinstance(scores, np.ndarray):\n        scores = torch.from_numpy(scores)\n    assert boxes.size(1) == 4\n    assert boxes.size(0) == scores.size(0)\n    assert offset in (0, 1)\n\n    inds = NMSop.apply(boxes, scores, iou_threshold, offset, score_threshold,\n                       max_num)\n    dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1)\n    if is_numpy:\n        dets = dets.cpu().numpy()\n        inds = inds.cpu().numpy()\n    return dets, 
inds\n\n\n@deprecated_api_warning({'iou_thr': 'iou_threshold'})\ndef soft_nms(boxes: array_like_type,\n             scores: array_like_type,\n             iou_threshold: float = 0.3,\n             sigma: float = 0.5,\n             min_score: float = 1e-3,\n             method: str = 'linear',\n             offset: int = 0) -> Tuple[array_like_type, array_like_type]:\n    \"\"\"Dispatch to the CPU-only Soft NMS implementation.\n\n    The input can be either a torch tensor or numpy array.\n    The returned type will always be the same as inputs.\n\n    Args:\n        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).\n        scores (torch.Tensor or np.ndarray): scores in shape (N, ).\n        iou_threshold (float): IoU threshold for NMS.\n        sigma (float): hyperparameter for gaussian method\n        min_score (float): score filter threshold\n        method (str): one of 'naive', 'linear' or 'gaussian'\n        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).\n\n    Returns:\n        tuple: kept dets (boxes and scores) and indices, which always have\n        the same data type as the input.\n\n    Example:\n        >>> boxes = np.array([[4., 3., 5., 3.],\n        >>>                   [4., 3., 5., 4.],\n        >>>                   [3., 1., 3., 1.],\n        >>>                   [3., 1., 3., 1.],\n        >>>                   [3., 1., 3., 1.],\n        >>>                   [3., 1., 3., 1.]], dtype=np.float32)\n        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32)\n        >>> iou_threshold = 0.6\n        >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5)\n        >>> assert len(inds) == len(dets) == 5\n    \"\"\"\n\n    assert isinstance(boxes, (Tensor, np.ndarray))\n    assert isinstance(scores, (Tensor, np.ndarray))\n    is_numpy = False\n    if isinstance(boxes, np.ndarray):\n        is_numpy = True\n        boxes = torch.from_numpy(boxes)\n    if isinstance(scores, np.ndarray):\n        scores = 
torch.from_numpy(scores)\n    assert boxes.size(1) == 4\n    assert boxes.size(0) == scores.size(0)\n    assert offset in (0, 1)\n    method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2}\n    assert method in method_dict.keys()\n\n    if torch.__version__ == 'parrots':\n        dets = boxes.new_empty((boxes.size(0), 5), device='cpu')\n        indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()]\n        indata_dict = {\n            'iou_threshold': float(iou_threshold),\n            'sigma': float(sigma),\n            'min_score': min_score,\n            'method': method_dict[method],\n            'offset': int(offset)\n        }\n        inds = ext_module.softnms(*indata_list, **indata_dict)\n    else:\n        dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(),\n                                     float(iou_threshold), float(sigma),\n                                     float(min_score), method_dict[method],\n                                     int(offset))\n\n    dets = dets[:inds.size(0)]\n\n    if is_numpy:\n        dets = dets.cpu().numpy()\n        inds = inds.cpu().numpy()\n        return dets, inds\n    else:\n        return dets.to(device=boxes.device), inds.to(device=boxes.device)\n\n\ndef batched_nms(boxes: Tensor,\n                scores: Tensor,\n                idxs: Tensor,\n                nms_cfg: Optional[Dict],\n                class_agnostic: bool = False) -> Tuple[Tensor, Tensor]:\n    r\"\"\"Performs non-maximum suppression in a batched fashion.\n\n    Modified from `torchvision/ops/boxes.py#L39\n    <https://github.com/pytorch/vision/blob/\n    505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39>`_.\n    In order to perform NMS independently per class, we add an offset to all\n    the boxes. 
The offset is dependent only on the class idx, and is large\n    enough so that boxes from different classes do not overlap.\n\n    Note:\n        In v1.4.1 and later, ``batched_nms`` supports skipping the NMS and\n        returns sorted raw results when `nms_cfg` is None.\n\n    Args:\n        boxes (torch.Tensor): boxes in shape (N, 4) or (N, 5).\n        scores (torch.Tensor): scores in shape (N, ).\n        idxs (torch.Tensor): each index value corresponds to a bbox cluster,\n            and NMS will not be applied between elements of different idxs,\n            shape (N, ).\n        nms_cfg (dict | optional): Supports skipping the nms when `nms_cfg`\n            is None, otherwise it should specify nms type and other\n            parameters like `iou_thr`. Possible keys include the following.\n\n            - iou_threshold (float): IoU threshold used for NMS.\n            - split_thr (float): threshold number of boxes. In some cases the\n              number of boxes is large (e.g., 200k). To avoid OOM during\n              training, the users could set `split_thr` to a small value.\n              If the number of boxes is greater than the threshold, it will\n              perform NMS on each group of boxes separately and sequentially.\n              Defaults to 10000.\n        class_agnostic (bool): if true, nms is class agnostic,\n            i.e. IoU thresholding happens over all boxes,\n            regardless of the predicted class. Defaults to False.\n\n    Returns:\n        tuple: kept dets and indices.\n\n        - boxes (Tensor): Bboxes with score after nms, has shape\n          (num_bboxes, 5). 
last dimension 5 arrange as\n          (x1, y1, x2, y2, score)\n        - keep (Tensor): The indices of remaining boxes in input\n          boxes.\n    \"\"\"\n    # skip nms when nms_cfg is None\n    if nms_cfg is None:\n        scores, inds = scores.sort(descending=True)\n        boxes = boxes[inds]\n        return torch.cat([boxes, scores[:, None]], -1), inds\n\n    nms_cfg_ = nms_cfg.copy()\n    class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)\n    if class_agnostic:\n        boxes_for_nms = boxes\n    else:\n        # When using rotated boxes, only apply offsets on center.\n        if boxes.size(-1) == 5:\n            # Strictly, the maximum coordinates of the rotating box\n            # (x,y,w,h,a) should be calculated by polygon coordinates.\n            # But the conversion from rotated box to polygon will\n            # slow down the speed.\n            # So we use max(x,y) + max(w,h) as max coordinate\n            # which is larger than polygon max coordinate\n            # max(x1, y1, x2, y2,x3, y3, x4, y4)\n            max_coordinate = boxes[..., :2].max() + boxes[..., 2:4].max()\n            offsets = idxs.to(boxes) * (\n                max_coordinate + torch.tensor(1).to(boxes))\n            boxes_ctr_for_nms = boxes[..., :2] + offsets[:, None]\n            boxes_for_nms = torch.cat([boxes_ctr_for_nms, boxes[..., 2:5]],\n                                      dim=-1)\n        else:\n            max_coordinate = boxes.max()\n            offsets = idxs.to(boxes) * (\n                max_coordinate + torch.tensor(1).to(boxes))\n            boxes_for_nms = boxes + offsets[:, None]\n\n    nms_op = nms_cfg_.pop('type', 'nms')\n    if isinstance(nms_op, str):\n        nms_op = eval(nms_op)\n\n    split_thr = nms_cfg_.pop('split_thr', 10000)\n    # Won't split to multiple nms nodes when exporting to onnx\n    if boxes_for_nms.shape[0] < split_thr:\n        dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)\n        boxes = boxes[keep]\n\n    
    # This assumes `dets` has arbitrary dimensions where\n        # the last dimension is score.\n        # Currently it supports bounding boxes [x1, y1, x2, y2, score] or\n        # rotated boxes [cx, cy, w, h, angle_radian, score].\n\n        scores = dets[:, -1]\n    else:\n        max_num = nms_cfg_.pop('max_num', -1)\n        total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)\n        # Some type of nms would reweight the score, such as SoftNMS\n        scores_after_nms = scores.new_zeros(scores.size())\n        for id in torch.unique(idxs):\n            mask = (idxs == id).nonzero(as_tuple=False).view(-1)\n            dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_)\n            total_mask[mask[keep]] = True\n            scores_after_nms[mask[keep]] = dets[:, -1]\n        keep = total_mask.nonzero(as_tuple=False).view(-1)\n\n        scores, inds = scores_after_nms[keep].sort(descending=True)\n        keep = keep[inds]\n        boxes = boxes[keep]\n\n        if max_num > 0:\n            keep = keep[:max_num]\n            boxes = boxes[:max_num]\n            scores = scores[:max_num]\n\n    boxes = torch.cat([boxes, scores[:, None]], -1)\n    return boxes, keep\n\n\ndef nms_match(dets: array_like_type,\n              iou_threshold: float) -> List[array_like_type]:\n    \"\"\"Matched dets into different groups by NMS.\n\n    NMS match is Similar to NMS but when a bbox is suppressed, nms match will\n    record the indice of suppressed bbox and form a group with the indice of\n    kept bbox. 
In each group, indice is sorted as score order.\n\n    Args:\n        dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5).\n        iou_threshold (float): IoU thresh for NMS.\n\n    Returns:\n        list[torch.Tensor | np.ndarray]: The outer list corresponds different\n        matched group, the inner Tensor corresponds the indices for a group\n        in score order.\n    \"\"\"\n    if dets.shape[0] == 0:\n        matched = []\n    else:\n        assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \\\n                                    f'but get {dets.shape}'\n        if isinstance(dets, Tensor):\n            dets_t = dets.detach().cpu()\n        else:\n            dets_t = torch.from_numpy(dets)\n        indata_list = [dets_t]\n        indata_dict = {'iou_threshold': float(iou_threshold)}\n        matched = ext_module.nms_match(*indata_list, **indata_dict)\n        if torch.__version__ == 'parrots':\n            matched = matched.tolist()  # type: ignore\n\n    if isinstance(dets, Tensor):\n        return [dets.new_tensor(m, dtype=torch.long) for m in matched]\n    else:\n        return [np.array(m, dtype=int) for m in matched]\n\n\ndef nms_rotated(dets: Tensor,\n                scores: Tensor,\n                iou_threshold: float,\n                labels: Optional[Tensor] = None,\n                clockwise: bool = True) -> Tuple[Tensor, Tensor]:\n    \"\"\"Performs non-maximum suppression (NMS) on the rotated boxes according to\n    their intersection-over-union (IoU).\n\n    Rotated NMS iteratively removes lower scoring rotated boxes which have an\n    IoU greater than iou_threshold with another (higher scoring) rotated box.\n\n    Args:\n        dets (torch.Tensor):  Rotated boxes in shape (N, 5).\n            They are expected to be in\n            (x_ctr, y_ctr, width, height, angle_radian) format.\n        scores (torch.Tensor): scores in shape (N, ).\n        iou_threshold (float): IoU thresh for NMS.\n        labels 
(torch.Tensor, optional): boxes' label in shape (N,).\n        clockwise (bool): flag indicating whether the positive angular\n            orientation is clockwise. default True.\n            `New in version 1.4.3.`\n\n    Returns:\n        tuple: kept dets(boxes and scores) and indice, which is always the\n        same data type as the input.\n    \"\"\"\n    if dets.shape[0] == 0:\n        return dets, None\n    if not clockwise:\n        flip_mat = dets.new_ones(dets.shape[-1])\n        flip_mat[-1] = -1\n        dets_cw = dets * flip_mat\n    else:\n        dets_cw = dets\n    multi_label = labels is not None\n    if labels is None:\n        input_labels = scores.new_empty(0, dtype=torch.int)\n    else:\n        input_labels = labels\n    if dets.device.type in ('npu', 'mlu'):\n        order = scores.new_empty(0, dtype=torch.long)\n        keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw,\n                                           input_labels, iou_threshold,\n                                           multi_label)\n        dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),\n                         dim=1)\n        return dets, keep_inds\n\n    if multi_label:\n        dets_wl = torch.cat((dets_cw, labels.unsqueeze(1)), 1)  # type: ignore\n    else:\n        dets_wl = dets_cw\n    _, order = scores.sort(0, descending=True)\n    dets_sorted = dets_wl.index_select(0, order)\n\n    if torch.__version__ == 'parrots':\n        keep_inds = ext_module.nms_rotated(\n            dets_wl,\n            scores,\n            order,\n            dets_sorted,\n            input_labels,\n            iou_threshold=iou_threshold,\n            multi_label=multi_label)\n    else:\n        keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted,\n                                           input_labels, iou_threshold,\n                                           multi_label)\n    dets = torch.cat((dets[keep_inds], 
scores[keep_inds].reshape(-1, 1)),\n                     dim=1)\n    return dets, keep_inds\n\n\ndef nms_quadri(dets: Tensor,\n               scores: Tensor,\n               iou_threshold: float,\n               labels: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:\n    \"\"\"Performs non-maximum suppression (NMS) on the quadrilateral boxes\n    according to their intersection-over-union (IoU).\n\n    Quadri NMS iteratively removes lower scoring quadrilateral boxes\n    which have an IoU greater than iou_threshold with another (higher\n    scoring) quadrilateral box.\n\n    Args:\n        dets (torch.Tensor):  Quadri boxes in shape (N, 8).\n            They are expected to be in\n            (x1, y1, ..., x4, y4) format.\n        scores (torch.Tensor): scores in shape (N, ).\n        iou_threshold (float): IoU thresh for NMS.\n        labels (torch.Tensor, optional): boxes' label in shape (N,).\n\n    Returns:\n        tuple: kept dets(boxes and scores) and indice, which is always the\n        same data type as the input.\n    \"\"\"\n    if dets.shape[0] == 0:\n        return dets, None\n\n    multi_label = labels is not None\n    if multi_label:\n        dets_with_lables = \\\n            torch.cat((dets, labels.unsqueeze(1)), 1)  # type: ignore\n    else:\n        dets_with_lables = dets\n    _, order = scores.sort(0, descending=True)\n    dets_sorted = dets_with_lables.index_select(0, order)\n\n    keep_inds = ext_module.nms_quadri(dets_with_lables, scores, order,\n                                      dets_sorted, iou_threshold, multi_label)\n    dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),\n                     dim=1)\n    return dets, keep_inds\n"
  },
  {
    "path": "mmcv/ops/pixel_group.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import List, Union\n\nimport numpy as np\nimport torch\nfrom torch import Tensor\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['pixel_group'])\n\n\ndef pixel_group(\n    score: Union[np.ndarray, Tensor],\n    mask: Union[np.ndarray, Tensor],\n    embedding: Union[np.ndarray, Tensor],\n    kernel_label: Union[np.ndarray, Tensor],\n    kernel_contour: Union[np.ndarray, Tensor],\n    kernel_region_num: int,\n    distance_threshold: float,\n) -> List[List[float]]:\n    \"\"\"Group pixels into text instances, which is widely used text detection\n    methods.\n\n    Arguments:\n        score (np.array or torch.Tensor): The foreground score with size hxw.\n        mask (np.array or Tensor): The foreground mask with size hxw.\n        embedding (np.array or torch.Tensor): The embedding with size hxwxc to\n            distinguish instances.\n        kernel_label (np.array or torch.Tensor): The instance kernel index with\n            size hxw.\n        kernel_contour (np.array or torch.Tensor): The kernel contour with\n            size hxw.\n        kernel_region_num (int): The instance kernel region number.\n        distance_threshold (float): The embedding distance threshold between\n            kernel and pixel in one instance.\n\n    Returns:\n        list[list[float]]: The instance coordinates and attributes list. 
Each\n        element consists of averaged confidence, pixel number, and coordinates\n        (x_i, y_i for all pixels) in order.\n    \"\"\"\n    assert isinstance(score, (torch.Tensor, np.ndarray))\n    assert isinstance(mask, (torch.Tensor, np.ndarray))\n    assert isinstance(embedding, (torch.Tensor, np.ndarray))\n    assert isinstance(kernel_label, (torch.Tensor, np.ndarray))\n    assert isinstance(kernel_contour, (torch.Tensor, np.ndarray))\n    assert isinstance(kernel_region_num, int)\n    assert isinstance(distance_threshold, float)\n\n    if isinstance(score, np.ndarray):\n        score = torch.from_numpy(score)\n    if isinstance(mask, np.ndarray):\n        mask = torch.from_numpy(mask)\n    if isinstance(embedding, np.ndarray):\n        embedding = torch.from_numpy(embedding)\n    if isinstance(kernel_label, np.ndarray):\n        kernel_label = torch.from_numpy(kernel_label)\n    if isinstance(kernel_contour, np.ndarray):\n        kernel_contour = torch.from_numpy(kernel_contour)\n\n    if torch.__version__ == 'parrots':\n        label = ext_module.pixel_group(\n            score,\n            mask,\n            embedding,\n            kernel_label,\n            kernel_contour,\n            kernel_region_num=kernel_region_num,\n            distance_threshold=distance_threshold)\n        label = label.tolist()\n        label = label[0]\n        list_index = kernel_region_num\n        pixel_assignment = []\n        for x in range(kernel_region_num):\n            pixel_assignment.append(\n                np.array(\n                    label[list_index:list_index + int(label[x])],\n                    dtype=np.float))\n            list_index = list_index + int(label[x])\n    else:\n        pixel_assignment = ext_module.pixel_group(score, mask, embedding,\n                                                  kernel_label, kernel_contour,\n                                                  kernel_region_num,\n                                                  
distance_threshold)\n    return pixel_assignment\n"
  },
  {
    "path": "mmcv/ops/point_sample.py",
    "content": "# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa\n\nfrom typing import Tuple, Union\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\nfrom torch.nn.modules.utils import _pair\n\n\ndef bilinear_grid_sample(im: Tensor,\n                         grid: Tensor,\n                         align_corners: bool = False) -> Tensor:\n    \"\"\"Given an input and a flow-field grid, computes the output using input\n    values and pixel locations from grid. Supported only bilinear interpolation\n    method to sample the input pixels.\n\n    Args:\n        im (torch.Tensor): Input feature map, shape (N, C, H, W)\n        grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)\n        align_corners (bool): If set to True, the extrema (-1 and 1) are\n            considered as referring to the center points of the input’s\n            corner pixels. If set to False, they are instead considered as\n            referring to the corner points of the input’s corner pixels,\n            making the sampling more resolution agnostic.\n\n    Returns:\n        torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)\n    \"\"\"\n    n, c, h, w = im.shape\n    gn, gh, gw, _ = grid.shape\n    assert n == gn\n\n    x = grid[:, :, :, 0]\n    y = grid[:, :, :, 1]\n\n    if align_corners:\n        x = ((x + 1) / 2) * (w - 1)\n        y = ((y + 1) / 2) * (h - 1)\n    else:\n        x = ((x + 1) * w - 1) / 2\n        y = ((y + 1) * h - 1) / 2\n\n    x = x.view(n, -1)\n    y = y.view(n, -1)\n\n    x0 = torch.floor(x).long()\n    y0 = torch.floor(y).long()\n    x1 = x0 + 1\n    y1 = y0 + 1\n\n    wa = ((x1 - x) * (y1 - y)).unsqueeze(1)\n    wb = ((x1 - x) * (y - y0)).unsqueeze(1)\n    wc = ((x - x0) * (y1 - y)).unsqueeze(1)\n    wd = ((x - x0) * (y - y0)).unsqueeze(1)\n\n    # Apply default for grid_sample function zero padding\n    im_padded = F.pad(im, pad=[1, 1, 1, 1], 
mode='constant', value=0)\n    padded_h = h + 2\n    padded_w = w + 2\n    # save points positions after padding\n    x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1\n\n    # Clip coordinates to padded image size\n    x0 = torch.where(x0 < 0, torch.tensor(0), x0)\n    x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0)\n    x1 = torch.where(x1 < 0, torch.tensor(0), x1)\n    x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1)\n    y0 = torch.where(y0 < 0, torch.tensor(0), y0)\n    y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0)\n    y1 = torch.where(y1 < 0, torch.tensor(0), y1)\n    y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1)\n\n    im_padded = im_padded.view(n, c, -1)\n\n    x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)\n    x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)\n    x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)\n    x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)\n\n    Ia = torch.gather(im_padded, 2, x0_y0)\n    Ib = torch.gather(im_padded, 2, x0_y1)\n    Ic = torch.gather(im_padded, 2, x1_y0)\n    Id = torch.gather(im_padded, 2, x1_y1)\n\n    return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)\n\n\ndef normalize(grid: Tensor) -> Tensor:\n    \"\"\"Normalize input grid from [-1, 1] to [0, 1]\n\n    Args:\n        grid (torch.Tensor): The grid to be normalize, range [-1, 1].\n\n    Returns:\n        torch.Tensor: Normalized grid, range [0, 1].\n    \"\"\"\n\n    return (grid + 1.0) / 2.0\n\n\ndef denormalize(grid: Tensor) -> Tensor:\n    \"\"\"Denormalize input grid from range [0, 1] to [-1, 1]\n\n    Args:\n        grid (torch.Tensor): The grid to be denormalize, range [0, 1].\n\n    Returns:\n        torch.Tensor: Denormalized grid, range [-1, 1].\n    \"\"\"\n\n    return grid * 2.0 - 1.0\n\n\ndef generate_grid(num_grid: int, size: Tuple[int, int],\n                  device: torch.device) -> Tensor:\n    
\"\"\"Generate regular square grid of points in [0, 1] x [0, 1] coordinate\n    space.\n\n    Args:\n        num_grid (int): The number of grids to sample, one for each region.\n        size (tuple[int, int]): The side size of the regular grid.\n        device (torch.device): Desired device of returned tensor.\n\n    Returns:\n        torch.Tensor: A tensor of shape (num_grid, size[0]*size[1], 2) that\n        contains coordinates for the regular grids.\n    \"\"\"\n\n    affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device)\n    grid = F.affine_grid(\n        affine_trans, torch.Size((1, 1, *size)), align_corners=False)\n    grid = normalize(grid)\n    return grid.view(1, -1, 2).expand(num_grid, -1, -1)\n\n\ndef rel_roi_point_to_abs_img_point(rois: Tensor,\n                                   rel_roi_points: Tensor) -> Tensor:\n    \"\"\"Convert roi based relative point coordinates to image based absolute\n    point coordinates.\n\n    Args:\n        rois (torch.Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)\n        rel_roi_points (torch.Tensor): Point coordinates inside RoI, relative\n            to RoI, location, range (0, 1), shape (N, P, 2)\n    Returns:\n        torch.Tensor: Image based absolute point coordinates, shape (N, P, 2)\n    \"\"\"\n\n    with torch.no_grad():\n        assert rel_roi_points.size(0) == rois.size(0)\n        assert rois.dim() == 2\n        assert rel_roi_points.dim() == 3\n        assert rel_roi_points.size(2) == 2\n        # remove batch idx\n        if rois.size(1) == 5:\n            rois = rois[:, 1:]\n        abs_img_points = rel_roi_points.clone()\n        # To avoid an error during exporting to onnx use independent\n        # variables instead inplace computation\n        xs = abs_img_points[:, :, 0] * (rois[:, None, 2] - rois[:, None, 0])\n        ys = abs_img_points[:, :, 1] * (rois[:, None, 3] - rois[:, None, 1])\n        xs += rois[:, None, 0]\n        ys += rois[:, None, 1]\n        abs_img_points = 
torch.stack([xs, ys], dim=2)\n    return abs_img_points\n\n\ndef get_shape_from_feature_map(x: Tensor) -> Tensor:\n    \"\"\"Get spatial resolution of input feature map considering exporting to\n    onnx mode.\n\n    Args:\n        x (torch.Tensor): Input tensor, shape (N, C, H, W)\n\n    Returns:\n        torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2)\n    \"\"\"\n    img_shape = torch.tensor(x.shape[2:]).flip(0).view(1, 1,\n                                                       2).to(x.device).float()\n    return img_shape\n\n\ndef abs_img_point_to_rel_img_point(abs_img_points: Tensor,\n                                   img: Union[tuple, Tensor],\n                                   spatial_scale: float = 1.) -> Tensor:\n    \"\"\"Convert image based absolute point coordinates to image based relative\n    coordinates for sampling.\n\n    Args:\n        abs_img_points (torch.Tensor): Image based absolute point coordinates,\n            shape (N, P, 2)\n        img (tuple or torch.Tensor): (height, width) of image or feature map.\n        spatial_scale (float, optional): Scale points by this factor.\n            Default: 1.\n\n    Returns:\n        Tensor: Image based relative point coordinates for sampling, shape\n        (N, P, 2).\n    \"\"\"\n\n    assert (isinstance(img, tuple) and len(img) == 2) or \\\n           (isinstance(img, torch.Tensor) and len(img.shape) == 4)\n\n    if isinstance(img, tuple):\n        h, w = img\n        scale = torch.tensor([w, h],\n                             dtype=torch.float,\n                             device=abs_img_points.device)\n        scale = scale.view(1, 1, 2)\n    else:\n        scale = get_shape_from_feature_map(img)\n\n    return abs_img_points / scale * spatial_scale\n\n\ndef rel_roi_point_to_rel_img_point(rois: Tensor,\n                                   rel_roi_points: Tensor,\n                                   img: Union[tuple, Tensor],\n                                   spatial_scale: 
float = 1.) -> Tensor:\n    \"\"\"Convert roi based relative point coordinates to image based absolute\n    point coordinates.\n\n    Args:\n        rois (torch.Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)\n        rel_roi_points (torch.Tensor): Point coordinates inside RoI, relative\n            to RoI, location, range (0, 1), shape (N, P, 2)\n        img (tuple or torch.Tensor): (height, width) of image or feature map.\n        spatial_scale (float, optional): Scale points by this factor.\n            Default: 1.\n\n    Returns:\n        torch.Tensor: Image based relative point coordinates for sampling,\n        shape (N, P, 2).\n    \"\"\"\n\n    abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points)\n    rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img,\n                                                   spatial_scale)\n\n    return rel_img_point\n\n\ndef point_sample(input: Tensor,\n                 points: Tensor,\n                 align_corners: bool = False,\n                 **kwargs) -> Tensor:\n    \"\"\"A wrapper around :func:`grid_sample` to support 3D point_coords tensors\n    Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to\n    lie inside ``[0, 1] x [0, 1]`` square.\n\n    Args:\n        input (torch.Tensor): Feature map, shape (N, C, H, W).\n        points (torch.Tensor): Image based absolute point coordinates\n            (normalized), range [0, 1] x [0, 1], shape (N, P, 2) or\n            (N, Hgrid, Wgrid, 2).\n        align_corners (bool, optional): Whether align_corners.\n            Default: False\n\n    Returns:\n        torch.Tensor: Features of `point` on `input`, shape (N, C, P) or\n        (N, C, Hgrid, Wgrid).\n    \"\"\"\n\n    add_dim = False\n    if points.dim() == 3:\n        add_dim = True\n        points = points.unsqueeze(2)\n    output = F.grid_sample(\n        input, denormalize(points), align_corners=align_corners, **kwargs)\n    if add_dim:\n        output = 
output.squeeze(3)\n    return output\n\n\nclass SimpleRoIAlign(nn.Module):\n\n    def __init__(self,\n                 output_size: Tuple[int],\n                 spatial_scale: float,\n                 aligned: bool = True) -> None:\n        \"\"\"Simple RoI align in PointRend, faster than standard RoIAlign.\n\n        Args:\n            output_size (tuple[int]): h, w\n            spatial_scale (float): scale the input boxes by this number\n            aligned (bool): if False, use the legacy implementation in\n                MMDetection, align_corners=True will be used in F.grid_sample.\n                If True, align the results more perfectly.\n        \"\"\"\n\n        super().__init__()\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n        # to be consistent with other RoI ops\n        self.use_torchvision = False\n        self.aligned = aligned\n\n    def forward(self, features: Tensor, rois: Tensor) -> Tensor:\n        num_imgs = features.size(0)\n        num_rois = rois.size(0)\n        rel_roi_points = generate_grid(\n            num_rois, self.output_size, device=rois.device)\n\n        point_feats = []\n        for batch_ind in range(num_imgs):\n            # unravel batch dim\n            feat = features[batch_ind].unsqueeze(0)\n            inds = (rois[:, 0].long() == batch_ind)\n            if inds.any():\n                rel_img_points = rel_roi_point_to_rel_img_point(\n                    rois[inds], rel_roi_points[inds], feat,\n                    self.spatial_scale).unsqueeze(0)\n                point_feat = point_sample(\n                    feat, rel_img_points, align_corners=not self.aligned)\n                point_feat = point_feat.squeeze(0).transpose(0, 1)\n                point_feats.append(point_feat)\n\n        point_feats_t = torch.cat(point_feats, dim=0)\n\n        channels = features.size(1)\n        roi_feats = point_feats_t.reshape(num_rois, channels,\n                             
             *self.output_size)\n\n        return roi_feats\n\n    def __repr__(self) -> str:\n        format_str = self.__class__.__name__\n        format_str += '(output_size={}, spatial_scale={}'.format(\n            self.output_size, self.spatial_scale)\n        return format_str\n"
  },
  {
    "path": "mmcv/ops/points_in_boxes.py",
    "content": "import torch\nfrom mmengine.device import is_cuda_available, is_musa_available\nfrom torch import Tensor\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'points_in_boxes_part_forward', 'points_in_boxes_cpu_forward',\n    'points_in_boxes_all_forward'\n])\n\n\ndef points_in_boxes_part(points: Tensor, boxes: Tensor) -> Tensor:\n    \"\"\"Find the box in which each point is (CUDA/MUSA).\n\n    Args:\n        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate.\n        boxes (torch.Tensor): [B, T, 7],\n            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in\n            LiDAR/DEPTH coordinate, (x, y, z) is the bottom center.\n\n    Returns:\n        torch.Tensor: Return the box indices of points with the shape of\n        (B, M). Default background = -1.\n    \"\"\"\n    assert points.shape[0] == boxes.shape[0], \\\n        'Points and boxes should have the same batch size, ' \\\n        f'but got {points.shape[0]} and {boxes.shape[0]}'\n    assert boxes.shape[2] == 7, \\\n        'boxes dimension should be 7, ' \\\n        f'but got unexpected shape {boxes.shape[2]}'\n    assert points.shape[2] == 3, \\\n        'points dimension should be 3, ' \\\n        f'but got unexpected shape {points.shape[2]}'\n    batch_size, num_points, _ = points.shape\n\n    box_idxs_of_pts = points.new_zeros((batch_size, num_points),\n                                       dtype=torch.int).fill_(-1)\n\n    # If manually put the tensor 'points' or 'boxes' on a device\n    # which is not the current device, some temporary variables\n    # will be created on the current device in the cuda/musa op,\n    # and the output will be incorrect.\n    # Therefore, we force the current device to be the same\n    # as the device of the tensors if it was not.\n    # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305\n    # for the incorrect output before the fix.\n    points_device = 
points.get_device()\n    assert points_device == boxes.get_device(), \\\n        'Points and boxes should be put on the same device'\n    if points.device.type != 'npu':\n        if is_cuda_available():\n            if torch.cuda.current_device() != points_device:\n                torch.cuda.set_device(points_device)\n        elif is_musa_available():\n            if torch.musa.current_device() != points_device:\n                torch.musa.set_device(points_device)\n    else:\n        boxes[:, :, 2] += boxes[:, :, 5] / 2.0\n\n    ext_module.points_in_boxes_part_forward(boxes.contiguous(),\n                                            points.contiguous(),\n                                            box_idxs_of_pts)\n\n    return box_idxs_of_pts\n\n\ndef points_in_boxes_cpu(points: Tensor, boxes: Tensor) -> Tensor:\n    \"\"\"Find all boxes in which each point is (CPU). The CPU version of\n    :meth:`points_in_boxes_all`.\n\n    Args:\n        points (torch.Tensor): [B, M, 3], [x, y, z] in\n            LiDAR/DEPTH coordinate\n        boxes (torch.Tensor): [B, T, 7],\n            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],\n            (x, y, z) is the bottom center.\n\n    Returns:\n        torch.Tensor: Return the box indices of points with the shape of\n        (B, M, T). 
Default background = 0.\n    \"\"\"\n    assert points.shape[0] == boxes.shape[0], \\\n        'Points and boxes should have the same batch size, ' \\\n        f'but got {points.shape[0]} and {boxes.shape[0]}'\n    assert boxes.shape[2] == 7, \\\n        'boxes dimension should be 7, ' \\\n        f'but got unexpected shape {boxes.shape[2]}'\n    assert points.shape[2] == 3, \\\n        'points dimension should be 3, ' \\\n        f'but got unexpected shape {points.shape[2]}'\n    batch_size, num_points, _ = points.shape\n    num_boxes = boxes.shape[1]\n\n    point_indices = points.new_zeros((batch_size, num_boxes, num_points),\n                                     dtype=torch.int)\n    for b in range(batch_size):\n        ext_module.points_in_boxes_cpu_forward(boxes[b].float().contiguous(),\n                                               points[b].float().contiguous(),\n                                               point_indices[b])\n    point_indices = point_indices.transpose(1, 2)\n\n    return point_indices\n\n\ndef points_in_boxes_all(points: Tensor, boxes: Tensor) -> Tensor:\n    \"\"\"Find all boxes in which each point is (CUDA/MUSA).\n\n    Args:\n        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate\n        boxes (torch.Tensor): [B, T, 7],\n            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],\n            (x, y, z) is the bottom center.\n\n    Returns:\n        torch.Tensor: Return the box indices of points with the shape of\n        (B, M, T). 
Default background = 0.\n    \"\"\"\n    assert boxes.shape[0] == points.shape[0], \\\n        'Points and boxes should have the same batch size, ' \\\n        f'but got {boxes.shape[0]} and {boxes.shape[0]}'\n    assert boxes.shape[2] == 7, \\\n        'boxes dimension should be 7, ' \\\n        f'but got unexpected shape {boxes.shape[2]}'\n    assert points.shape[2] == 3, \\\n        'points dimension should be 3, ' \\\n        f'but got unexpected shape {points.shape[2]}'\n    batch_size, num_points, _ = points.shape\n    num_boxes = boxes.shape[1]\n\n    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),\n                                       dtype=torch.int).fill_(0)\n\n    # Same reason as line 25-32\n    points_device = points.get_device()\n    assert points_device == boxes.get_device(), \\\n        'Points and boxes should be put on the same device'\n    if points.device.type != 'npu':\n        if is_cuda_available():\n            if torch.cuda.current_device() != points_device:\n                torch.cuda.set_device(points_device)\n        elif is_musa_available():\n            if torch.musa.current_device() != points_device:\n                torch.musa.set_device(points_device)\n\n    ext_module.points_in_boxes_all_forward(boxes.contiguous(),\n                                           points.contiguous(),\n                                           box_idxs_of_pts)\n\n    return box_idxs_of_pts\n"
  },
  {
    "path": "mmcv/ops/points_in_polygons.py",
    "content": "import torch\nfrom torch import Tensor\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['points_in_polygons_forward'])\n\n\ndef points_in_polygons(points: Tensor, polygons: Tensor) -> Tensor:\n    \"\"\"Judging whether points are inside polygons, which is used in the ATSS\n    assignment for the rotated boxes.\n\n    It should be noted that when the point is just at the polygon boundary, the\n    judgment will be inaccurate, but the effect on assignment is limited.\n\n    Args:\n        points (torch.Tensor): It has shape (B, 2), indicating (x, y).\n            M means the number of predicted points.\n        polygons (torch.Tensor): It has shape (M, 8), indicating\n            (x1, y1, x2, y2, x3, y3, x4, y4). M means the number of\n            ground truth polygons.\n\n    Returns:\n        torch.Tensor: Return the result with the shape of (B, M),\n        1 indicates that the point is inside the polygon,\n        0 indicates that the point is outside the polygon.\n    \"\"\"\n    assert points.shape[1] == 2, \\\n        'points dimension should be 2, ' \\\n        f'but got unexpected shape {points.shape[1]}'\n    assert polygons.shape[1] == 8, \\\n        'polygons dimension should be 8, ' \\\n        f'but got unexpected shape {polygons.shape[1]}'\n    output = torch.zeros(\n        points.shape[0],\n        polygons.shape[0],\n        dtype=torch.float32,\n        device=points.device)\n    ext_module.points_in_polygons_forward(points.contiguous(),\n                                          polygons.contiguous(), output)\n    return output\n"
  },
  {
    "path": "mmcv/ops/points_sampler.py",
    "content": "from typing import List\n\nimport torch\nfrom torch import Tensor\nfrom torch import nn as nn\n\nfrom .furthest_point_sample import (furthest_point_sample,\n                                    furthest_point_sample_with_dist)\n\n\ndef calc_square_dist(point_feat_a: Tensor,\n                     point_feat_b: Tensor,\n                     norm: bool = True) -> Tensor:\n    \"\"\"Calculating square distance between a and b.\n\n    Args:\n        point_feat_a (torch.Tensor): (B, N, C) Feature vector of each point.\n        point_feat_b (torch.Tensor): (B, M, C) Feature vector of each point.\n        norm (bool, optional): Whether to normalize the distance.\n            Default: True.\n\n    Returns:\n        torch.Tensor: (B, N, M) Square distance between each point pair.\n    \"\"\"\n    num_channel = point_feat_a.shape[-1]\n    dist = torch.cdist(point_feat_a, point_feat_b)\n    if norm:\n        dist = dist / num_channel\n    else:\n        dist = torch.square(dist)\n    return dist\n\n\ndef get_sampler_cls(sampler_type: str) -> nn.Module:\n    \"\"\"Get the type and mode of points sampler.\n\n    Args:\n        sampler_type (str): The type of points sampler.\n            The valid value are \"D-FPS\", \"F-FPS\", or \"FS\".\n\n    Returns:\n        class: Points sampler type.\n    \"\"\"\n    sampler_mappings = {\n        'D-FPS': DFPSSampler,\n        'F-FPS': FFPSSampler,\n        'FS': FSSampler,\n    }\n    try:\n        return sampler_mappings[sampler_type]\n    except KeyError:\n        raise KeyError(\n            f'Supported `sampler_type` are {sampler_mappings.keys()}, but got \\\n                {sampler_type}')\n\n\nclass PointsSampler(nn.Module):\n    \"\"\"Points sampling.\n\n    Args:\n        num_point (list[int]): Number of sample points.\n        fps_mod_list (list[str], optional): Type of FPS method, valid mod\n            ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS'].\n            F-FPS: using feature distances for FPS.\n           
 D-FPS: using Euclidean distances of points for FPS.\n            FS: using F-FPS and D-FPS simultaneously.\n        fps_sample_range_list (list[int], optional):\n            Range of points to apply FPS. Default: [-1].\n    \"\"\"\n\n    def __init__(self,\n                 num_point: List[int],\n                 fps_mod_list: List[str] = ['D-FPS'],\n                 fps_sample_range_list: List[int] = [-1]) -> None:\n        super().__init__()\n        # FPS would be applied to different fps_mod in the list,\n        # so the length of the num_point should be equal to\n        # fps_mod_list and fps_sample_range_list.\n        assert len(num_point) == len(fps_mod_list) == len(\n            fps_sample_range_list)\n        self.num_point = num_point\n        self.fps_sample_range_list = fps_sample_range_list\n        self.samplers = nn.ModuleList()\n        for fps_mod in fps_mod_list:\n            self.samplers.append(get_sampler_cls(fps_mod)())\n        self.fp16_enabled = False\n\n    def forward(self, points_xyz: Tensor, features: Tensor) -> Tensor:\n        \"\"\"\n        Args:\n            points_xyz (torch.Tensor): (B, N, 3) xyz coordinates of\n                the points.\n            features (torch.Tensor): (B, C, N) features of the points.\n\n        Returns:\n            torch.Tensor: (B, npoint, sample_num) Indices of sampled points.\n        \"\"\"\n        if points_xyz.dtype == torch.half:\n            points_xyz = points_xyz.to(torch.float32)\n        if features is not None and features.dtype == torch.half:\n            features = features.to(torch.float32)\n\n        indices = []\n        last_fps_end_index = 0\n        for fps_sample_range, sampler, npoint in zip(\n                self.fps_sample_range_list, self.samplers, self.num_point):\n            assert fps_sample_range < points_xyz.shape[1]\n\n            if fps_sample_range == -1:\n                sample_points_xyz = points_xyz[:, last_fps_end_index:]\n                if features is not 
None:\n                    sample_features = features[:, :, last_fps_end_index:]\n                else:\n                    sample_features = None\n            else:\n                sample_points_xyz = points_xyz[:, last_fps_end_index:\n                                               fps_sample_range]\n                if features is not None:\n                    sample_features = features[:, :, last_fps_end_index:\n                                               fps_sample_range]\n                else:\n                    sample_features = None\n\n            fps_idx = sampler(sample_points_xyz.contiguous(), sample_features,\n                              npoint)\n\n            indices.append(fps_idx + last_fps_end_index)\n            last_fps_end_index = fps_sample_range\n        indices = torch.cat(indices, dim=1)\n\n        return indices\n\n\nclass DFPSSampler(nn.Module):\n    \"\"\"Using Euclidean distances of points for FPS.\"\"\"\n\n    def __init__(self) -> None:\n        super().__init__()\n\n    def forward(self, points: Tensor, features: Tensor, npoint: int) -> Tensor:\n        \"\"\"Sampling points with D-FPS.\"\"\"\n        fps_idx = furthest_point_sample(points.contiguous(), npoint)\n        return fps_idx\n\n\nclass FFPSSampler(nn.Module):\n    \"\"\"Using feature distances for FPS.\"\"\"\n\n    def __init__(self) -> None:\n        super().__init__()\n\n    def forward(self, points: Tensor, features: Tensor, npoint: int) -> Tensor:\n        \"\"\"Sampling points with F-FPS.\"\"\"\n        assert features is not None, \\\n            'feature input to FFPS_Sampler should not be None'\n        features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2)\n        features_dist = calc_square_dist(\n            features_for_fps, features_for_fps, norm=False)\n        fps_idx = furthest_point_sample_with_dist(features_dist, npoint)\n        return fps_idx\n\n\nclass FSSampler(nn.Module):\n    \"\"\"Using F-FPS and D-FPS 
simultaneously.\"\"\"\n\n    def __init__(self) -> None:\n        super().__init__()\n\n    def forward(self, points: Tensor, features: Tensor, npoint: int) -> Tensor:\n        \"\"\"Sampling points with FS_Sampling.\"\"\"\n        assert features is not None, \\\n            'feature input to FS_Sampler should not be None'\n        ffps_sampler = FFPSSampler()\n        dfps_sampler = DFPSSampler()\n        fps_idx_ffps = ffps_sampler(points, features, npoint)\n        fps_idx_dfps = dfps_sampler(points, features, npoint)\n        fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1)\n        return fps_idx\n"
  },
  {
    "path": "mmcv/ops/prroi_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.utils.dl_utils import TORCH_VERSION\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext',\n    ['prroi_pool_forward', 'prroi_pool_backward', 'prroi_pool_coor_backward'])\n\n\nclass PrRoIPoolFunction(Function):\n\n    @staticmethod\n    def symbolic(g, features, rois, output_size, spatial_scale):\n        return g.op(\n            'mmcv::PrRoIPool',\n            features,\n            rois,\n            pooled_height_i=int(output_size[0]),\n            pooled_width_i=int(output_size[1]),\n            spatial_scale_f=float(spatial_scale))\n\n    @staticmethod\n    def forward(ctx,\n                features: torch.Tensor,\n                rois: torch.Tensor,\n                output_size: Tuple,\n                spatial_scale: float = 1.0) -> torch.Tensor:\n        if features.dtype != torch.float32 or rois.dtype != torch.float32:\n            raise ValueError('Precise RoI Pooling only takes float input, got '\n                             f'{features.dtype()} for features and'\n                             f'{rois.dtype()} for rois.')\n\n        pooled_height = int(output_size[0])\n        pooled_width = int(output_size[1])\n        spatial_scale = float(spatial_scale)\n\n        features = features.contiguous()\n        rois = rois.contiguous()\n        output_shape = (rois.size(0), features.size(1), pooled_height,\n                        pooled_width)\n        output = features.new_zeros(output_shape)\n        params = (pooled_height, pooled_width, spatial_scale)\n\n        ext_module.prroi_pool_forward(\n            features,\n            rois,\n            output,\n            pooled_height=params[0],\n            pooled_width=params[1],\n            
spatial_scale=params[2])\n        ctx.params = params\n        # everything here is contiguous.\n        ctx.save_for_backward(features, rois, output)\n\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(\n        ctx, grad_output: torch.Tensor\n    ) -> Tuple[torch.Tensor, torch.Tensor, None, None, None]:\n        features, rois, output = ctx.saved_tensors\n        grad_input = grad_output.new_zeros(*features.shape)\n        grad_coor = grad_output.new_zeros(*rois.shape)\n\n        if features.requires_grad or TORCH_VERSION == 'parrots':\n            grad_output = grad_output.contiguous()\n            ext_module.prroi_pool_backward(\n                grad_output,\n                rois,\n                grad_input,\n                pooled_height=ctx.params[0],\n                pooled_width=ctx.params[1],\n                spatial_scale=ctx.params[2])\n        if rois.requires_grad or TORCH_VERSION == 'parrots':\n            grad_output = grad_output.contiguous()\n            ext_module.prroi_pool_coor_backward(\n                output,\n                grad_output,\n                features,\n                rois,\n                grad_coor,\n                pooled_height=ctx.params[0],\n                pooled_width=ctx.params[1],\n                spatial_scale=ctx.params[2])\n\n        return grad_input, grad_coor, None, None, None\n\n\nprroi_pool = PrRoIPoolFunction.apply\n\n\nclass PrRoIPool(nn.Module):\n    \"\"\"The operation of precision RoI pooling. The implementation of PrRoIPool\n    is modified from https://github.com/vacancy/PreciseRoIPooling/\n\n    Precise RoI Pooling (PrRoIPool) is an integration-based (bilinear\n    interpolation) average pooling method for RoI Pooling. It avoids any\n    quantization and has a continuous gradient on bounding box coordinates.\n    It is:\n\n    1. different from the original RoI Pooling proposed in Fast R-CNN. 
PrRoI\n    Pooling uses average pooling instead of max pooling for each bin and has a\n    continuous gradient on bounding box coordinates. That is, one can take the\n    derivatives of some loss function w.r.t the coordinates of each RoI and\n    optimize the RoI coordinates.\n    2. different from the RoI Align proposed in Mask R-CNN. PrRoI Pooling uses\n    a full integration-based average pooling instead of sampling a constant\n    number of points. This makes the gradient w.r.t. the coordinates\n    continuous.\n\n    Args:\n        output_size (Union[int, tuple]): h, w.\n        spatial_scale (float, optional): scale the input boxes by this number.\n            Defaults to 1.0.\n    \"\"\"\n\n    def __init__(self,\n                 output_size: Union[int, tuple],\n                 spatial_scale: float = 1.0):\n        super().__init__()\n\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n\n    def forward(self, features: torch.Tensor,\n                rois: torch.Tensor) -> torch.Tensor:\n        \"\"\"Forward function.\n\n        Args:\n            features (torch.Tensor): The feature map.\n            rois (torch.Tensor): The RoI bboxes in [tl_x, tl_y, br_x, br_y]\n                format.\n\n        Returns:\n            torch.Tensor: The pooled results.\n        \"\"\"\n        return prroi_pool(features, rois, self.output_size, self.spatial_scale)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(output_size={self.output_size}, '\n        s += f'spatial_scale={self.spatial_scale})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/psa_mask.py",
    "content": "# Modified from https://github.com/hszhao/semseg/blob/master/lib/psa\nfrom typing import Optional, Tuple\n\nimport torch\nfrom torch import nn\nfrom torch.autograd import Function\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext',\n                                 ['psamask_forward', 'psamask_backward'])\n\n\nclass PSAMaskFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, psa_type, mask_size):\n        return g.op(\n            'mmcv::MMCVPSAMask',\n            input,\n            psa_type_i=psa_type,\n            mask_size_i=mask_size)\n\n    @staticmethod\n    def forward(ctx, input: torch.Tensor, psa_type: str,\n                mask_size: int) -> torch.Tensor:\n        ctx.psa_type = psa_type\n        ctx.mask_size = _pair(mask_size)\n        ctx.save_for_backward(input)\n\n        h_mask, w_mask = ctx.mask_size\n        batch_size, channels, h_feature, w_feature = input.size()\n        assert channels == h_mask * w_mask\n        output = input.new_zeros(\n            (batch_size, h_feature * w_feature, h_feature, w_feature))\n\n        ext_module.psamask_forward(\n            input,\n            output,\n            psa_type=psa_type,\n            num_=batch_size,\n            h_feature=h_feature,\n            w_feature=w_feature,\n            h_mask=h_mask,\n            w_mask=w_mask,\n            half_h_mask=(h_mask - 1) // 2,\n            half_w_mask=(w_mask - 1) // 2)\n        return output\n\n    @staticmethod\n    def backward(\n            ctx, grad_output: torch.Tensor\n    ) -> Tuple[torch.Tensor, None, None, None]:\n        input = ctx.saved_tensors[0]\n        psa_type = ctx.psa_type\n        h_mask, w_mask = ctx.mask_size\n        batch_size, channels, h_feature, w_feature = input.size()\n        grad_input = grad_output.new_zeros(\n            (batch_size, channels, h_feature, w_feature))\n        ext_module.psamask_backward(\n            
grad_output,\n            grad_input,\n            psa_type=psa_type,\n            num_=batch_size,\n            h_feature=h_feature,\n            w_feature=w_feature,\n            h_mask=h_mask,\n            w_mask=w_mask,\n            half_h_mask=(h_mask - 1) // 2,\n            half_w_mask=(w_mask - 1) // 2)\n        return grad_input, None, None, None\n\n\npsa_mask = PSAMaskFunction.apply\n\n\nclass PSAMask(nn.Module):\n\n    def __init__(self, psa_type: str, mask_size: Optional[tuple] = None):\n        super().__init__()\n        assert psa_type in ['collect', 'distribute']\n        if psa_type == 'collect':\n            psa_type_enum = 0\n        else:\n            psa_type_enum = 1\n        self.psa_type_enum = psa_type_enum\n        self.mask_size = mask_size\n        self.psa_type = psa_type\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        return psa_mask(input, self.psa_type_enum, self.mask_size)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(psa_type={self.psa_type}, '\n        s += f'mask_size={self.mask_size})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/riroi_align_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.utils import is_tuple_of\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['riroi_align_rotated_forward', 'riroi_align_rotated_backward'])\n\n\nclass RiRoIAlignRotatedFunction(Function):\n\n    @staticmethod\n    def forward(ctx: Any,\n                features: torch.Tensor,\n                rois: torch.Tensor,\n                out_size: Union[int, tuple],\n                spatial_scale: float,\n                num_samples: int = 0,\n                num_orientations: int = 8,\n                clockwise: bool = False) -> torch.Tensor:\n        if isinstance(out_size, int):\n            out_h = out_size\n            out_w = out_size\n        elif is_tuple_of(out_size, int):\n            assert len(out_size) == 2\n            out_h, out_w = out_size\n        else:\n            raise TypeError(\n                f'\"out_size\" should be an integer or tuple of integers,'\n                f' but got {out_size}')\n        ctx.spatial_scale = spatial_scale\n        ctx.num_samples = num_samples\n        ctx.num_orientations = num_orientations\n        ctx.clockwise = clockwise\n        ctx.save_for_backward(rois)\n        ctx.feature_size = features.size()\n\n        batch_size, num_channels, _, _ = features.size()\n        num_rois = rois.size(0)\n\n        output = features.new_zeros(num_rois, num_channels, out_h, out_w)\n\n        ext_module.riroi_align_rotated_forward(\n            features,\n            rois,\n            output,\n            pooled_height=out_h,\n            pooled_width=out_w,\n            spatial_scale=spatial_scale,\n            num_samples=num_samples,\n            num_orientations=num_orientations,\n            clockwise=clockwise)\n        return output\n\n    @staticmethod\n    def backward(\n        ctx: Any, 
grad_output: torch.Tensor\n    ) -> Optional[Tuple[torch.Tensor, None, None, None, None, None, None]]:\n        feature_size = ctx.feature_size\n        spatial_scale = ctx.spatial_scale\n        num_orientations = ctx.num_orientations\n        clockwise = ctx.clockwise\n        num_samples = ctx.num_samples\n        rois = ctx.saved_tensors[0]\n        assert feature_size is not None\n        batch_size, num_channels, feature_h, feature_w = feature_size\n\n        out_w = grad_output.size(3)\n        out_h = grad_output.size(2)\n\n        grad_input = None\n\n        if ctx.needs_input_grad[0]:\n            grad_input = rois.new_zeros(batch_size, num_channels, feature_h,\n                                        feature_w)\n            ext_module.riroi_align_rotated_backward(\n                grad_output.contiguous(),\n                rois,\n                grad_input,\n                pooled_height=out_h,\n                pooled_width=out_w,\n                spatial_scale=spatial_scale,\n                num_samples=num_samples,\n                num_orientations=num_orientations,\n                clockwise=clockwise)\n\n            return grad_input, None, None, None, None, None, None\n        return None\n\n\nriroi_align_rotated = RiRoIAlignRotatedFunction.apply\n\n\nclass RiRoIAlignRotated(nn.Module):\n    \"\"\"Rotation-invariant RoI align pooling layer for rotated proposals.\n\n    It accepts a feature map of shape (N, C, H, W) and rois with shape\n    (n, 6) with each roi decoded as (batch_index, center_x, center_y,\n    w, h, angle). 
The angle is in radian.\n\n    The details are described in the paper `ReDet: A Rotation-equivariant\n    Detector for Aerial Object Detection  <https://arxiv.org/abs/2103.07733>`_.\n\n    Args:\n        out_size (tuple): fixed dimensional RoI output with shape (h, w).\n        spatial_scale (float): scale the input boxes by this number\n        num_samples (int): number of inputs samples to take for each\n            output sample. 0 to take samples densely for current models.\n        num_orientations (int): number of oriented channels.\n        clockwise (bool): If True, the angle in each proposal follows a\n            clockwise fashion in image space, otherwise, the angle is\n            counterclockwise. Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 out_size: tuple,\n                 spatial_scale: float,\n                 num_samples: int = 0,\n                 num_orientations: int = 8,\n                 clockwise: bool = False):\n        super().__init__()\n\n        self.out_size = out_size\n        self.spatial_scale = float(spatial_scale)\n        self.num_samples = int(num_samples)\n        self.num_orientations = int(num_orientations)\n        self.clockwise = clockwise\n\n    def forward(self, features: torch.Tensor,\n                rois: torch.Tensor) -> torch.Tensor:\n        return RiRoIAlignRotatedFunction.apply(features, rois, self.out_size,\n                                               self.spatial_scale,\n                                               self.num_samples,\n                                               self.num_orientations,\n                                               self.clockwise)\n"
  },
  {
    "path": "mmcv/ops/roi_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.utils import deprecated_api_warning\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext',\n                                 ['roi_align_forward', 'roi_align_backward'])\n\n\nclass RoIAlignFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,\n                 pool_mode, aligned):\n        from torch.onnx import TensorProtoDataType\n        from torch.onnx.symbolic_opset9 import sub\n\n        def _select(g, self, dim, index):\n            return g.op('Gather', self, index, axis_i=dim)\n\n        # batch_indices = rois[:, 0].long()\n        batch_indices = _select(\n            g, rois, 1,\n            g.op('Constant', value_t=torch.tensor([0], dtype=torch.long)))\n        batch_indices = g.op('Squeeze', batch_indices, axes_i=[1])\n        batch_indices = g.op(\n            'Cast', batch_indices, to_i=TensorProtoDataType.INT64)\n        # rois = rois[:, 1:]\n        rois = _select(\n            g, rois, 1,\n            g.op(\n                'Constant',\n                value_t=torch.tensor([1, 2, 3, 4], dtype=torch.long)))\n\n        if aligned:\n            # rois -= 0.5/spatial_scale\n            aligned_offset = g.op(\n                'Constant',\n                value_t=torch.tensor([0.5 / spatial_scale],\n                                     dtype=torch.float32))\n            rois = sub(g, rois, aligned_offset)\n        # roi align\n        return g.op(\n            'RoiAlign',\n            input,\n            rois,\n            batch_indices,\n            output_height_i=output_size[0],\n            output_width_i=output_size[1],\n            spatial_scale_f=spatial_scale,\n            
sampling_ratio_i=max(0, sampling_ratio),\n            mode_s=pool_mode)\n\n    @staticmethod\n    def forward(ctx: Any,\n                input: torch.Tensor,\n                rois: torch.Tensor,\n                output_size: int,\n                spatial_scale: float = 1.0,\n                sampling_ratio: int = 0,\n                pool_mode: str = 'avg',\n                aligned: bool = True) -> torch.Tensor:\n        ctx.output_size = _pair(output_size)\n        ctx.spatial_scale = spatial_scale\n        ctx.sampling_ratio = sampling_ratio\n        assert pool_mode in ('max', 'avg')\n        ctx.pool_mode = 0 if pool_mode == 'max' else 1\n        ctx.aligned = aligned\n        ctx.input_shape = input.size()\n\n        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'\n\n        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],\n                        ctx.output_size[1])\n        output = input.new_zeros(output_shape)\n        if ctx.pool_mode == 0:\n            argmax_y = input.new_zeros(output_shape)\n            argmax_x = input.new_zeros(output_shape)\n        else:\n            argmax_y = input.new_zeros(0)\n            argmax_x = input.new_zeros(0)\n\n        ext_module.roi_align_forward(\n            input,\n            rois,\n            output,\n            argmax_y,\n            argmax_x,\n            aligned_height=ctx.output_size[0],\n            aligned_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            pool_mode=ctx.pool_mode,\n            aligned=ctx.aligned)\n\n        ctx.save_for_backward(rois, argmax_y, argmax_x)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:\n        rois, argmax_y, argmax_x = ctx.saved_tensors\n        grad_input = grad_output.new_zeros(ctx.input_shape)\n        # complex head architecture may cause grad_output uncontiguous.\n      
  grad_output = grad_output.contiguous()\n        ext_module.roi_align_backward(\n            grad_output,\n            rois,\n            argmax_y,\n            argmax_x,\n            grad_input,\n            aligned_height=ctx.output_size[0],\n            aligned_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            pool_mode=ctx.pool_mode,\n            aligned=ctx.aligned)\n        return grad_input, None, None, None, None, None, None\n\n\nroi_align = RoIAlignFunction.apply\n\n\nclass RoIAlign(nn.Module):\n    \"\"\"RoI align pooling layer.\n\n    Args:\n        output_size (tuple): h, w\n        spatial_scale (float): scale the input boxes by this number\n        sampling_ratio (int): number of inputs samples to take for each\n            output sample. 0 to take samples densely for current models.\n        pool_mode (str, 'avg' or 'max'): pooling mode in each bin.\n        aligned (bool): if False, use the legacy implementation in\n            MMDetection. If True, align the results more perfectly.\n        use_torchvision (bool): whether to use roi_align from torchvision.\n\n    Note:\n        The implementation of RoIAlign when aligned=True is modified from\n        https://github.com/facebookresearch/detectron2/\n\n        The meaning of aligned=True:\n\n        Given a continuous coordinate c, its two neighboring pixel\n        indices (in our pixel model) are computed by floor(c - 0.5) and\n        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete\n        indices [0] and [1] (which are sampled from the underlying signal\n        at continuous coordinates 0.5 and 1.5). 
But the original roi_align\n        (aligned=False) does not subtract the 0.5 when computing\n        neighboring pixel indices and therefore it uses pixels with a\n        slightly incorrect alignment (relative to our pixel model) when\n        performing bilinear interpolation.\n\n        With `aligned=True`,\n        we first appropriately scale the ROI and then shift it by -0.5\n        prior to calling roi_align. This produces the correct neighbors;\n\n        The difference does not make a difference to the model's\n        performance if ROIAlign is used together with conv layers.\n    \"\"\"\n\n    @deprecated_api_warning(\n        {\n            'out_size': 'output_size',\n            'sample_num': 'sampling_ratio'\n        },\n        cls_name='RoIAlign')\n    def __init__(self,\n                 output_size: tuple,\n                 spatial_scale: float = 1.0,\n                 sampling_ratio: int = 0,\n                 pool_mode: str = 'avg',\n                 aligned: bool = True,\n                 use_torchvision: bool = False):\n        super().__init__()\n\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n        self.sampling_ratio = int(sampling_ratio)\n        self.pool_mode = pool_mode\n        self.aligned = aligned\n        self.use_torchvision = use_torchvision\n\n    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            input: NCHW images\n            rois: Bx5 boxes. 
First column is the index into N.\\\n                The other 4 columns are xyxy.\n        \"\"\"\n        if self.use_torchvision:\n            from torchvision.ops import roi_align as tv_roi_align\n            if 'aligned' in tv_roi_align.__code__.co_varnames:\n                return tv_roi_align(input, rois, self.output_size,\n                                    self.spatial_scale, self.sampling_ratio,\n                                    self.aligned)\n            else:\n                if self.aligned:\n                    rois -= rois.new_tensor([0.] +\n                                            [0.5 / self.spatial_scale] * 4)\n                return tv_roi_align(input, rois, self.output_size,\n                                    self.spatial_scale, self.sampling_ratio)\n        else:\n            return roi_align(input, rois, self.output_size, self.spatial_scale,\n                             self.sampling_ratio, self.pool_mode, self.aligned)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(output_size={self.output_size}, '\n        s += f'spatial_scale={self.spatial_scale}, '\n        s += f'sampling_ratio={self.sampling_ratio}, '\n        s += f'pool_mode={self.pool_mode}, '\n        s += f'aligned={self.aligned}, '\n        s += f'use_torchvision={self.use_torchvision})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/roi_align_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, Optional, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom mmengine.utils import deprecated_api_warning\nfrom torch.autograd import Function\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['roi_align_rotated_forward', 'roi_align_rotated_backward'])\n\n\nclass RoIAlignRotatedFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,\n                 aligned, clockwise):\n        if isinstance(output_size, int):\n            out_h = output_size\n            out_w = output_size\n        elif isinstance(output_size, tuple):\n            assert len(output_size) == 2\n            assert isinstance(output_size[0], int)\n            assert isinstance(output_size[1], int)\n            out_h, out_w = output_size\n        else:\n            raise TypeError(\n                '\"output_size\" must be an integer or tuple of integers')\n        return g.op(\n            'mmcv::MMCVRoIAlignRotated',\n            input,\n            rois,\n            output_height_i=out_h,\n            output_width_i=out_h,\n            spatial_scale_f=spatial_scale,\n            sampling_ratio_i=sampling_ratio,\n            aligned_i=aligned,\n            clockwise_i=clockwise)\n\n    @staticmethod\n    def forward(ctx: Any,\n                input: torch.Tensor,\n                rois: torch.Tensor,\n                output_size: Union[int, tuple],\n                spatial_scale: float,\n                sampling_ratio: int = 0,\n                aligned: bool = True,\n                clockwise: bool = False) -> torch.Tensor:\n        ctx.output_size = _pair(output_size)\n        ctx.spatial_scale = spatial_scale\n        ctx.sampling_ratio = sampling_ratio\n        ctx.aligned = aligned\n        ctx.clockwise = clockwise\n        ctx.save_for_backward(rois)\n        
ctx.feature_size = input.size()\n\n        batch_size, num_channels, data_height, data_width = input.size()\n        num_rois = rois.size(0)\n\n        output = input.new_zeros(num_rois, num_channels, ctx.output_size[0],\n                                 ctx.output_size[1])\n        ext_module.roi_align_rotated_forward(\n            input,\n            rois,\n            output,\n            pooled_height=ctx.output_size[0],\n            pooled_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale,\n            sampling_ratio=ctx.sampling_ratio,\n            aligned=ctx.aligned,\n            clockwise=ctx.clockwise)\n        return output\n\n    @staticmethod\n    def backward(\n        ctx: Any, grad_output: torch.Tensor\n    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], None, None,\n               None, None, None]:\n        feature_size = ctx.feature_size\n        rois = ctx.saved_tensors[0]\n        assert feature_size is not None\n        batch_size, num_channels, data_height, data_width = feature_size\n\n        out_w = grad_output.size(3)\n        out_h = grad_output.size(2)\n\n        grad_input = grad_rois = None\n\n        if ctx.needs_input_grad[0]:\n            grad_input = rois.new_zeros(batch_size, num_channels, data_height,\n                                        data_width)\n            ext_module.roi_align_rotated_backward(\n                grad_output.contiguous(),\n                rois,\n                grad_input,\n                pooled_height=out_h,\n                pooled_width=out_w,\n                spatial_scale=ctx.spatial_scale,\n                sampling_ratio=ctx.sampling_ratio,\n                aligned=ctx.aligned,\n                clockwise=ctx.clockwise)\n        return grad_input, grad_rois, None, None, None, None, None\n\n\nroi_align_rotated = RoIAlignRotatedFunction.apply\n\n\nclass RoIAlignRotated(nn.Module):\n    \"\"\"RoI align pooling layer for rotated proposals.\n\n    It accepts a feature map 
of shape (N, C, H, W) and rois with shape\n    (n, 6) with each roi decoded as (batch_index, center_x, center_y,\n    w, h, angle). The angle is in radian.\n\n    Args:\n        output_size (tuple): h, w\n        spatial_scale (float): scale the input boxes by this number\n        sampling_ratio(int): number of inputs samples to take for each\n            output sample. 0 to take samples densely for current models.\n        aligned (bool): if False, use the legacy implementation in\n            MMDetection. If True, align the results more perfectly.\n            Default: True.\n        clockwise (bool): If True, the angle in each proposal follows a\n            clockwise fashion in image space, otherwise, the angle is\n            counterclockwise. Default: False.\n\n    Note:\n        The implementation of RoIAlign when aligned=True is modified from\n        https://github.com/facebookresearch/detectron2/\n\n        The meaning of aligned=True:\n\n        Given a continuous coordinate c, its two neighboring pixel\n        indices (in our pixel model) are computed by floor(c - 0.5) and\n        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete\n        indices [0] and [1] (which are sampled from the underlying signal\n        at continuous coordinates 0.5 and 1.5). But the original roi_align\n        (aligned=False) does not subtract the 0.5 when computing\n        neighboring pixel indices and therefore it uses pixels with a\n        slightly incorrect alignment (relative to our pixel model) when\n        performing bilinear interpolation.\n\n        With `aligned=True`,\n        we first appropriately scale the ROI and then shift it by -0.5\n        prior to calling roi_align. 
This produces the correct neighbors.\n\n        This difference does not affect the model's performance if ROIAlign\n        is used together with conv layers.\n    \"\"\"\n\n    @deprecated_api_warning(\n        {\n            'out_size': 'output_size',\n            'sample_num': 'sampling_ratio'\n        },\n        cls_name='RoIAlignRotated')\n    def __init__(self,\n                 output_size: Union[int, tuple],\n                 spatial_scale: float,\n                 sampling_ratio: int = 0,\n                 aligned: bool = True,\n                 clockwise: bool = False):\n        super().__init__()\n\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n        self.sampling_ratio = int(sampling_ratio)\n        self.aligned = aligned\n        self.clockwise = clockwise\n\n    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:\n        return RoIAlignRotatedFunction.apply(input, rois, self.output_size,\n                                             self.spatial_scale,\n                                             self.sampling_ratio, self.aligned,\n                                             self.clockwise)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(output_size={self.output_size}, '\n        s += f'spatial_scale={self.spatial_scale}, '\n        s += f'sampling_ratio={self.sampling_ratio}, '\n        s += f'aligned={self.aligned}, '\n        s += f'clockwise={self.clockwise})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/roi_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, Tuple, Union\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext',\n                                 ['roi_pool_forward', 'roi_pool_backward'])\n\n\nclass RoIPoolFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, rois, output_size, spatial_scale):\n        return g.op(\n            'MaxRoiPool',\n            input,\n            rois,\n            pooled_shape_i=output_size,\n            spatial_scale_f=spatial_scale)\n\n    @staticmethod\n    def forward(ctx: Any,\n                input: torch.Tensor,\n                rois: torch.Tensor,\n                output_size: Union[int, tuple],\n                spatial_scale: float = 1.0) -> torch.Tensor:\n        ctx.output_size = _pair(output_size)\n        ctx.spatial_scale = spatial_scale\n        ctx.input_shape = input.size()\n\n        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'\n\n        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],\n                        ctx.output_size[1])\n        output = input.new_zeros(output_shape)\n        argmax = input.new_zeros(output_shape, dtype=torch.int)\n\n        ext_module.roi_pool_forward(\n            input,\n            rois,\n            output,\n            argmax,\n            pooled_height=ctx.output_size[0],\n            pooled_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale)\n\n        ctx.save_for_backward(rois, argmax)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(\n            ctx: Any, grad_output: torch.Tensor\n    ) -> Tuple[torch.Tensor, None, None, None]:\n        rois, argmax = ctx.saved_tensors\n        grad_input = grad_output.new_zeros(ctx.input_shape)\n\n  
      ext_module.roi_pool_backward(\n            grad_output,\n            rois,\n            argmax,\n            grad_input,\n            pooled_height=ctx.output_size[0],\n            pooled_width=ctx.output_size[1],\n            spatial_scale=ctx.spatial_scale)\n\n        return grad_input, None, None, None\n\n\nroi_pool = RoIPoolFunction.apply\n\n\nclass RoIPool(nn.Module):\n\n    def __init__(self,\n                 output_size: Union[int, tuple],\n                 spatial_scale: float = 1.0):\n        super().__init__()\n\n        self.output_size = _pair(output_size)\n        self.spatial_scale = float(spatial_scale)\n\n    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:\n        return roi_pool(input, rois, self.output_size, self.spatial_scale)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'(output_size={self.output_size}, '\n        s += f'spatial_scale={self.spatial_scale})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/roiaware_pool3d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, Tuple, Union\n\nimport mmengine\nimport torch\nfrom torch import nn as nn\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['roiaware_pool3d_forward', 'roiaware_pool3d_backward'])\n\n\nclass RoIAwarePool3d(nn.Module):\n    \"\"\"Encode the geometry-specific features of each 3D proposal.\n\n    Please refer to `PartA2 <https://arxiv.org/pdf/1907.03670.pdf>`_ for more\n    details.\n\n    Args:\n        out_size (int or tuple): The size of output features. n or\n            [n1, n2, n3].\n        max_pts_per_voxel (int, optional): The maximum number of points per\n            voxel. Default: 128.\n        mode (str, optional): Pooling method of RoIAware, 'max' or 'avg'.\n            Default: 'max'.\n    \"\"\"\n\n    def __init__(self,\n                 out_size: Union[int, tuple],\n                 max_pts_per_voxel: int = 128,\n                 mode: str = 'max'):\n        super().__init__()\n\n        self.out_size = out_size\n        self.max_pts_per_voxel = max_pts_per_voxel\n        assert mode in ['max', 'avg']\n        pool_mapping = {'max': 0, 'avg': 1}\n        self.mode = pool_mapping[mode]\n\n    def forward(self, rois: torch.Tensor, pts: torch.Tensor,\n                pts_feature: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            rois (torch.Tensor): [N, 7], in LiDAR coordinate,\n                (x, y, z) is the bottom center of rois.\n            pts (torch.Tensor): [npoints, 3], coordinates of input points.\n            pts_feature (torch.Tensor): [npoints, C], features of input points.\n\n        Returns:\n            torch.Tensor: Pooled features whose shape is\n            [N, out_x, out_y, out_z, C].\n        \"\"\"\n\n        return RoIAwarePool3dFunction.apply(rois, pts, pts_feature,\n                                            self.out_size,\n                     
                       self.max_pts_per_voxel, self.mode)\n\n\nclass RoIAwarePool3dFunction(Function):\n\n    @staticmethod\n    def forward(ctx: Any, rois: torch.Tensor, pts: torch.Tensor,\n                pts_feature: torch.Tensor, out_size: Union[int, tuple],\n                max_pts_per_voxel: int, mode: int) -> torch.Tensor:\n        \"\"\"\n        Args:\n            rois (torch.Tensor): [N, 7], in LiDAR coordinate,\n                (x, y, z) is the bottom center of rois.\n            pts (torch.Tensor): [npoints, 3], coordinates of input points.\n            pts_feature (torch.Tensor): [npoints, C], features of input points.\n            out_size (int or tuple): The size of output features. n or\n                [n1, n2, n3].\n            max_pts_per_voxel (int): The maximum number of points per voxel.\n                Default: 128.\n            mode (int): Pooling method of RoIAware, 0 (max pool) or 1 (average\n                pool).\n\n        Returns:\n            torch.Tensor: Pooled features whose shape is\n            [N, out_x, out_y, out_z, C].\n        \"\"\"\n\n        if isinstance(out_size, int):\n            out_x = out_y = out_z = out_size\n        else:\n            assert len(out_size) == 3\n            assert mmengine.is_tuple_of(out_size, int)\n            out_x, out_y, out_z = out_size\n\n        num_rois = rois.shape[0]\n        num_channels = pts_feature.shape[-1]\n        num_pts = pts.shape[0]\n\n        pooled_features = pts_feature.new_zeros(\n            (num_rois, out_x, out_y, out_z, num_channels))\n        argmax = pts_feature.new_zeros(\n            (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)\n        pts_idx_of_voxels = pts_feature.new_zeros(\n            (num_rois, out_x, out_y, out_z, max_pts_per_voxel),\n            dtype=torch.int)\n\n        ext_module.roiaware_pool3d_forward(\n            rois,\n            pts,\n            pts_feature,\n            argmax,\n            pts_idx_of_voxels,\n            
pooled_features,\n            pool_method=mode)\n\n        ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,\n                                            num_pts, num_channels)\n        return pooled_features\n\n    @staticmethod\n    def backward(\n        ctx: Any, grad_out: torch.Tensor\n    ) -> Tuple[None, None, torch.Tensor, None, None, None]:\n        ret = ctx.roiaware_pool3d_for_backward\n        pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret\n\n        grad_in = grad_out.new_zeros((num_pts, num_channels))\n        ext_module.roiaware_pool3d_backward(\n            pts_idx_of_voxels,\n            argmax,\n            grad_out.contiguous(),\n            grad_in,\n            pool_method=mode)\n\n        return None, None, grad_in, None, None, None\n"
  },
  {
    "path": "mmcv/ops/roipoint_pool3d.py",
    "content": "from typing import Any, Tuple\n\nimport torch\nfrom torch import nn as nn\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['roipoint_pool3d_forward'])\n\n\nclass RoIPointPool3d(nn.Module):\n    \"\"\"Encode the geometry-specific features of each 3D proposal.\n\n    Please refer to `Paper of PartA2 <https://arxiv.org/pdf/1907.03670.pdf>`_\n    for more details.\n\n    Args:\n        num_sampled_points (int, optional): Number of samples in each roi.\n            Default: 512.\n    \"\"\"\n\n    def __init__(self, num_sampled_points: int = 512):\n        super().__init__()\n        self.num_sampled_points = num_sampled_points\n\n    def forward(self, points: torch.Tensor, point_features: torch.Tensor,\n                boxes3d: torch.Tensor) -> Tuple[torch.Tensor]:\n        \"\"\"\n        Args:\n            points (torch.Tensor): Input points whose shape is (B, N, C).\n            point_features (torch.Tensor): Features of input points whose shape\n                is (B, N, C).\n            boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7).\n\n        Returns:\n            tuple[torch.Tensor]: A tuple contains two elements. The first one\n            is the pooled features whose shape is (B, M, 512, 3 + C). 
The\n            second is an empty flag whose shape is (B, M).\n        \"\"\"\n        return RoIPointPool3dFunction.apply(points, point_features, boxes3d,\n                                            self.num_sampled_points)\n\n\nclass RoIPointPool3dFunction(Function):\n\n    @staticmethod\n    def forward(\n            ctx: Any,\n            points: torch.Tensor,\n            point_features: torch.Tensor,\n            boxes3d: torch.Tensor,\n            num_sampled_points: int = 512\n    ) -> Tuple[torch.Tensor, torch.Tensor]:\n        \"\"\"\n        Args:\n            points (torch.Tensor): Input points whose shape is (B, N, 3).\n            point_features (torch.Tensor): Features of input points whose shape\n                is (B, N, C).\n            boxes3d (torch.Tensor): Input bounding boxes whose shape is\n                (B, M, 7).\n            num_sampled_points (int, optional): The number of sampled points.\n                Default: 512.\n\n        Returns:\n            tuple[torch.Tensor]: A tuple contains two elements. The first one\n            is the pooled features whose shape is (B, M, 512, 3 + C). 
The\n            second is an empty flag whose shape is (B, M).\n        \"\"\"\n        assert len(points.shape) == 3 and points.shape[2] == 3\n        batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[\n            1], point_features.shape[2]\n        pooled_boxes3d = boxes3d.view(batch_size, -1, 7)\n        pooled_features = point_features.new_zeros(\n            (batch_size, boxes_num, num_sampled_points, 3 + feature_len))\n        pooled_empty_flag = point_features.new_zeros(\n            (batch_size, boxes_num)).int()\n\n        ext_module.roipoint_pool3d_forward(points.contiguous(),\n                                           pooled_boxes3d.contiguous(),\n                                           point_features.contiguous(),\n                                           pooled_features, pooled_empty_flag)\n\n        return pooled_features, pooled_empty_flag\n\n    @staticmethod\n    def backward(ctx: Any, grad_out: torch.Tensor) -> torch.Tensor:\n        raise NotImplementedError\n"
  },
  {
    "path": "mmcv/ops/rotated_feature_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any\n\nimport torch\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext',\n    ['rotated_feature_align_forward', 'rotated_feature_align_backward'])\n\n\nclass RotatedFeatureAlignFunction(Function):\n    \"\"\"Using the feature interpolation to obtain the position information\n    correspond to the refined rotate anchors and reconstruct the feature maps\n    in pixel-wise manner to achieve feature alignment.\n\n    The details are described in the paper `R3Det: Refined Single-Stage\n    Detector with Feature Refinement for Rotating Object\n    <https://arxiv.org/abs/1908.05612>`_.\n    \"\"\"\n\n    @staticmethod\n    def symbolic(g, features, best_rbboxes, spatial_scale, points):\n        assert points in [1, 5]\n        return g.op(\n            'mmcv::MMCVRotatedFeatureAlign',\n            features,\n            best_rbboxes,\n            spatial_scale_f=spatial_scale,\n            points_i=points)\n\n    @staticmethod\n    def forward(ctx: Any, features: torch.Tensor, best_rbboxes: torch.Tensor,\n                spatial_scale: float, points: int) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (torch.Tensor): Input features with shape [N,C,H,W].\n            best_rbboxes (torch.Tensor): Refined rotate anchors with\n                shape [N,H,W,5]. Coordinate format (cx,cx,h,w,a).\n            spatial_scale (float): The scale of feature map size and\n                input image size.\n            points (int, optional): The number of sample points.\n                Only 1 and 5 are supported. 
Defaults to 1.\n\n        Returns:\n            torch.Tensor: Refined features with shape [N,C,H,W].\n        \"\"\"\n        ctx.spatial_scale = spatial_scale\n        ctx.points = points\n        ctx.save_for_backward(best_rbboxes)\n        assert points in [1, 5]\n        output = torch.zeros_like(features)\n        ext_module.rotated_feature_align_forward(\n            features,\n            best_rbboxes,\n            output,\n            spatial_scale=spatial_scale,\n            points=points)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:\n        \"\"\"\n        Args:\n            grad_output (torch.Tensor): The gradient of output features\n                with shape [N,C,H,W].\n\n        Returns:\n            torch.Tensor: The gradient of input features with shape [N,C,H,W].\n        \"\"\"\n        best_rbboxes = ctx.saved_tensors[0]\n        points = ctx.points\n        spatial_scale = ctx.spatial_scale\n        grad_input = None\n        if ctx.needs_input_grad[0]:\n            grad_input = torch.zeros_like(grad_output)\n            ext_module.rotated_feature_align_backward(\n                grad_output.contiguous(),\n                best_rbboxes,\n                grad_input,\n                spatial_scale=spatial_scale,\n                points=points)\n        return grad_input, None, None, None\n\n\ndef rotated_feature_align(features: torch.Tensor,\n                          best_rbboxes: torch.Tensor,\n                          spatial_scale: float = 1 / 8,\n                          points: int = 1) -> torch.Tensor:\n    return RotatedFeatureAlignFunction.apply(features, best_rbboxes,\n                                             spatial_scale, points)\n"
  },
  {
    "path": "mmcv/ops/saconv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmengine.model import constant_init\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nfrom mmcv.cnn import ConvAWS2d\nfrom mmcv.ops.deform_conv import deform_conv2d\n\n\n@MODELS.register_module(name='SAC')\nclass SAConv2d(ConvAWS2d):\n    \"\"\"SAC (Switchable Atrous Convolution)\n\n    This is an implementation of `DetectoRS: Detecting Objects with Recursive\n    Feature Pyramid and Switchable Atrous Convolution\n    <https://arxiv.org/abs/2006.02334>`_.\n\n    Args:\n        in_channels (int): Number of channels in the input image\n        out_channels (int): Number of channels produced by the convolution\n        kernel_size (int or tuple): Size of the convolving kernel\n        stride (int or tuple, optional): Stride of the convolution. Default: 1\n        padding (int or tuple, optional): Zero-padding added to both sides of\n            the input. Default: 0\n        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,\n            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``\n        dilation (int or tuple, optional): Spacing between kernel elements.\n            Default: 1\n        groups (int, optional): Number of blocked connections from input\n            channels to output channels. Default: 1\n        bias (bool, optional): If ``True``, adds a learnable bias to the\n            output. Default: ``True``\n        use_deform: If ``True``, replace convolution with deformable\n            convolution. 
Default: ``False``.\n    \"\"\"\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 use_deform=False):\n        super().__init__(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            groups=groups,\n            bias=bias)\n        self.use_deform = use_deform\n        self.switch = nn.Conv2d(\n            self.in_channels, 1, kernel_size=1, stride=stride, bias=True)\n        self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size()))\n        self.pre_context = nn.Conv2d(\n            self.in_channels, self.in_channels, kernel_size=1, bias=True)\n        self.post_context = nn.Conv2d(\n            self.out_channels, self.out_channels, kernel_size=1, bias=True)\n        if self.use_deform:\n            self.offset_s = nn.Conv2d(\n                self.in_channels,\n                18,\n                kernel_size=3,\n                padding=1,\n                stride=stride,\n                bias=True)\n            self.offset_l = nn.Conv2d(\n                self.in_channels,\n                18,\n                kernel_size=3,\n                padding=1,\n                stride=stride,\n                bias=True)\n        self.init_weights()\n\n    def init_weights(self):\n        constant_init(self.switch, 0, bias=1)\n        self.weight_diff.data.zero_()\n        constant_init(self.pre_context, 0)\n        constant_init(self.post_context, 0)\n        if self.use_deform:\n            constant_init(self.offset_s, 0)\n            constant_init(self.offset_l, 0)\n\n    def forward(self, x):\n        # pre-context\n        avg_x = F.adaptive_avg_pool2d(x, output_size=1)\n        avg_x = 
self.pre_context(avg_x)\n        avg_x = avg_x.expand_as(x)\n        x = x + avg_x\n        # switch\n        avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect')\n        avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0)\n        switch = self.switch(avg_x)\n        # sac\n        weight = self._get_weight(self.weight)\n        zero_bias = torch.zeros(\n            self.out_channels, device=weight.device, dtype=weight.dtype)\n\n        if self.use_deform:\n            offset = self.offset_s(avg_x)\n            out_s = deform_conv2d(x, offset, weight, self.stride, self.padding,\n                                  self.dilation, self.groups, 1)\n        else:\n            if (TORCH_VERSION == 'parrots'\n                    or digit_version(TORCH_VERSION) < digit_version('1.5.0')):\n                out_s = super().conv2d_forward(x, weight)\n            elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'):\n                # bias is a required argument of _conv_forward in torch 1.8.0\n                out_s = super()._conv_forward(x, weight, zero_bias)\n            else:\n                out_s = super()._conv_forward(x, weight)\n        ori_p = self.padding\n        ori_d = self.dilation\n        self.padding = tuple(3 * p for p in self.padding)\n        self.dilation = tuple(3 * d for d in self.dilation)\n        weight = weight + self.weight_diff\n        if self.use_deform:\n            offset = self.offset_l(avg_x)\n            out_l = deform_conv2d(x, offset, weight, self.stride, self.padding,\n                                  self.dilation, self.groups, 1)\n        else:\n            if (TORCH_VERSION == 'parrots'\n                    or digit_version(TORCH_VERSION) < digit_version('1.5.0')):\n                out_l = super().conv2d_forward(x, weight)\n            elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'):\n                # bias is a required argument of _conv_forward in torch 1.8.0\n                out_l = 
super()._conv_forward(x, weight, zero_bias)\n            else:\n                out_l = super()._conv_forward(x, weight)\n\n        out = switch * out_s + (1 - switch) * out_l\n        self.padding = ori_p\n        self.dilation = ori_d\n        # post-context\n        avg_x = F.adaptive_avg_pool2d(out, output_size=1)\n        avg_x = self.post_context(avg_x)\n        avg_x = avg_x.expand_as(out)\n        out = out + avg_x\n        return out\n"
  },
  {
    "path": "mmcv/ops/scatter_points.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, List, Optional, Tuple\n\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext',\n    ['dynamic_point_to_voxel_forward', 'dynamic_point_to_voxel_backward'])\n\n\nclass _DynamicScatter(Function):\n\n    @staticmethod\n    def forward(ctx: Any,\n                feats: torch.Tensor,\n                coors: torch.Tensor,\n                reduce_type: str = 'max') -> Tuple[torch.Tensor, torch.Tensor]:\n        \"\"\"Convert kitti points(N, >=3) to voxels.\n\n        Args:\n            feats (torch.Tensor): [N, C]. Points features to be reduced\n                into voxels.\n            coors (torch.Tensor): [N, ndim]. Corresponding voxel coordinates\n                (specifically multi-dim voxel index) of each points.\n            reduce_type (str, optional): Reduce op. support 'max', 'sum' and\n                'mean'. Default: 'max'.\n\n        Returns:\n            tuple[torch.Tensor]: A tuple contains two elements. 
The first one\n            is the voxel features with shape [M, C] which are respectively\n            reduced from input features that share the same voxel coordinates.\n            The second is voxel coordinates with shape [M, ndim].\n        \"\"\"\n        results = ext_module.dynamic_point_to_voxel_forward(\n            feats, coors, reduce_type)\n        (voxel_feats, voxel_coors, point2voxel_map,\n         voxel_points_count) = results\n        ctx.reduce_type = reduce_type\n        ctx.save_for_backward(feats, voxel_feats, point2voxel_map,\n                              voxel_points_count)\n        ctx.mark_non_differentiable(voxel_coors)\n        return voxel_feats, voxel_coors\n\n    @staticmethod\n    def backward(ctx: Any,\n                 grad_voxel_feats: torch.Tensor,\n                 grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple:\n        (feats, voxel_feats, point2voxel_map,\n         voxel_points_count) = ctx.saved_tensors\n        grad_feats = torch.zeros_like(feats)\n        # TODO: whether to use index put or use cuda_backward\n        # To use index put, need point to voxel index\n        ext_module.dynamic_point_to_voxel_backward(\n            grad_feats, grad_voxel_feats.contiguous(), feats, voxel_feats,\n            point2voxel_map, voxel_points_count, ctx.reduce_type)\n        return grad_feats, None, None\n\n\ndynamic_scatter = _DynamicScatter.apply\n\n\nclass DynamicScatter(nn.Module):\n    \"\"\"Scatters points into voxels, used in the voxel encoder with dynamic\n    voxelization.\n\n    Note:\n        The CPU and GPU implementation get the same output, but have numerical\n        difference after summation and division (e.g., 5e-7).\n\n    Args:\n        voxel_size (list): list [x, y, z] size of three dimension.\n        point_cloud_range (list): The coordinate range of points, [x_min,\n            y_min, z_min, x_max, y_max, z_max].\n        average_points (bool): whether to use avg pooling to scatter points\n           
 into voxel.\n    \"\"\"\n\n    def __init__(self, voxel_size: List, point_cloud_range: List,\n                 average_points: bool):\n        super().__init__()\n\n        self.voxel_size = voxel_size\n        self.point_cloud_range = point_cloud_range\n        self.average_points = average_points\n\n    def forward_single(\n            self, points: torch.Tensor,\n            coors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:\n        \"\"\"Scatters points into voxels.\n\n        Args:\n            points (torch.Tensor): Points to be reduced into voxels.\n            coors (torch.Tensor): Corresponding voxel coordinates (specifically\n                multi-dim voxel index) of each points.\n\n        Returns:\n            tuple[torch.Tensor]: A tuple contains two elements. The first one\n            is the voxel features with shape [M, C] which are respectively\n            reduced from input features that share the same voxel coordinates.\n            The second is voxel coordinates with shape [M, ndim].\n        \"\"\"\n        reduce = 'mean' if self.average_points else 'max'\n        return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce)\n\n    def forward(self, points: torch.Tensor,\n                coors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:\n        \"\"\"Scatters points/features into voxels.\n\n        Args:\n            points (torch.Tensor): Points to be reduced into voxels.\n            coors (torch.Tensor): Corresponding voxel coordinates (specifically\n                multi-dim voxel index) of each points.\n\n        Returns:\n            tuple[torch.Tensor]: A tuple contains two elements. 
The first one\n            is the voxel features with shape [M, C] which are respectively\n            reduced from input features that share the same voxel coordinates.\n            The second is voxel coordinates with shape [M, ndim].\n        \"\"\"\n        if coors.size(-1) == 3:\n            return self.forward_single(points, coors)\n        else:\n            batch_size = coors[-1, 0] + 1\n            voxels, voxel_coors = [], []\n            for i in range(batch_size):\n                inds = torch.where(coors[:, 0] == i)\n                voxel, voxel_coor = self.forward_single(\n                    points[inds], coors[inds][:, 1:])\n                coor_pad = F.pad(voxel_coor, (1, 0), mode='constant', value=i)\n                voxel_coors.append(coor_pad)\n                voxels.append(voxel)\n            features = torch.cat(voxels, dim=0)\n            feature_coors = torch.cat(voxel_coors, dim=0)\n\n            return features, feature_coors\n\n    def __repr__(self):\n        s = self.__class__.__name__ + '('\n        s += 'voxel_size=' + str(self.voxel_size)\n        s += ', point_cloud_range=' + str(self.point_cloud_range)\n        s += ', average_points=' + str(self.average_points)\n        s += ')'\n        return s\n"
  },
  {
    "path": "mmcv/ops/sparse_conv.py",
    "content": "# Copyright 2019 Yan Yan\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport math\n\nimport numpy as np\nimport torch\nfrom mmengine.registry import MODELS\nfrom torch.nn import init\nfrom torch.nn.parameter import Parameter\n\nfrom . import sparse_functional as Fsp\nfrom . import sparse_ops as ops\nfrom .sparse_modules import SparseModule\nfrom .sparse_structure import SparseConvTensor\n\n\ndef _calculate_fan_in_and_fan_out_hwio(tensor):\n    dimensions = tensor.ndimension()\n    if dimensions < 2:\n        raise ValueError('fan in and fan out can not be computed for tensor'\n                         'with fewer than 2 dimensions')\n\n    if dimensions == 2:  # Linear\n        fan_in = tensor.size(-2)\n        fan_out = tensor.size(-1)\n    else:\n        num_input_fmaps = tensor.size(-2)\n        num_output_fmaps = tensor.size(-1)\n        receptive_field_size = 1\n        if tensor.dim() > 2:\n            receptive_field_size = tensor[..., 0, 0].numel()\n        fan_in = num_input_fmaps * receptive_field_size\n        fan_out = num_output_fmaps * receptive_field_size\n\n    return fan_in, fan_out\n\n\nclass SparseConvolution(SparseModule):\n\n    def __init__(self,\n                 ndim,\n                 in_channels,\n                 out_channels,\n                 kernel_size=3,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n              
   subm=False,\n                 output_padding=0,\n                 transposed=False,\n                 inverse=False,\n                 indice_key=None,\n                 fused_bn=False):\n        super().__init__()\n        assert groups == 1\n        if not isinstance(kernel_size, (list, tuple)):\n            kernel_size = [kernel_size] * ndim\n        if not isinstance(stride, (list, tuple)):\n            stride = [stride] * ndim\n        if not isinstance(padding, (list, tuple)):\n            padding = [padding] * ndim\n        if not isinstance(dilation, (list, tuple)):\n            dilation = [dilation] * ndim\n        if not isinstance(output_padding, (list, tuple)):\n            output_padding = [output_padding] * ndim\n\n        for d, s in zip(dilation, stride):\n            assert any([s == 1, d == 1]), \"don't support this.\"\n\n        self.ndim = ndim\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.kernel_size = kernel_size\n        self.conv1x1 = np.prod(kernel_size) == 1\n        self.stride = stride\n        self.padding = padding\n        self.dilation = dilation\n        self.transposed = transposed\n        self.inverse = inverse\n        self.output_padding = output_padding\n        self.groups = groups\n        self.subm = subm\n        self.indice_key = indice_key\n        self.fused_bn = fused_bn\n\n        self.weight = Parameter(\n            torch.Tensor(*kernel_size, in_channels, out_channels))\n        if bias:\n            self.bias = Parameter(torch.Tensor(out_channels))\n        else:\n            self.register_parameter('bias', None)\n        self.reset_parameters()\n\n    def reset_parameters(self):\n        init.kaiming_uniform_(self.weight, a=math.sqrt(5))\n        if self.bias is not None:\n            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)\n            bound = 1 / math.sqrt(fan_in)\n            init.uniform_(self.bias, -bound, bound)\n\n    def forward(self, 
input):\n        assert isinstance(input, SparseConvTensor)\n        features = input.features\n        device = features.device\n        indices = input.indices\n        spatial_shape = input.spatial_shape\n        batch_size = input.batch_size\n        if not self.subm:\n            if self.transposed:\n                out_spatial_shape = ops.get_deconv_output_size(\n                    spatial_shape, self.kernel_size, self.stride, self.padding,\n                    self.dilation, self.output_padding)\n            else:\n                out_spatial_shape = ops.get_conv_output_size(\n                    spatial_shape, self.kernel_size, self.stride, self.padding,\n                    self.dilation)\n\n        else:\n            out_spatial_shape = spatial_shape\n\n        if self.conv1x1:\n            features = torch.mm(\n                input.features,\n                self.weight.view(self.in_channels, self.out_channels))\n            if self.bias is not None:\n                features += self.bias\n            out_tensor = SparseConvTensor(features, input.indices,\n                                          input.spatial_shape,\n                                          input.batch_size)\n            out_tensor.indice_dict = input.indice_dict\n            out_tensor.grid = input.grid\n            return out_tensor\n        data = input.find_indice_pair(self.indice_key)\n        if self.inverse:\n            assert data is not None and self.indice_key is not None\n            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = data\n            assert indice_pairs.shape[0] == np.prod(\n                self.kernel_size\n            ), 'inverse conv must have same kernel size as its couple conv'\n        else:\n            if self.indice_key is not None and data is not None:\n                outids, _, indice_pairs, indice_pair_num, _ = data\n            else:\n                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(\n                 
   indices,\n                    batch_size,\n                    spatial_shape,\n                    self.kernel_size,\n                    self.stride,\n                    self.padding,\n                    self.dilation,\n                    self.output_padding,\n                    self.subm,\n                    self.transposed,\n                    grid=input.grid)\n                input.indice_dict[self.indice_key] = (outids, indices,\n                                                      indice_pairs,\n                                                      indice_pair_num,\n                                                      spatial_shape)\n        if self.fused_bn:\n            assert self.bias is not None\n            out_features = ops.fused_indice_conv(features, self.weight,\n                                                 self.bias,\n                                                 indice_pairs.to(device),\n                                                 indice_pair_num,\n                                                 outids.shape[0], self.inverse,\n                                                 self.subm)\n        else:\n            if self.subm:\n                out_features = Fsp.indice_subm_conv(features, self.weight,\n                                                    indice_pairs.to(device),\n                                                    indice_pair_num,\n                                                    outids.shape[0])\n            else:\n                if self.inverse:\n                    out_features = Fsp.indice_inverse_conv(\n                        features, self.weight, indice_pairs.to(device),\n                        indice_pair_num, outids.shape[0])\n                else:\n                    out_features = Fsp.indice_conv(features, self.weight,\n                                                   indice_pairs.to(device),\n                                                   indice_pair_num,\n                            
                       outids.shape[0])\n\n            if self.bias is not None:\n                out_features += self.bias\n        out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,\n                                      batch_size)\n        out_tensor.indice_dict = input.indice_dict\n        out_tensor.grid = input.grid\n        return out_tensor\n\n\n@MODELS.register_module()\nclass SparseConv2d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            2,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SparseConv3d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            3,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SparseConv4d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 
indice_key=None):\n        super().__init__(\n            4,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SparseConvTranspose2d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            2,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            transposed=True,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SparseConvTranspose3d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            3,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            transposed=True,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SparseInverseConv2d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 indice_key=None,\n                 bias=True):\n        super().__init__(\n            2,\n            in_channels,\n            out_channels,\n          
  kernel_size,\n            bias=bias,\n            inverse=True,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SparseInverseConv3d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 indice_key=None,\n                 bias=True):\n        super().__init__(\n            3,\n            in_channels,\n            out_channels,\n            kernel_size,\n            bias=bias,\n            inverse=True,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SubMConv2d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            2,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            True,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SubMConv3d(SparseConvolution):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            3,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            True,\n            indice_key=indice_key)\n\n\n@MODELS.register_module()\nclass SubMConv4d(SparseConvolution):\n\n    def __init__(self,\n                 
in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 indice_key=None):\n        super().__init__(\n            4,\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride,\n            padding,\n            dilation,\n            groups,\n            bias,\n            True,\n            indice_key=indice_key)\n"
  },
  {
    "path": "mmcv/ops/sparse_functional.py",
    "content": "# Copyright 2019 Yan Yan\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom typing import Any\n\nimport torch\nfrom torch.autograd import Function\n\nfrom . import sparse_ops as ops\n\n\nclass SparseConvFunction(Function):\n    \"\"\"Sparse Convolution.\n\n    Please refer to `SECOND <https://www.mdpi.com/1424-8220/18/10/3337>`_ for\n    more details.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx: Any, features: torch.Tensor, filters: torch.nn.Parameter,\n                indice_pairs: torch.Tensor, indice_pair_num: torch.Tensor,\n                num_activate_out: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (torch.Tensor): Features that needs to convolute.\n            filters (torch.nn.parameter.Parameter): Convolution filters.\n            indice_pairs (torch.Tensor): Indice pairs between inputs locations\n                and outputs locations.\n            indice_pair_num (torch.Tensor): Indice pairs num.\n            num_activate_out (torch.Tensor): Output channels num.\n\n        Returns:\n            torch.Tensor: Output features from gather-gemm-scatter.\n        \"\"\"\n        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)\n        return ops.indice_conv(features, filters, indice_pairs,\n                               indice_pair_num, num_activate_out, False)\n\n    @staticmethod\n    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:\n        
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors\n        input_bp, filters_bp = ops.indice_conv_backward(\n            features, filters, grad_output, indice_pairs, indice_pair_num,\n            False)\n\n        return input_bp, filters_bp, None, None, None\n\n\nclass SparseInverseConvFunction(Function):\n\n    @staticmethod\n    def forward(ctx: Any, features: torch.Tensor, filters: torch.nn.Parameter,\n                indice_pairs: torch.Tensor, indice_pair_num: torch.Tensor,\n                num_activate_out: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (torch.Tensor): Features that needs to convolute.\n            filters (torch.nn.parameter.Parameter): Convolution filters.\n            indice_pairs (torch.Tensor): Indice pairs between inputs locations\n                and outputs locations.\n            indice_pair_num (torch.Tensor): Indice pairs num.\n            num_activate_out (torch.Tensor): Output channels num.\n\n        Returns:\n            torch.Tensor: Output features from gather-gemm-scatter.\n        \"\"\"\n        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)\n        return ops.indice_conv(features, filters, indice_pairs,\n                               indice_pair_num, num_activate_out, True, False)\n\n    @staticmethod\n    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:\n        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors\n        input_bp, filters_bp = ops.indice_conv_backward(\n            features, filters, grad_output, indice_pairs, indice_pair_num,\n            True, False)\n\n        return input_bp, filters_bp, None, None, None\n\n\nclass SubMConvFunction(Function):\n\n    @staticmethod\n    def forward(ctx: Any, features: torch.Tensor, filters: torch.nn.Parameter,\n                indice_pairs: torch.Tensor, indice_pair_num: torch.Tensor,\n                num_activate_out: torch.Tensor) -> torch.Tensor:\n       
 \"\"\"\n        Args:\n            features (torch.Tensor): Features that needs to convolute.\n            filters (torch.nn.parameter.Parameter): Convolution filters.\n            indice_pairs (torch.Tensor): Indice pairs between inputs locations\n                and outputs locations.\n            indice_pair_num (torch.Tensor): Indice pairs num.\n            num_activate_out (torch.Tensor): Output channels num.\n\n        Returns:\n            torch.Tensor: Output features from gather-gemm-scatter.\n        \"\"\"\n        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)\n        return ops.indice_conv(features, filters, indice_pairs,\n                               indice_pair_num, num_activate_out, False, True)\n\n    @staticmethod\n    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:\n        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors\n        input_bp, filters_bp = ops.indice_conv_backward(\n            features, filters, grad_output, indice_pairs, indice_pair_num,\n            False, True)\n\n        return input_bp, filters_bp, None, None, None\n\n\nclass SparseMaxPoolFunction(Function):\n\n    @staticmethod\n    def forward(ctx, features: torch.Tensor, indice_pairs: torch.Tensor,\n                indice_pair_num: torch.Tensor,\n                num_activate_out: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (torch.Tensor): Features that needs to convolute.\n            indice_pairs (torch.Tensor): Indice pairs between inputs locations\n                and outputs locations.\n            indice_pair_num (torch.Tensor): Indice pairs num.\n            num_activate_out (torch.Tensor): Output channels num.\n\n        Returns:\n            torch.Tensor: Output features from sparse maxpooling.\n        \"\"\"\n        out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,\n                                 num_activate_out)\n        
ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)\n        return out\n\n    @staticmethod\n    def backward(ctx: Any, grad_output: torch.Tensor) -> tuple:\n        indice_pairs, indice_pair_num, features, out = ctx.saved_tensors\n        input_bp = ops.indice_maxpool_backward(features, out, grad_output,\n                                               indice_pairs, indice_pair_num)\n        return input_bp, None, None, None\n\n\nindice_conv = SparseConvFunction.apply\nindice_inverse_conv = SparseInverseConvFunction.apply\nindice_subm_conv = SubMConvFunction.apply\nindice_maxpool = SparseMaxPoolFunction.apply\n"
  },
  {
    "path": "mmcv/ops/sparse_modules.py",
    "content": "# Copyright 2019 Yan Yan\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport sys\nfrom collections import OrderedDict\nfrom typing import Any, List, Optional, Union\n\nimport torch\nfrom torch import nn\n\nfrom .sparse_structure import SparseConvTensor\n\n\ndef is_spconv_module(module: nn.Module) -> bool:\n    spconv_modules = (SparseModule, )\n    return isinstance(module, spconv_modules)\n\n\ndef is_sparse_conv(module: nn.Module) -> bool:\n    from .sparse_conv import SparseConvolution\n    return isinstance(module, SparseConvolution)\n\n\ndef _mean_update(vals: Union[int, List], m_vals: Union[int, List],\n                 t: float) -> List:\n    outputs = []\n    if not isinstance(vals, list):\n        vals = [vals]\n    if not isinstance(m_vals, list):\n        m_vals = [m_vals]\n    for val, m_val in zip(vals, m_vals):\n        output = t / float(t + 1) * m_val + 1 / float(t + 1) * val\n        outputs.append(output)\n    if len(outputs) == 1:\n        outputs = outputs[0]\n    return outputs\n\n\nclass SparseModule(nn.Module):\n    \"\"\"Place holder, All module subclass from this will take sptensor in\n    SparseSequential.\"\"\"\n    pass\n\n\nclass SparseSequential(SparseModule):\n    r\"\"\"A sequential container. Modules will be added to it in the order they\n    are passed in the constructor. 
Alternatively, an ordered dict of modules\n    can also be passed in.\n\n    To make it easier to understand, given is a small example::\n\n    Example:\n        >>> # using Sequential:\n        >>> from mmcv.ops import SparseSequential\n        >>> model = SparseSequential(\n                    SparseConv2d(1,20,5),\n                    nn.ReLU(),\n                    SparseConv2d(20,64,5),\n                    nn.ReLU()\n                    )\n\n        >>> # using Sequential with OrderedDict\n        >>> model = SparseSequential(OrderedDict([\n                      ('conv1', SparseConv2d(1,20,5)),\n                      ('relu1', nn.ReLU()),\n                      ('conv2', SparseConv2d(20,64,5)),\n                      ('relu2', nn.ReLU())\n                    ]))\n\n        >>> # using Sequential with kwargs(python 3.6+)\n        >>> model = SparseSequential(\n                      conv1=SparseConv2d(1,20,5),\n                      relu1=nn.ReLU(),\n                      conv2=SparseConv2d(20,64,5),\n                      relu2=nn.ReLU()\n                    )\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super().__init__()\n        if len(args) == 1 and isinstance(args[0], OrderedDict):\n            for key, module in args[0].items():\n                self.add_module(key, module)\n        else:\n            for idx, module in enumerate(args):\n                self.add_module(str(idx), module)\n        for name, module in kwargs.items():\n            if sys.version_info < (3, 6):\n                raise ValueError('kwargs only supported in py36+')\n            if name in self._modules:\n                raise ValueError('name exists.')\n            self.add_module(name, module)\n        self._sparity_dict = {}\n\n    def __getitem__(self, idx: int) -> torch.Tensor:\n        if not (-len(self) <= idx < len(self)):\n            raise IndexError(f'index {idx} is out of range')\n        if idx < 0:\n            idx += len(self)\n        it = 
iter(self._modules.values())\n        for i in range(idx):\n            next(it)\n        return next(it)\n\n    def __len__(self):\n        return len(self._modules)\n\n    @property\n    def sparity_dict(self):\n        return self._sparity_dict\n\n    def add(self, module: Any, name: Optional[str] = None) -> None:\n        if name is None:\n            name = str(len(self._modules))\n            if name in self._modules:\n                raise KeyError('name exists')\n        self.add_module(name, module)\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        for k, module in self._modules.items():\n            if is_spconv_module(module):\n                assert isinstance(input, SparseConvTensor)\n                self._sparity_dict[k] = input.sparity\n                input = module(input)\n            else:\n                if isinstance(input, SparseConvTensor):\n                    if input.indices.shape[0] != 0:\n                        input.features = module(input.features)\n                else:\n                    input = module(input)\n        return input\n\n    def fused(self):\n        from .sparse_conv import SparseConvolution\n        mods = [v for k, v in self._modules.items()]\n        fused_mods = []\n        idx = 0\n        while idx < len(mods):\n            if is_sparse_conv(mods[idx]):\n                if idx < len(mods) - 1 and isinstance(mods[idx + 1],\n                                                      nn.BatchNorm1d):\n                    new_module = SparseConvolution(\n                        ndim=mods[idx].ndim,\n                        in_channels=mods[idx].in_channels,\n                        out_channels=mods[idx].out_channels,\n                        kernel_size=mods[idx].kernel_size,\n                        stride=mods[idx].stride,\n                        padding=mods[idx].padding,\n                        dilation=mods[idx].dilation,\n                        groups=mods[idx].groups,\n                   
     bias=True,\n                        subm=mods[idx].subm,\n                        output_padding=mods[idx].output_padding,\n                        transposed=mods[idx].transposed,\n                        inverse=mods[idx].inverse,\n                        indice_key=mods[idx].indice_key,\n                        fused_bn=True,\n                    )\n                    new_module.load_state_dict(mods[idx].state_dict(), False)\n                    new_module.to(mods[idx].weight.device)\n                    conv = new_module\n                    bn = mods[idx + 1]\n                    conv.bias.data.zero_()\n                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (\n                        torch.sqrt(bn.running_var) + bn.eps)\n                    conv.bias.data[:] = (\n                        conv.bias.data - bn.running_mean) * bn.weight.data / (\n                            torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data\n                    fused_mods.append(conv)\n                    idx += 2\n                else:\n                    fused_mods.append(mods[idx])\n                    idx += 1\n            else:\n                fused_mods.append(mods[idx])\n                idx += 1\n        return SparseSequential(*fused_mods)\n\n\nclass ToDense(SparseModule):\n    \"\"\"Convert SparseConvTensor to NCHW dense tensor.\"\"\"\n\n    def forward(self, x: SparseConvTensor):\n        return x.dense()\n\n\nclass RemoveGrid(SparseModule):\n    \"\"\"Remove pre-allocated grid buffer.\"\"\"\n\n    def forward(self, x: SparseConvTensor):\n        x.grid = None\n        return x\n"
  },
  {
    "path": "mmcv/ops/sparse_ops.py",
    "content": "# Copyright 2019 Yan Yan\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport torch\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'get_indice_pairs_2d_forward', 'get_indice_pairs_3d_forward',\n    'get_indice_pairs_4d_forward', 'get_indice_pairs_2d_backward',\n    'get_indice_pairs_3d_backward', 'indice_conv_forward',\n    'indice_conv_backward', 'fused_indice_conv_forward',\n    'indice_maxpool_forward', 'indice_maxpool_backward'\n])\n\n\ndef get_conv_output_size(input_size, kernel_size, stride, padding, dilation):\n    ndim = len(input_size)\n    output_size = []\n    for i in range(ndim):\n        size = (input_size[i] + 2 * padding[i] - dilation[i] *\n                (kernel_size[i] - 1) - 1) // stride[i] + 1\n        if kernel_size[i] == -1:\n            output_size.append(1)\n        else:\n            output_size.append(size)\n    return output_size\n\n\ndef get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,\n                           output_padding):\n    ndim = len(input_size)\n    output_size = []\n    for i in range(ndim):\n        if kernel_size[i] == -1:\n            raise ValueError(\"deconv don't support kernel_size < 0\")\n        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[\n            i] + output_padding[i]\n        output_size.append(size)\n    return output_size\n\n\ndef get_indice_pairs(indices,\n                     batch_size,\n 
                    spatial_shape,\n                     ksize=3,\n                     stride=1,\n                     padding=0,\n                     dilation=1,\n                     out_padding=0,\n                     subm=False,\n                     transpose=False,\n                     grid=None):\n    ndim = indices.shape[1] - 1\n    if not isinstance(ksize, (list, tuple)):\n        ksize = [ksize] * ndim\n    if not isinstance(stride, (list, tuple)):\n        stride = [stride] * ndim\n    if not isinstance(padding, (list, tuple)):\n        padding = [padding] * ndim\n    if not isinstance(dilation, (list, tuple)):\n        dilation = [dilation] * ndim\n    if not isinstance(out_padding, (list, tuple)):\n        out_padding = [out_padding] * ndim\n\n    for d, s in zip(dilation, stride):\n        assert any([s == 1, d == 1]), \"don't support this.\"\n\n    if not subm:\n        if transpose:\n            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,\n                                               padding, dilation, out_padding)\n        else:\n            out_shape = get_conv_output_size(spatial_shape, ksize, stride,\n                                             padding, dilation)\n\n    else:\n        out_shape = spatial_shape\n    if grid is None:\n        if ndim == 2:\n            get_indice_pairs_func = ext_module.get_indice_pairs_2d_forward\n        elif ndim == 3:\n            get_indice_pairs_func = ext_module.get_indice_pairs_3d_forward\n        elif ndim == 4:\n            get_indice_pairs_func = ext_module.get_indice_pairs_4d_forward\n        else:\n            raise NotImplementedError\n        return get_indice_pairs_func(indices, batch_size, out_shape,\n                                     spatial_shape, ksize, stride, padding,\n                                     dilation, out_padding, int(subm),\n                                     int(transpose))\n    else:\n        if ndim == 2:\n            get_indice_pairs_func = 
ext_module.get_indice_pairs_2d_backward\n        elif ndim == 3:\n            get_indice_pairs_func = ext_module.get_indice_pairs_3d_backward\n        else:\n            raise NotImplementedError\n        return get_indice_pairs_func(indices, grid, batch_size, out_shape,\n                                     spatial_shape, ksize, stride, padding,\n                                     dilation, out_padding, int(subm),\n                                     int(transpose))\n\n\ndef indice_conv(features,\n                filters,\n                indice_pairs,\n                indice_pair_num,\n                num_activate_out,\n                inverse=False,\n                subm=False):\n    if filters.dtype == torch.float32 or filters.dtype == torch.half:\n        return ext_module.indice_conv_forward(features, filters, indice_pairs,\n                                              indice_pair_num,\n                                              num_activate_out, int(inverse),\n                                              int(subm))\n    else:\n        raise NotImplementedError\n\n\ndef fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,\n                      num_activate_out, inverse, subm):\n    if features.dtype == torch.half or filters.dtypes == torch.float32:\n        func = ext_module.fused_indice_conv_forward\n    else:\n        raise NotImplementedError\n\n    return func(features, filters, bias, indice_pairs, indice_pair_num,\n                num_activate_out, int(inverse), int(subm))\n\n\ndef indice_conv_backward(features,\n                         filters,\n                         out_bp,\n                         indice_pairs,\n                         indice_pair_num,\n                         inverse=False,\n                         subm=False):\n    if filters.dtype == torch.float32 or filters.dtype == torch.half:\n        return ext_module.indice_conv_backward(features, filters, out_bp,\n                                          
     indice_pairs, indice_pair_num,\n                                               int(inverse), int(subm))\n    else:\n        raise NotImplementedError\n\n\ndef indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):\n    if features.dtype == torch.float32 or features.dtype == torch.half:\n        return ext_module.indice_maxpool_forward(features, indice_pairs,\n                                                 indice_pair_num,\n                                                 num_activate_out)\n    else:\n        raise NotImplementedError\n\n\ndef indice_maxpool_backward(features, out_features, out_bp, indice_pairs,\n                            indice_pair_num):\n    if features.dtype == torch.float32 or features.dtype == torch.half:\n        return ext_module.indice_maxpool_backward(features, out_features,\n                                                  out_bp, indice_pairs,\n                                                  indice_pair_num)\n    else:\n        raise NotImplementedError\n"
  },
  {
    "path": "mmcv/ops/sparse_pool.py",
    "content": "# Copyright 2019 Yan Yan\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# import sparse_functional as Fsp\n# import sparse_ops as ops\nfrom .sparse_functional import indice_maxpool\nfrom .sparse_modules import SparseModule\nfrom .sparse_ops import get_conv_output_size, get_indice_pairs\nfrom .sparse_structure import SparseConvTensor\n\n\nclass SparseMaxPool(SparseModule):\n\n    def __init__(self,\n                 ndim,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 subm=False):\n        super().__init__()\n        if not isinstance(kernel_size, (list, tuple)):\n            kernel_size = [kernel_size] * ndim\n        if not isinstance(stride, (list, tuple)):\n            stride = [stride] * ndim\n        if not isinstance(padding, (list, tuple)):\n            padding = [padding] * ndim\n        if not isinstance(dilation, (list, tuple)):\n            dilation = [dilation] * ndim\n\n        self.ndim = ndim\n        self.kernel_size = kernel_size\n        self.stride = stride\n        self.padding = padding\n        self.subm = subm\n        self.dilation = dilation\n\n    def forward(self, input):\n        assert isinstance(input, SparseConvTensor)\n        features = input.features\n        device = features.device\n        indices = input.indices\n        spatial_shape = input.spatial_shape\n        batch_size = input.batch_size\n        if not 
self.subm:\n            out_spatial_shape = get_conv_output_size(spatial_shape,\n                                                     self.kernel_size,\n                                                     self.stride, self.padding,\n                                                     self.dilation)\n        else:\n            out_spatial_shape = spatial_shape\n        outids, indice_pairs, indice_pairs_num = get_indice_pairs(\n            indices, batch_size, spatial_shape, self.kernel_size, self.stride,\n            self.padding, self.dilation, 0, self.subm)\n\n        out_features = indice_maxpool(features, indice_pairs.to(device),\n                                      indice_pairs_num.to(device),\n                                      outids.shape[0])\n        out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,\n                                      batch_size)\n        out_tensor.indice_dict = input.indice_dict\n        out_tensor.grid = input.grid\n        return out_tensor\n\n\nclass SparseMaxPool2d(SparseMaxPool):\n\n    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):\n        super().__init__(2, kernel_size, stride, padding, dilation)\n\n\nclass SparseMaxPool3d(SparseMaxPool):\n\n    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):\n        super().__init__(3, kernel_size, stride, padding, dilation)\n"
  },
  {
    "path": "mmcv/ops/sparse_structure.py",
    "content": "from typing import List, Optional, Tuple, Union\n\nimport numpy as np\nimport torch\n\n\ndef scatter_nd(indices: torch.Tensor, updates: torch.Tensor,\n               shape: torch.Tensor) -> torch.Tensor:\n    \"\"\"PyTorch edition of TensorFlow scatter_nd.\n\n    This function does not contain exception handling code, so use it\n    carefully: when indices repeat, repeated add (which is supported in\n    TensorFlow) is not supported.\n    \"\"\"\n    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)\n    ndim = indices.shape[-1]\n    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]\n    flatted_indices = indices.view(-1, ndim)\n    slices = [flatted_indices[:, i] for i in range(ndim)]\n    slices += [Ellipsis]\n    ret[slices] = updates.view(*output_shape)\n    return ret\n\n\nclass SparseConvTensor:\n\n    def __init__(self,\n                 features: torch.Tensor,\n                 indices: torch.Tensor,\n                 spatial_shape: Union[List, Tuple],\n                 batch_size: int,\n                 grid: Optional[torch.Tensor] = None):\n        self.features = features\n        self.indices = indices\n        if self.indices.dtype != torch.int32:\n            self.indices.int()\n        self.spatial_shape = spatial_shape\n        self.batch_size = batch_size\n        self.indice_dict: dict = {}\n        self.grid = grid\n\n    @property\n    def spatial_size(self):\n        return np.prod(self.spatial_shape)\n\n    def find_indice_pair(self, key):\n        if key is None:\n            return None\n        if key in self.indice_dict:\n            return self.indice_dict[key]\n        return None\n\n    def dense(self, channels_first: bool = True) -> torch.Tensor:\n        output_shape = [self.batch_size] + list(\n            self.spatial_shape) + [self.features.shape[1]]\n        res = scatter_nd(self.indices.long(), self.features, output_shape)\n        if not channels_first:\n            return res\n        ndim = 
len(self.spatial_shape)\n        trans_params = list(range(0, ndim + 1))\n        trans_params.insert(1, ndim + 1)\n        return res.permute(*trans_params).contiguous()\n\n    @property\n    def sparity(self):\n        return (self.indices.shape[0] / np.prod(self.spatial_shape) /\n                self.batch_size)\n"
  },
  {
    "path": "mmcv/ops/sync_bn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional\n\nimport torch\nimport torch.distributed as dist\nimport torch.nn.functional as F\nfrom mmengine.device import is_cuda_available, is_musa_available\nfrom mmengine.registry import MODELS\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.module import Module\nfrom torch.nn.parameter import Parameter\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', [\n    'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output',\n    'sync_bn_backward_param', 'sync_bn_backward_data'\n])\n\n\nclass SyncBatchNormFunction(Function):\n\n    @staticmethod\n    def symbolic(g, input, running_mean, running_var, weight, bias, momentum,\n                 eps, group, group_size, stats_mode):\n        return g.op(\n            'mmcv::MMCVSyncBatchNorm',\n            input,\n            running_mean,\n            running_var,\n            weight,\n            bias,\n            momentum_f=momentum,\n            eps_f=eps,\n            group_i=group,\n            group_size_i=group_size,\n            stats_mode=stats_mode)\n\n    @staticmethod\n    def forward(self, input: torch.Tensor, running_mean: torch.Tensor,\n                running_var: torch.Tensor, weight: torch.Tensor,\n                bias: torch.Tensor, momentum: float, eps: float, group: int,\n                group_size: int, stats_mode: str) -> torch.Tensor:\n        self.momentum = momentum\n        self.eps = eps\n        self.group = group\n        self.group_size = group_size\n        self.stats_mode = stats_mode\n\n        if is_cuda_available():\n            assert isinstance(\n                    input, (torch.HalfTensor, torch.FloatTensor,\n                            torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \\\n                f'only support Half or Float Tensor, but {input.type()}'\n        elif 
is_musa_available():\n            assert isinstance(\n                    input, (torch.HalfTensor, torch.FloatTensor,\n                            torch.musa.HalfTensor, torch.musa.FloatTensor)), \\\n                f'only support Half or Float Tensor, but {input.type()}'\n        else:\n            assert isinstance(\n                    input, (torch.HalfTensor, torch.FloatTensor)), \\\n                f'only support Half or Float Tensor, but {input.type()}'\n        output = torch.zeros_like(input)\n        input3d = input.flatten(start_dim=2)\n        output3d = output.view_as(input3d)\n        num_channels = input3d.size(1)\n\n        # ensure mean/var/norm/std are initialized as zeros\n        # ``torch.empty()`` does not guarantee that\n        mean = torch.zeros(\n            num_channels, dtype=torch.float, device=input3d.device)\n        var = torch.zeros(\n            num_channels, dtype=torch.float, device=input3d.device)\n        norm = torch.zeros_like(\n            input3d, dtype=torch.float, device=input3d.device)\n        std = torch.zeros(\n            num_channels, dtype=torch.float, device=input3d.device)\n\n        batch_size = input3d.size(0)\n        if batch_size > 0:\n            ext_module.sync_bn_forward_mean(input3d, mean)\n            batch_flag = torch.ones([1], device=mean.device, dtype=mean.dtype)\n        else:\n            # skip updating mean and leave it as zeros when the input is empty\n            batch_flag = torch.zeros([1], device=mean.device, dtype=mean.dtype)\n\n        # synchronize mean and the batch flag\n        vec = torch.cat([mean, batch_flag])\n        if self.stats_mode == 'N':\n            vec *= batch_size\n        if self.group_size > 1:\n            dist.all_reduce(vec, group=self.group)\n        total_batch = vec[-1].detach()\n        mean = vec[:num_channels]\n\n        if self.stats_mode == 'default':\n            mean = mean / self.group_size\n        elif self.stats_mode == 'N':\n            mean = mean 
/ total_batch.clamp(min=1)\n        else:\n            raise NotImplementedError\n\n        # leave var as zeros when the input is empty\n        if batch_size > 0:\n            ext_module.sync_bn_forward_var(input3d, mean, var)\n\n        if self.stats_mode == 'N':\n            var *= batch_size\n        if self.group_size > 1:\n            dist.all_reduce(var, group=self.group)\n\n        if self.stats_mode == 'default':\n            var /= self.group_size\n        elif self.stats_mode == 'N':\n            var /= total_batch.clamp(min=1)\n        else:\n            raise NotImplementedError\n\n        # if the total batch size over all the ranks is zero,\n        # we should not update the statistics in the current batch\n        update_flag = total_batch.clamp(max=1)\n        momentum = update_flag * self.momentum\n        ext_module.sync_bn_forward_output(\n            input3d,\n            mean,\n            var,\n            weight,\n            bias,\n            running_mean,\n            running_var,\n            norm,\n            std,\n            output3d,\n            eps=self.eps,\n            momentum=momentum,\n            group_size=self.group_size)\n        self.save_for_backward(norm, std, weight)\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(self, grad_output: torch.Tensor) -> tuple:\n        norm, std, weight = self.saved_tensors\n        grad_weight = torch.zeros_like(weight)\n        grad_bias = torch.zeros_like(weight)\n        grad_input = torch.zeros_like(grad_output)\n        grad_output3d = grad_output.flatten(start_dim=2)\n        grad_input3d = grad_input.view_as(grad_output3d)\n\n        batch_size = grad_input3d.size(0)\n        if batch_size > 0:\n            ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight,\n                                              grad_bias)\n\n        # all reduce\n        if self.group_size > 1:\n            dist.all_reduce(grad_weight, 
group=self.group)\n            dist.all_reduce(grad_bias, group=self.group)\n            grad_weight /= self.group_size\n            grad_bias /= self.group_size\n\n        if batch_size > 0:\n            ext_module.sync_bn_backward_data(grad_output3d, weight,\n                                             grad_weight, grad_bias, norm, std,\n                                             grad_input3d)\n\n        return grad_input, None, None, grad_weight, grad_bias, \\\n            None, None, None, None, None\n\n\n@MODELS.register_module(name='MMSyncBN')\nclass SyncBatchNorm(Module):\n    \"\"\"Synchronized Batch Normalization.\n\n    Args:\n        num_features (int): number of features/channels in input tensor\n        eps (float, optional): a value added to the denominator for numerical\n            stability. Defaults to 1e-5.\n        momentum (float, optional): the value used for the running_mean and\n            running_var computation. Defaults to 0.1.\n        affine (bool, optional): whether to use learnable affine parameters.\n            Defaults to True.\n        track_running_stats (bool, optional): whether to track the running\n            mean and variance during training. When set to False, this\n            module does not track such statistics, and initializes statistics\n            buffers ``running_mean`` and ``running_var`` as ``None``. When\n            these buffers are ``None``, this module always uses batch\n            statistics in both training and eval modes. Defaults to True.\n        group (int, optional): synchronization of stats happens within\n            each process group individually. By default it is synchronization\n            across the whole world. Defaults to None.\n        stats_mode (str, optional): The statistical mode. Available options\n            include ``'default'`` and ``'N'``. 
Defaults to 'default'.\n            When ``stats_mode=='default'``, it computes the overall statistics\n            using those from each worker with equal weight, i.e., the\n            statistics are synchronized and simply divided by ``group``. This\n            mode will produce inaccurate statistics when empty tensors occur.\n            When ``stats_mode=='N'``, it computes the overall statistics using\n            the total number of batches in each worker ignoring the number of\n            group, i.e., the statistics are synchronized and then divided by\n            the total batch ``N``. This mode is beneficial when empty tensors\n            occur during training, as it averages the total mean by the real\n            number of batch.\n    \"\"\"\n\n    def __init__(self,\n                 num_features: int,\n                 eps: float = 1e-5,\n                 momentum: float = 0.1,\n                 affine: bool = True,\n                 track_running_stats: bool = True,\n                 group: Optional[int] = None,\n                 stats_mode: str = 'default'):\n        super().__init__()\n        self.num_features = num_features\n        self.eps = eps\n        self.momentum = momentum\n        self.affine = affine\n        self.track_running_stats = track_running_stats\n        group = dist.group.WORLD if group is None else group\n        self.group = group\n        self.group_size = dist.get_world_size(group)\n        assert stats_mode in ['default', 'N'], \\\n            f'\"stats_mode\" only accepts \"default\" and \"N\", got \"{stats_mode}\"'\n        self.stats_mode = stats_mode\n        if self.affine:\n            self.weight = Parameter(torch.Tensor(num_features))\n            self.bias = Parameter(torch.Tensor(num_features))\n        else:\n            self.register_parameter('weight', None)\n            self.register_parameter('bias', None)\n        if self.track_running_stats:\n            self.register_buffer('running_mean', 
torch.zeros(num_features))\n            self.register_buffer('running_var', torch.ones(num_features))\n            self.register_buffer('num_batches_tracked',\n                                 torch.tensor(0, dtype=torch.long))\n        else:\n            self.register_buffer('running_mean', None)\n            self.register_buffer('running_var', None)\n            self.register_buffer('num_batches_tracked', None)\n        self.reset_parameters()\n\n    def reset_running_stats(self):\n        if self.track_running_stats:\n            self.running_mean.zero_()\n            self.running_var.fill_(1)\n            self.num_batches_tracked.zero_()\n\n    def reset_parameters(self):\n        self.reset_running_stats()\n        if self.affine:\n            self.weight.data.uniform_()  # pytorch use ones_()\n            self.bias.data.zero_()\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        if input.dim() < 2:\n            raise ValueError(\n                f'expected at least 2D input, got {input.dim()}D input')\n        if self.momentum is None:\n            exponential_average_factor = 0.0\n        else:\n            exponential_average_factor = self.momentum\n\n        if self.training and self.track_running_stats:\n            if self.num_batches_tracked is not None:\n                self.num_batches_tracked += 1\n                if self.momentum is None:  # use cumulative moving average\n                    exponential_average_factor = 1.0 / float(\n                        self.num_batches_tracked)\n                else:  # use exponential moving average\n                    exponential_average_factor = self.momentum\n\n        if self.training or not self.track_running_stats:\n            return SyncBatchNormFunction.apply(\n                input, self.running_mean, self.running_var, self.weight,\n                self.bias, exponential_average_factor, self.eps, self.group,\n                self.group_size, self.stats_mode)\n        else:\n      
      return F.batch_norm(input, self.running_mean, self.running_var,\n                                self.weight, self.bias, False,\n                                exponential_average_factor, self.eps)\n\n    def __repr__(self):\n        s = self.__class__.__name__\n        s += f'({self.num_features}, '\n        s += f'eps={self.eps}, '\n        s += f'momentum={self.momentum}, '\n        s += f'affine={self.affine}, '\n        s += f'track_running_stats={self.track_running_stats}, '\n        s += f'group_size={self.group_size},'\n        s += f'stats_mode={self.stats_mode})'\n        return s\n"
  },
  {
    "path": "mmcv/ops/three_interpolate.py",
    "content": "from typing import Any, Tuple\n\nimport torch\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['three_interpolate_forward', 'three_interpolate_backward'])\n\n\nclass ThreeInterpolate(Function):\n    \"\"\"Performs weighted linear interpolation on 3 features.\n\n    Please refer to `Paper of PointNet++ <https://arxiv.org/abs/1706.02413>`_\n    for more details.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx: Any, features: torch.Tensor, indices: torch.Tensor,\n                weight: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Args:\n            features (torch.Tensor): (B, C, M) Features descriptors to be\n                interpolated.\n            indices (torch.Tensor): (B, n, 3) indices of three nearest\n                neighbor features for the target features.\n            weight (torch.Tensor): (B, n, 3) weights of three nearest\n                neighbor features for the target features.\n\n        Returns:\n            torch.Tensor: (B, C, N) tensor of the interpolated features\n        \"\"\"\n        assert features.is_contiguous()\n        assert indices.is_contiguous()\n        assert weight.is_contiguous()\n\n        B, c, m = features.size()\n        n = indices.size(1)\n        ctx.three_interpolate_for_backward = (indices, weight, m)\n        output = features.new_empty(B, c, n)\n\n        ext_module.three_interpolate_forward(\n            features, indices, weight, output, b=B, c=c, m=m, n=n)\n        return output\n\n    @staticmethod\n    def backward(\n        ctx, grad_out: torch.Tensor\n    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:\n        \"\"\"\n        Args:\n            grad_out (torch.Tensor): (B, C, N) tensor with gradients of outputs\n\n        Returns:\n            torch.Tensor: (B, C, M) tensor with gradients of features\n        \"\"\"\n        idx, weight, m = ctx.three_interpolate_for_backward\n        B, c, n = 
grad_out.size()\n\n        grad_features = grad_out.new_zeros(B, c, m)\n        grad_out_data = grad_out.data.contiguous()\n\n        ext_module.three_interpolate_backward(\n            grad_out_data, idx, weight, grad_features.data, b=B, c=c, n=n, m=m)\n        return grad_features, None, None\n\n\nthree_interpolate = ThreeInterpolate.apply\n"
  },
  {
    "path": "mmcv/ops/three_nn.py",
    "content": "from typing import Any, Tuple\n\nimport torch\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext', ['three_nn_forward'])\n\n\nclass ThreeNN(Function):\n    \"\"\"Find the top-3 nearest neighbors of the target set from the source set.\n\n    Please refer to `Paper of PointNet++ <https://arxiv.org/abs/1706.02413>`_\n    for more details.\n    \"\"\"\n\n    @staticmethod\n    def forward(ctx: Any, target: torch.Tensor,\n                source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:\n        \"\"\"\n        Args:\n            target (torch.Tensor): shape (B, N, 3), points set that needs to\n                find the nearest neighbors.\n            source (torch.Tensor): shape (B, M, 3), points set that is used\n                to find the nearest neighbors of points in target set.\n\n        Returns:\n            torch.Tensor: shape (B, N, 3), L2 distance of each point in target\n            set to their corresponding top three nearest neighbors.\n        \"\"\"\n        target = target.contiguous()\n        source = source.contiguous()\n\n        B, N, _ = target.size()\n        m = source.size(1)\n        if source.device.type == 'npu':\n            # strict to fp32\n            source = source.transpose(2, 1).contiguous()\n            dtype_ = source.dtype\n            if dtype_ == torch.float16:\n                target = target.float()\n                source = source.float()\n            dist2 = target.new_empty(B, N, 3)\n            idx = target.new_empty(B, N, 3, dtype=torch.int32)\n            ext_module.three_nn_forward(\n                target, source, dist2, idx, b=B, n=N, m=m)\n            dist2 = torch.sqrt(dist2)\n            if dtype_ == torch.float16:\n                dist2 = dist2.half()\n            return dist2, idx.int()\n        dist2 = target.new_empty(B, N, 3)\n        idx = target.new_empty(B, N, 3, dtype=torch.int32)\n\n        
ext_module.three_nn_forward(target, source, dist2, idx, b=B, n=N, m=m)\n        if torch.__version__ != 'parrots':\n            ctx.mark_non_differentiable(idx)\n\n        return torch.sqrt(dist2), idx\n\n    @staticmethod\n    def backward(ctx, a=None, b=None):\n        return None, None\n\n\nthree_nn = ThreeNN.apply\n"
  },
  {
    "path": "mmcv/ops/tin_shift.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n# Code reference from \"Temporal Interlacing Network\"\n# https://github.com/deepcs233/TIN/blob/master/cuda_shift/rtc_wrap.py\n# Hao Shao, Shengju Qian, Yu Liu\n# shaoh19@mails.tsinghua.edu.cn, sjqian@cse.cuhk.edu.hk, yuliu@ee.cuhk.edu.hk\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Function\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext('_ext',\n                                 ['tin_shift_forward', 'tin_shift_backward'])\n\n\nclass TINShiftFunction(Function):\n\n    @staticmethod\n    def forward(ctx, input, shift):\n        if input.size(0) != shift.size(0):\n            raise ValueError(\n                'The first dim (batch) of `input` and `shift` should be '\n                f'same, but got {input.size(0)} and {shift.size(0)}.')\n        C = input.size(2)\n        num_segments = shift.size(1)\n        if C // num_segments <= 0 or C % num_segments != 0:\n            raise ValueError('C should be a multiple of num_segments, '\n                             f'but got C={C} and num_segments={num_segments}.')\n\n        ctx.save_for_backward(shift)\n\n        out = torch.zeros_like(input)\n        ext_module.tin_shift_forward(input, shift, out)\n\n        return out\n\n    @staticmethod\n    def backward(ctx, grad_output):\n\n        shift = ctx.saved_tensors[0]\n        data_grad_input = grad_output.new(*grad_output.size()).zero_()\n        shift_grad_input = shift.new(*shift.size()).zero_()\n        ext_module.tin_shift_backward(grad_output, shift, data_grad_input)\n\n        return data_grad_input, shift_grad_input\n\n\ntin_shift = TINShiftFunction.apply\n\n\nclass TINShift(nn.Module):\n    \"\"\"Temporal Interlace Shift.\n\n    Temporal Interlace shift is a differentiable temporal-wise frame shifting\n    which is proposed in \"Temporal Interlacing Network\"\n\n    Please refer to\n    `Temporal Interlacing Network 
<https://arxiv.org/abs/2001.06499>`_\n     for more details.\n\n    Code is modified from\n    https://github.com/mit-han-lab/temporal-shift-module\n    \"\"\"\n\n    def forward(self, input, shift):\n        \"\"\"Perform temporal interlace shift.\n\n        Args:\n            input (torch.Tensor): Feature map with shape\n                [N, num_segments, C, H * W].\n            shift (torch.Tensor): Shift tensor with shape [N, num_segments].\n\n        Returns:\n            Feature map after temporal interlace shift.\n        \"\"\"\n        return tin_shift(input, shift)\n"
  },
  {
    "path": "mmcv/ops/upfirdn2d.py",
    "content": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors retain all intellectual property\n# and proprietary rights in and to this software, related documentation\n# and any modifications thereto.  Any use, reproduction, disclosure or\n# distribution of this software and related documentation without an express\n# license agreement from NVIDIA CORPORATION is strictly prohibited.\n\n# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/upfirdn2d.py # noqa\n\"\"\"Custom PyTorch ops for efficient resampling of 2D images.\"\"\"\nfrom typing import Dict, List, Union\n\nimport torch\n\nfrom ..utils import ext_loader\nfrom .conv2d_gradfix import conv2d\n\next_module = ext_loader.load_ext('_ext', ['upfirdn2d'])\n\n\ndef _parse_scaling(scaling):\n    \"\"\"Parse scaling into list [x, y]\"\"\"\n    if isinstance(scaling, int):\n        scaling = [scaling, scaling]\n    assert isinstance(scaling, (list, tuple))\n    assert all(isinstance(x, int) for x in scaling)\n    sx, sy = scaling\n    assert sx >= 1 and sy >= 1\n    return sx, sy\n\n\ndef _parse_padding(padding):\n    \"\"\"Parse padding into list [padx0, padx1, pady0, pady1]\"\"\"\n    if isinstance(padding, int):\n        padding = [padding, padding]\n    assert isinstance(padding, (list, tuple))\n    assert all(isinstance(x, int) for x in padding)\n    if len(padding) == 2:\n        padx, pady = padding\n        padding = [padx, padx, pady, pady]\n    padx0, padx1, pady0, pady1 = padding\n    return padx0, padx1, pady0, pady1\n\n\ndef _get_filter_size(filter):\n    \"\"\"Get width and height of filter kernel.\"\"\"\n    if filter is None:\n        return 1, 1\n    assert isinstance(filter, torch.Tensor) and filter.ndim in [1, 2]\n    fw = filter.shape[-1]\n    fh = filter.shape[0]\n    fw = int(fw)\n    fh = int(fh)\n    assert fw >= 1 and fh >= 1\n    return fw, fh\n\n\ndef upfirdn2d(input: torch.Tensor,\n              
filter: torch.Tensor,\n              up: int = 1,\n              down: int = 1,\n              padding: Union[int, List[int]] = 0,\n              flip_filter: bool = False,\n              gain: Union[float, int] = 1,\n              use_custom_op: bool = True):\n    \"\"\"Pad, upsample, filter, and downsample a batch of 2D images.\n\n    Performs the following sequence of operations for each channel:\n\n    1. Upsample the image by inserting N-1 zeros after each pixel (`up`).\n\n    2. Pad the image with the specified number of zeros on each side\n    (`padding`). Negative padding corresponds to cropping the image.\n\n    3. Convolve the image with the specified 2D FIR filter (`f`),\n    shrinking it so that the footprint of all output pixels lies within\n    the input image.\n\n    4. Downsample the image by keeping every Nth pixel (`down`).\n\n    This sequence of operations bears close resemblance to\n        scipy.signal.upfirdn().\n\n    The fused op is considerably more efficient than performing the same\n    calculation using standard PyTorch ops. It supports gradients of arbitrary\n    order.\n\n    Args:\n        input (torch.Tensor): Float32/float64/float16 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,\n            filter_width]` (non-separable), `[filter_taps]` (separable), or\n            `None` (identity).\n        up (int): Integer upsampling factor. Can be a single int or a\n            list/tuple `[x, y]`. Defaults to 1.\n        down (int): Integer downsampling factor. Can be a single int\n            or a list/tuple `[x, y]`. Defaults to 1.\n        padding (int | tuple[int]): Padding with respect to the upsampled\n            image. Can be a single number or a list/tuple `[x, y]` or\n            `[x_before, x_after, y_before, y_after]`. 
Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n        gain (int): Overall scaling factor for signal magnitude.\n            Defaults to 1.\n        use_custom_op (bool): Whether to use customized op.\n            Defaults to True.\n\n    Returns:\n        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`\n    \"\"\"\n    assert isinstance(input, torch.Tensor)\n    if use_custom_op and input.device.type == 'cuda':\n        return _upfirdn2d_cuda(\n            up=up,\n            down=down,\n            padding=padding,\n            flip_filter=flip_filter,\n            gain=gain).apply(input, filter)\n    elif use_custom_op and input.device.type == 'musa':\n        return _upfirdn2d_musa(\n            up=up,\n            down=down,\n            padding=padding,\n            flip_filter=flip_filter,\n            gain=gain).apply(input, filter)\n    return _upfirdn2d_ref(\n        input,\n        filter,\n        up=up,\n        down=down,\n        padding=padding,\n        flip_filter=flip_filter,\n        gain=gain)\n\n\ndef _upfirdn2d_ref(input: torch.Tensor,\n                   filter: torch.Tensor,\n                   up: int = 1,\n                   down: int = 1,\n                   padding: Union[int, List[int]] = 0,\n                   flip_filter: bool = False,\n                   gain: Union[float, int] = 1):\n    \"\"\"Slow reference implementation of `upfirdn2d()` using standard PyTorch\n    ops.\n\n    Args:\n        input (torch.Tensor): Float32/float64/float16 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,\n            filter_width]` (non-separable), `[filter_taps]` (separable), or\n            `None` (identity).\n        up (int): Integer upsampling factor. Can be a single int or a\n            list/tuple `[x, y]`. 
Defaults to 1.\n        down (int): Integer downsampling factor. Can be a single int\n            or a list/tuple `[x, y]`. Defaults to 1.\n        padding (int | tuple[int]): Padding with respect to the upsampled\n            image. Can be a single number or a list/tuple `[x, y]` or\n            `[x_before, x_after, y_before, y_after]`. Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n        gain (int): Overall scaling factor for signal magnitude.\n            Defaults to 1.\n\n    Returns:\n        torch.Tensor: Tensor of the shape `[batch_size, num_channels,\n            out_height, out_width]`.\n    \"\"\"\n    # Validate arguments.\n    assert isinstance(input, torch.Tensor) and input.ndim == 4\n    if filter is None:\n        filter = torch.ones([1, 1], dtype=torch.float32, device=input.device)\n    assert isinstance(filter, torch.Tensor) and filter.ndim in [1, 2]\n    assert filter.dtype == torch.float32 and not filter.requires_grad\n    batch_size, num_channels, in_height, in_width = input.shape\n    upx, upy = _parse_scaling(up)\n    downx, downy = _parse_scaling(down)\n    padx0, padx1, pady0, pady1 = _parse_padding(padding)\n\n    # Check that upsampled buffer is not smaller than the filter.\n    upW = in_width * upx + padx0 + padx1\n    upH = in_height * upy + pady0 + pady1\n    assert upW >= filter.shape[-1] and upH >= filter.shape[0]\n\n    # Upsample by inserting zeros.\n    x = input.reshape([batch_size, num_channels, in_height, 1, in_width, 1])\n    x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1])\n    x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx])\n\n    # Pad or crop.\n    x = torch.nn.functional.pad(\n        x, [max(padx0, 0),\n            max(padx1, 0),\n            max(pady0, 0),\n            max(pady1, 0)])\n    x = x[:, :,\n          max(-pady0, 0):x.shape[2] - max(-pady1, 0),\n          max(-padx0, 0):x.shape[3] - max(-padx1, 
0)]\n\n    # Setup filter.\n    filter = filter * (gain**(filter.ndim / 2))\n    filter = filter.to(x.dtype)\n    if not flip_filter:\n        filter = filter.flip(list(range(filter.ndim)))\n\n    # Convolve with the filter.\n    filter = filter[None, None].repeat([num_channels, 1] + [1] * filter.ndim)\n    if filter.ndim == 4:\n        x = conv2d(input=x, weight=filter, groups=num_channels)\n    else:\n        x = conv2d(input=x, weight=filter.unsqueeze(2), groups=num_channels)\n        x = conv2d(input=x, weight=filter.unsqueeze(3), groups=num_channels)\n\n    # Downsample by throwing away pixels.\n    x = x[:, :, ::downy, ::downx]\n    return x\n\n\n_upfirdn2d_cuda_cache: Dict = dict()\n\n\ndef _upfirdn2d_cuda(up: int = 1,\n                    down: int = 1,\n                    padding: Union[int, List[int]] = 0,\n                    flip_filter: bool = False,\n                    gain: Union[float, int] = 1):\n    \"\"\"Fast CUDA implementation of `upfirdn2d()` using custom ops.\n\n    Args:\n        up (int): Integer upsampling factor. Can be a single int or a\n            list/tuple `[x, y]`. Defaults to 1.\n        down (int): Integer downsampling factor. Can be a single int\n            or a list/tuple `[x, y]`. Defaults to 1.\n        padding (int | tuple[int]): Padding with respect to the upsampled\n            image. Can be a single number or a list/tuple `[x, y]` or\n            `[x_before, x_after, y_before, y_after]`. 
Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n        gain (int): Overall scaling factor for signal magnitude.\n            Defaults to 1.\n\n    Returns:\n        torch.Tensor: Tensor of the shape `[batch_size, num_channels,\n        out_height, out_width]`\n    \"\"\"\n    # Parse arguments.\n    upx, upy = _parse_scaling(up)\n    downx, downy = _parse_scaling(down)\n    padx0, padx1, pady0, pady1 = _parse_padding(padding)\n\n    # Lookup from cache.\n    key = (upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter,\n           gain)\n    if key in _upfirdn2d_cuda_cache:\n        return _upfirdn2d_cuda_cache[key]\n\n    # Forward op.\n    class Upfirdn2dCuda(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, x, f):  # pylint: disable=arguments-differ\n            assert isinstance(x, torch.Tensor) and x.ndim == 4\n            if f is None:\n                f = torch.ones([1, 1], dtype=torch.float32, device=x.device)\n            if f.ndim == 1 and f.shape[0] == 1:\n                f = f.square().unsqueeze(\n                    0)  # Convert separable-1 into full-1x1.\n            assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]\n            y = x\n            if f.ndim == 2:\n                y = ext_module.upfirdn2d(y, f, upx, upy, downx, downy, padx0,\n                                         padx1, pady0, pady1, flip_filter,\n                                         gain)\n            else:\n                y = ext_module.upfirdn2d(y, f.unsqueeze(0), upx, 1, downx, 1,\n                                         padx0, padx1, 0, 0, flip_filter, 1.0)\n                y = ext_module.upfirdn2d(y, f.unsqueeze(1), 1, upy, 1, downy,\n                                         0, 0, pady0, pady1, flip_filter, gain)\n            ctx.save_for_backward(f)\n            ctx.x_shape = x.shape\n            return y\n\n        @staticmethod\n        def backward(ctx, 
dy):  # pylint: disable=arguments-differ\n            f, = ctx.saved_tensors\n            _, _, ih, iw = ctx.x_shape\n            _, _, oh, ow = dy.shape\n            fw, fh = _get_filter_size(f)\n            p = [\n                fw - padx0 - 1,\n                iw * upx - ow * downx + padx0 - upx + 1,\n                fh - pady0 - 1,\n                ih * upy - oh * downy + pady0 - upy + 1,\n            ]\n            dx = None\n            df = None\n\n            if ctx.needs_input_grad[0]:\n                dx = _upfirdn2d_cuda(\n                    up=down,\n                    down=up,\n                    padding=p,\n                    flip_filter=(not flip_filter),\n                    gain=gain).apply(dy, f)\n\n            assert not ctx.needs_input_grad[1]\n            return dx, df\n\n    # Add to cache.\n    _upfirdn2d_cuda_cache[key] = Upfirdn2dCuda\n    return Upfirdn2dCuda\n\n\n_upfirdn2d_musa_cache: Dict = dict()\n\n\ndef _upfirdn2d_musa(up: int = 1,\n                    down: int = 1,\n                    padding: Union[int, List[int]] = 0,\n                    flip_filter: bool = False,\n                    gain: Union[float, int] = 1):\n    \"\"\"Fast MUSA implementation of `upfirdn2d()` using custom ops.\n\n    Args:\n        up (int): Integer upsampling factor. Can be a single int or a\n            list/tuple `[x, y]`. Defaults to 1.\n        down (int): Integer downsampling factor. Can be a single int\n            or a list/tuple `[x, y]`. Defaults to 1.\n        padding (int | tuple[int]): Padding with respect to the upsampled\n            image. Can be a single number or a list/tuple `[x, y]` or\n            `[x_before, x_after, y_before, y_after]`. 
Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n        gain (int): Overall scaling factor for signal magnitude.\n            Defaults to 1.\n\n    Returns:\n        torch.Tensor: Tensor of the shape `[batch_size, num_channels,\n        out_height, out_width]`\n    \"\"\"\n    # Parse arguments.\n    upx, upy = _parse_scaling(up)\n    downx, downy = _parse_scaling(down)\n    padx0, padx1, pady0, pady1 = _parse_padding(padding)\n\n    # Lookup from cache.\n    key = (upx, upy, downx, downy, padx0, padx1, pady0, pady1, flip_filter,\n           gain)\n    if key in _upfirdn2d_musa_cache:\n        return _upfirdn2d_musa_cache[key]\n\n    # Forward op.\n    class Upfirdn2dMusa(torch.autograd.Function):\n\n        @staticmethod\n        def forward(ctx, x, f):  # pylint: disable=arguments-differ\n            assert isinstance(x, torch.Tensor) and x.ndim == 4\n            if f is None:\n                f = torch.ones([1, 1], dtype=torch.float32, device=x.device)\n            if f.ndim == 1 and f.shape[0] == 1:\n                f = f.square().unsqueeze(\n                    0)  # Convert separable-1 into full-1x1.\n            assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]\n            y = x\n            if f.ndim == 2:\n                y = ext_module.upfirdn2d(y, f, upx, upy, downx, downy, padx0,\n                                         padx1, pady0, pady1, flip_filter,\n                                         gain)\n            else:\n                y = ext_module.upfirdn2d(y, f.unsqueeze(0), upx, 1, downx, 1,\n                                         padx0, padx1, 0, 0, flip_filter, 1.0)\n                y = ext_module.upfirdn2d(y, f.unsqueeze(1), 1, upy, 1, downy,\n                                         0, 0, pady0, pady1, flip_filter, gain)\n            ctx.save_for_backward(f)\n            ctx.x_shape = x.shape\n            return y\n\n        @staticmethod\n        def backward(ctx, 
dy):  # pylint: disable=arguments-differ\n            f, = ctx.saved_tensors\n            _, _, ih, iw = ctx.x_shape\n            _, _, oh, ow = dy.shape\n            fw, fh = _get_filter_size(f)\n            p = [\n                fw - padx0 - 1,\n                iw * upx - ow * downx + padx0 - upx + 1,\n                fh - pady0 - 1,\n                ih * upy - oh * downy + pady0 - upy + 1,\n            ]\n            dx = None\n            df = None\n\n            if ctx.needs_input_grad[0]:\n                dx = _upfirdn2d_musa(\n                    up=down,\n                    down=up,\n                    padding=p,\n                    flip_filter=(not flip_filter),\n                    gain=gain).apply(dy, f)\n\n            assert not ctx.needs_input_grad[1]\n            return dx, df\n\n    # Add to cache.\n    _upfirdn2d_musa_cache[key] = Upfirdn2dMusa\n    return Upfirdn2dMusa\n\n\ndef filter2d(input: torch.Tensor,\n             filter: torch.Tensor,\n             padding: Union[int, List[int]] = 0,\n             flip_filter: bool = False,\n             gain: Union[float, int] = 1,\n             use_custom_op: bool = True):\n    \"\"\"Filter a batch of 2D images using the given 2D FIR filter.\n\n    By default, the result is padded so that its shape matches the input.\n    User-specified padding is applied on top of that, with negative values\n    indicating cropping. Pixels outside the image are assumed to be zero.\n\n    Args:\n        input (torch.Tensor): Float32/float64/float16 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,\n            filter_width]` (non-separable), `[filter_taps]` (separable), or\n            `None`.\n        padding (int | tuple[int]): Padding with respect to the output.\n            Can be a single number or a list/tuple `[x, y]` or `[x_before,\n            x_after, y_before, y_after]`. 
Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation.\n            Defaults to False.\n        gain (int): Overall scaling factor for signal magnitude.\n            Defaults to 1.\n        use_custom_op (bool): Whether to use customized op.\n            Defaults to True.\n\n    Returns:\n        Tensor of the shape `[batch_size, num_channels, out_height,\n        out_width]`.\n    \"\"\"\n    padx0, padx1, pady0, pady1 = _parse_padding(padding)\n    fw, fh = _get_filter_size(filter)\n    p = [\n        padx0 + fw // 2,\n        padx1 + (fw - 1) // 2,\n        pady0 + fh // 2,\n        pady1 + (fh - 1) // 2,\n    ]\n    return upfirdn2d(\n        input,\n        filter,\n        padding=p,\n        flip_filter=flip_filter,\n        gain=gain,\n        use_custom_op=use_custom_op)\n\n\ndef upsample2d(input: torch.Tensor,\n               filter: torch.Tensor,\n               up: int = 2,\n               padding: Union[int, List[int]] = 0,\n               flip_filter: bool = False,\n               gain: Union[float, int] = 1,\n               use_custom_op: bool = True):\n    \"\"\"Upsample a batch of 2D images using the given 2D FIR filter.\n\n    By default, the result is padded so that its shape is a multiple of the\n    input.\n    User-specified padding is applied on top of that, with negative values\n    indicating cropping. Pixels outside the image are assumed to be zero.\n\n    Args:\n        input (torch.Tensor): Float32/float64/float16 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,\n            filter_width]` (non-separable), `[filter_taps]` (separable), or\n            `None` (identity).\n        up (int): Integer upsampling factor. Can be a single int or a\n            list/tuple `[x, y]`. 
Defaults to 2.\n        padding (int | tuple[int]): Padding with respect to the output.\n            Can be a single number or a list/tuple `[x, y]` or `[x_before,\n            x_after, y_before, y_after]`. Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation. Defaults\n            to False.\n        gain (int): Overall scaling factor for signal magnitude. Defaults to 1.\n        use_custom_op (bool): Whether to use customized op.\n            Defaults to True.\n\n    Returns:\n        torch.Tensor: Tensor of the shape `[batch_size, num_channels,\n        out_height, out_width]`\n    \"\"\"\n    upx, upy = _parse_scaling(up)\n    padx0, padx1, pady0, pady1 = _parse_padding(padding)\n    fw, fh = _get_filter_size(filter)\n    p = [\n        padx0 + (fw + upx - 1) // 2,\n        padx1 + (fw - upx) // 2,\n        pady0 + (fh + upy - 1) // 2,\n        pady1 + (fh - upy) // 2,\n    ]\n    return upfirdn2d(\n        input,\n        filter,\n        up=up,\n        padding=p,\n        flip_filter=flip_filter,\n        gain=gain * upx * upy,\n        use_custom_op=use_custom_op)\n\n\ndef downsample2d(input: torch.Tensor,\n                 filter: torch.Tensor,\n                 down: int = 2,\n                 padding: Union[int, List[int]] = 0,\n                 flip_filter: bool = False,\n                 gain: Union[float, int] = 1,\n                 use_custom_op: bool = True):\n    \"\"\"Downsample a batch of 2D images using the given 2D FIR filter.\n\n    By default, the result is padded so that its shape is a fraction of the\n    input.\n    User-specified padding is applied on top of that, with negative values\n    indicating cropping. 
Pixels outside the image are assumed to be zero.\n\n    Args:\n        input (torch.Tensor): Float32/float64/float16 input tensor of the shape\n            `[batch_size, num_channels, in_height, in_width]`.\n        filter (torch.Tensor): Float32 FIR filter of the shape `[filter_height,\n            filter_width]` (non-separable), `[filter_taps]` (separable), or\n            `None` (identity).\n        down (int): Integer downsampling factor. Can be a single int or a\n            list/tuple `[x, y]`. Defaults to 2.\n        padding (int | tuple[int]): Padding with respect to the input.\n            Can be a single number or a list/tuple `[x, y]` or `[x_before,\n            x_after, y_before, y_after]`. Defaults to 0.\n        flip_filter (bool): False = convolution, True = correlation. Defaults\n            to False.\n        gain (int): Overall scaling factor for signal magnitude. Defaults to 1.\n        use_custom_op (bool): Whether to use customized op.\n            Defaults to True.\n\n    Returns:\n        torch.Tensor: Tensor of the shape `[batch_size, num_channels,\n        out_height, out_width]`.\n    \"\"\"\n    downx, downy = _parse_scaling(down)\n    padx0, padx1, pady0, pady1 = _parse_padding(padding)\n    fw, fh = _get_filter_size(filter)\n    p = [\n        padx0 + (fw - downx + 1) // 2,\n        padx1 + (fw - downx) // 2,\n        pady0 + (fh - downy + 1) // 2,\n        pady1 + (fh - downy) // 2,\n    ]\n    return upfirdn2d(\n        input,\n        filter,\n        down=down,\n        padding=p,\n        flip_filter=flip_filter,\n        gain=gain,\n        use_custom_op=use_custom_op)\n"
  },
  {
    "path": "mmcv/ops/voxelize.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Any, List, Tuple, Union\n\nimport torch\nfrom torch import nn\nfrom torch.autograd import Function\nfrom torch.nn.modules.utils import _pair\n\nfrom ..utils import ext_loader\n\next_module = ext_loader.load_ext(\n    '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward'])\n\n\nclass _Voxelization(Function):\n\n    @staticmethod\n    def forward(\n            ctx: Any,\n            points: torch.Tensor,\n            voxel_size: Union[tuple, float],\n            coors_range: Union[tuple, float],\n            max_points: int = 35,\n            max_voxels: int = 20000,\n            deterministic: bool = True) -> Union[Tuple[torch.Tensor], Tuple]:\n        \"\"\"Convert kitti points(N, >=3) to voxels.\n\n        Args:\n            points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points\n                and points[:, 3:] contain other information like reflectivity.\n            voxel_size (tuple or float): The size of voxel with the shape of\n                [3].\n            coors_range (tuple or float): The coordinate range of voxel with\n                the shape of [6].\n            max_points (int, optional): maximum points contained in a voxel. if\n                max_points=-1, it means using dynamic_voxelize. Default: 35.\n            max_voxels (int, optional): maximum voxels this function create.\n                for second, 20000 is a good choice. Users should shuffle points\n                before call this function because max_voxels may drop points.\n                Default: 20000.\n            deterministic: bool. whether to invoke the non-deterministic\n                version of hard-voxelization implementations. non-deterministic\n                version is considerablly fast but is not deterministic. only\n                affects hard voxelization. default True. 
for more information\n                of this argument and the implementation insights, please refer\n                to the following links:\n                https://github.com/open-mmlab/mmdetection3d/issues/894\n                https://github.com/open-mmlab/mmdetection3d/pull/904\n                it is an experimental feature and we will appreciate it if\n                you could share with us the failing cases.\n\n        Returns:\n            tuple[torch.Tensor]: tuple[torch.Tensor]: A tuple contains three\n            elements. The first one is the output voxels with the shape of\n            [M, max_points, n_dim], which only contain points and returned\n            when max_points != -1. The second is the voxel coordinates with\n            shape of [M, 3]. The last is number of point per voxel with the\n            shape of [M], which only returned when max_points != -1.\n        \"\"\"\n        if max_points == -1 or max_voxels == -1:\n            coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)\n            ext_module.dynamic_voxelize_forward(\n                points,\n                torch.tensor(voxel_size, dtype=torch.float),\n                torch.tensor(coors_range, dtype=torch.float),\n                coors,\n                NDim=3)\n            return coors\n        else:\n            voxels = points.new_zeros(\n                size=(max_voxels, max_points, points.size(1)))\n            coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)\n            num_points_per_voxel = points.new_zeros(\n                size=(max_voxels, ), dtype=torch.int)\n            voxel_num = torch.zeros(size=(), dtype=torch.long)\n            ext_module.hard_voxelize_forward(\n                points,\n                torch.tensor(voxel_size, dtype=torch.float),\n                torch.tensor(coors_range, dtype=torch.float),\n                voxels,\n                coors,\n                num_points_per_voxel,\n                voxel_num,\n 
               max_points=max_points,\n                max_voxels=max_voxels,\n                NDim=3,\n                deterministic=deterministic)\n            # select the valid voxels\n            voxels_out = voxels[:voxel_num]\n            coors_out = coors[:voxel_num]\n            num_points_per_voxel_out = num_points_per_voxel[:voxel_num]\n            return voxels_out, coors_out, num_points_per_voxel_out\n\n\nvoxelization = _Voxelization.apply\n\n\nclass Voxelization(nn.Module):\n    \"\"\"Convert kitti points(N, >=3) to voxels.\n\n    Please refer to `Point-Voxel CNN for Efficient 3D Deep Learning\n    <https://arxiv.org/abs/1907.03739>`_ for more details.\n\n    Args:\n        voxel_size (tuple or float): The size of voxel with the shape of [3].\n        point_cloud_range (tuple or float): The coordinate range of voxel with\n            the shape of [6].\n        max_num_points (int): maximum points contained in a voxel. if\n            max_points=-1, it means using dynamic_voxelize.\n        max_voxels (int, optional): maximum voxels this function create.\n            for second, 20000 is a good choice. Users should shuffle points\n            before call this function because max_voxels may drop points.\n            Default: 20000.\n    \"\"\"\n\n    def __init__(self,\n                 voxel_size: List,\n                 point_cloud_range: List,\n                 max_num_points: int,\n                 max_voxels: Union[tuple, int] = 20000,\n                 deterministic: bool = True):\n        \"\"\"\n        Args:\n            voxel_size (list): list [x, y, z] size of three dimension\n            point_cloud_range (list):\n                [x_min, y_min, z_min, x_max, y_max, z_max]\n            max_num_points (int): max number of points per voxel\n            max_voxels (tuple or int): max number of voxels in\n                (training, testing) time\n            deterministic: bool. 
whether to invoke the non-deterministic\n                version of hard-voxelization implementations. non-deterministic\n                version is considerablly fast but is not deterministic. only\n                affects hard voxelization. default True. for more information\n                of this argument and the implementation insights, please refer\n                to the following links:\n                https://github.com/open-mmlab/mmdetection3d/issues/894\n                https://github.com/open-mmlab/mmdetection3d/pull/904\n                it is an experimental feature and we will appreciate it if\n                you could share with us the failing cases.\n        \"\"\"\n        super().__init__()\n\n        self.voxel_size = voxel_size\n        self.point_cloud_range = point_cloud_range\n        self.max_num_points = max_num_points\n        if isinstance(max_voxels, tuple):\n            self.max_voxels = max_voxels\n        else:\n            self.max_voxels = _pair(max_voxels)\n        self.deterministic = deterministic\n\n        point_cloud_range = torch.tensor(\n            point_cloud_range, dtype=torch.float32)\n        voxel_size = torch.tensor(voxel_size, dtype=torch.float32)\n        grid_size = (\n            point_cloud_range[3:] -  # type: ignore\n            point_cloud_range[:3]) / voxel_size  # type: ignore\n        grid_size = torch.round(grid_size).long()\n        input_feat_shape = grid_size[:2]\n        self.grid_size = grid_size\n        # the origin shape is as [x-len, y-len, z-len]\n        # [w, h, d] -> [d, h, w]\n        self.pcd_shape = [*input_feat_shape, 1][::-1]\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        if self.training:\n            max_voxels = self.max_voxels[0]\n        else:\n            max_voxels = self.max_voxels[1]\n\n        return voxelization(input, self.voxel_size, self.point_cloud_range,\n                            self.max_num_points, max_voxels,\n                            
self.deterministic)\n\n    def __repr__(self):\n        s = self.__class__.__name__ + '('\n        s += 'voxel_size=' + str(self.voxel_size)\n        s += ', point_cloud_range=' + str(self.point_cloud_range)\n        s += ', max_num_points=' + str(self.max_num_points)\n        s += ', max_voxels=' + str(self.max_voxels)\n        s += ', deterministic=' + str(self.deterministic)\n        s += ')'\n        return s\n"
  },
  {
    "path": "mmcv/transforms/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .base import BaseTransform\nfrom .builder import TRANSFORMS\nfrom .loading import LoadAnnotations, LoadImageFromFile\nfrom .processing import (CenterCrop, MultiScaleFlipAug, Normalize, Pad,\n                         RandomChoiceResize, RandomFlip, RandomGrayscale,\n                         RandomResize, Resize, TestTimeAug)\nfrom .wrappers import (Compose, KeyMapper, RandomApply, RandomChoice,\n                       TransformBroadcaster)\n\ntry:\n    import torch  # noqa: F401\nexcept ImportError:\n    __all__ = [\n        'BaseTransform', 'TRANSFORMS', 'TransformBroadcaster', 'Compose',\n        'RandomChoice', 'KeyMapper', 'LoadImageFromFile', 'LoadAnnotations',\n        'Normalize', 'Resize', 'Pad', 'RandomFlip', 'RandomChoiceResize',\n        'CenterCrop', 'RandomGrayscale', 'MultiScaleFlipAug', 'RandomResize',\n        'RandomApply', 'TestTimeAug'\n    ]\nelse:\n    from .formatting import ImageToTensor, ToTensor, to_tensor\n\n    __all__ = [\n        'BaseTransform', 'TRANSFORMS', 'TransformBroadcaster', 'Compose',\n        'RandomChoice', 'KeyMapper', 'LoadImageFromFile', 'LoadAnnotations',\n        'Normalize', 'Resize', 'Pad', 'ToTensor', 'to_tensor', 'ImageToTensor',\n        'RandomFlip', 'RandomChoiceResize', 'CenterCrop', 'RandomGrayscale',\n        'MultiScaleFlipAug', 'RandomResize', 'RandomApply', 'TestTimeAug'\n    ]\n"
  },
  {
    "path": "mmcv/transforms/base.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom abc import ABCMeta, abstractmethod\nfrom typing import Dict, List, Optional, Tuple, Union\n\n\nclass BaseTransform(metaclass=ABCMeta):\n    \"\"\"Base class for all transformations.\"\"\"\n\n    def __call__(self,\n                 results: Dict) -> Optional[Union[Dict, Tuple[List, List]]]:\n\n        return self.transform(results)\n\n    @abstractmethod\n    def transform(self,\n                  results: Dict) -> Optional[Union[Dict, Tuple[List, List]]]:\n        \"\"\"The transform function. All subclasses of BaseTransform should\n        override this method.\n\n        This function takes the result dict as the input, and can add new\n        items to the dict or modify existing items in the dict. And the result\n        dict will be returned in the end, which allows concatenating multiple\n        transforms into a pipeline.\n\n        Args:\n            results (dict): The result dict.\n\n        Returns:\n            dict: The result dict.\n        \"\"\"\n"
  },
  {
    "path": "mmcv/transforms/builder.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom mmengine.registry import TRANSFORMS  # noqa: F401\n"
  },
  {
    "path": "mmcv/transforms/formatting.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Sequence, Union\n\nimport mmengine\nimport numpy as np\nimport torch\n\nfrom .base import BaseTransform\nfrom .builder import TRANSFORMS\n\n\ndef to_tensor(\n    data: Union[torch.Tensor, np.ndarray, Sequence, int,\n                float]) -> torch.Tensor:\n    \"\"\"Convert objects of various python types to :obj:`torch.Tensor`.\n\n    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,\n    :class:`Sequence`, :class:`int` and :class:`float`.\n\n    Args:\n        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to\n            be converted.\n\n    Returns:\n        torch.Tensor: the converted data.\n    \"\"\"\n\n    if isinstance(data, torch.Tensor):\n        return data\n    elif isinstance(data, np.ndarray):\n        return torch.from_numpy(data)\n    elif isinstance(data, Sequence) and not mmengine.is_str(data):\n        return torch.tensor(data)\n    elif isinstance(data, int):\n        return torch.LongTensor([data])\n    elif isinstance(data, float):\n        return torch.FloatTensor([data])\n    else:\n        raise TypeError(f'type {type(data)} cannot be converted to tensor.')\n\n\n@TRANSFORMS.register_module()\nclass ToTensor(BaseTransform):\n    \"\"\"Convert some results to :obj:`torch.Tensor` by given keys.\n\n    Required keys:\n\n    - all these keys in `keys`\n\n    Modified Keys:\n\n    - all these keys in `keys`\n\n    Args:\n        keys (Sequence[str]): Keys that need to be converted to Tensor.\n    \"\"\"\n\n    def __init__(self, keys: Sequence[str]) -> None:\n        self.keys = keys\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Transform function to convert data to `torch.Tensor`.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n        Returns:\n            dict: `keys` in results will be updated.\n        \"\"\"\n        for key in self.keys:\n\n            key_list = 
key.split('.')\n            cur_item = results\n            for i in range(len(key_list)):\n                if key_list[i] not in cur_item:\n                    raise KeyError(f'Can not find key {key}')\n                if i == len(key_list) - 1:\n                    cur_item[key_list[i]] = to_tensor(cur_item[key_list[i]])\n                    break\n                cur_item = cur_item[key_list[i]]\n\n        return results\n\n    def __repr__(self) -> str:\n        return self.__class__.__name__ + f'(keys={self.keys})'\n\n\n@TRANSFORMS.register_module()\nclass ImageToTensor(BaseTransform):\n    \"\"\"Convert image to :obj:`torch.Tensor` by given keys.\n\n    The dimension order of input image is (H, W, C). The pipeline will convert\n    it to (C, H, W). If only 2 dimensions (H, W) are given, the output would be\n    (1, H, W).\n\n    Required keys:\n\n    - all these keys in `keys`\n\n    Modified Keys:\n\n    - all these keys in `keys`\n\n    Args:\n        keys (Sequence[str]): Keys of images to be converted to Tensor.\n    \"\"\"\n\n    def __init__(self, keys: dict) -> None:\n        self.keys = keys\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Transform function to convert image in results to\n        :obj:`torch.Tensor` and transpose the channel order.\n\n        Args:\n            results (dict): Result dict contains the image data to convert.\n        Returns:\n            dict: The result dict contains the image converted\n            to :obj:`torch.Tensor` and transposed to (C, H, W) order.\n        \"\"\"\n        for key in self.keys:\n            img = results[key]\n            if len(img.shape) < 3:\n                img = np.expand_dims(img, -1)\n            results[key] = (to_tensor(img.transpose(2, 0, 1))).contiguous()\n        return results\n\n    def __repr__(self) -> str:\n        return self.__class__.__name__ + f'(keys={self.keys})'\n"
  },
  {
    "path": "mmcv/transforms/loading.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nfrom typing import Optional\n\nimport mmengine.fileio as fileio\nimport numpy as np\n\nimport mmcv\nfrom .base import BaseTransform\nfrom .builder import TRANSFORMS\n\n\n@TRANSFORMS.register_module()\nclass LoadImageFromFile(BaseTransform):\n    \"\"\"Load an image from file.\n\n    Required Keys:\n\n    - img_path\n\n    Modified Keys:\n\n    - img\n    - img_shape\n    - ori_shape\n\n    Args:\n        to_float32 (bool): Whether to convert the loaded image to a float32\n            numpy array. If set to False, the loaded image is an uint8 array.\n            Defaults to False.\n        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.\n            Defaults to 'color'.\n        imdecode_backend (str): The image decoding backend type. The backend\n            argument for :func:`mmcv.imfrombytes`.\n            See :func:`mmcv.imfrombytes` for details.\n            Defaults to 'cv2'.\n        file_client_args (dict, optional): Arguments to instantiate a\n            FileClient. See :class:`mmengine.fileio.FileClient` for details.\n            Defaults to None. It will be deprecated in future. Please use\n            ``backend_args`` instead.\n            Deprecated in version 2.0.0rc4.\n        ignore_empty (bool): Whether to allow loading empty image or file path\n            not existent. Defaults to False.\n        backend_args (dict, optional): Instantiates the corresponding file\n            backend. It may contain `backend` key to specify the file\n            backend. If it contains, the file backend corresponding to this\n            value will be used and initialized with the remaining values,\n            otherwise the corresponding file backend will be selected\n            based on the prefix of the file path. 
Defaults to None.\n            New in version 2.0.0rc4.\n    \"\"\"\n\n    def __init__(self,\n                 to_float32: bool = False,\n                 color_type: str = 'color',\n                 imdecode_backend: str = 'cv2',\n                 file_client_args: Optional[dict] = None,\n                 ignore_empty: bool = False,\n                 *,\n                 backend_args: Optional[dict] = None) -> None:\n        self.ignore_empty = ignore_empty\n        self.to_float32 = to_float32\n        self.color_type = color_type\n        self.imdecode_backend = imdecode_backend\n\n        self.file_client_args: Optional[dict] = None\n        self.backend_args: Optional[dict] = None\n        if file_client_args is not None:\n            warnings.warn(\n                '\"file_client_args\" will be deprecated in future. '\n                'Please use \"backend_args\" instead', DeprecationWarning)\n            if backend_args is not None:\n                raise ValueError(\n                    '\"file_client_args\" and \"backend_args\" cannot be set '\n                    'at the same time.')\n\n            self.file_client_args = file_client_args.copy()\n        if backend_args is not None:\n            self.backend_args = backend_args.copy()\n\n    def transform(self, results: dict) -> Optional[dict]:\n        \"\"\"Functions to load image.\n\n        Args:\n            results (dict): Result dict from\n                :class:`mmengine.dataset.BaseDataset`.\n\n        Returns:\n            dict: The dict contains loaded image and meta information.\n        \"\"\"\n\n        filename = results['img_path']\n        try:\n            if self.file_client_args is not None:\n                file_client = fileio.FileClient.infer_client(\n                    self.file_client_args, filename)\n                img_bytes = file_client.get(filename)\n            else:\n                img_bytes = fileio.get(\n                    filename, backend_args=self.backend_args)\n   
         img = mmcv.imfrombytes(\n                img_bytes, flag=self.color_type, backend=self.imdecode_backend)\n        except Exception as e:\n            if self.ignore_empty:\n                return None\n            else:\n                raise e\n        # in some cases, images are not read successfully, the img would be\n        # `None`, refer to https://github.com/open-mmlab/mmpretrain/issues/1427\n        assert img is not None, f'failed to load image: {filename}'\n        if self.to_float32:\n            img = img.astype(np.float32)\n\n        results['img'] = img\n        results['img_shape'] = img.shape[:2]\n        results['ori_shape'] = img.shape[:2]\n        return results\n\n    def __repr__(self):\n        repr_str = (f'{self.__class__.__name__}('\n                    f'ignore_empty={self.ignore_empty}, '\n                    f'to_float32={self.to_float32}, '\n                    f\"color_type='{self.color_type}', \"\n                    f\"imdecode_backend='{self.imdecode_backend}', \")\n\n        if self.file_client_args is not None:\n            repr_str += f'file_client_args={self.file_client_args})'\n        else:\n            repr_str += f'backend_args={self.backend_args})'\n\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass LoadAnnotations(BaseTransform):\n    \"\"\"Load and process the ``instances`` and ``seg_map`` annotation provided\n    by dataset.\n\n    The annotation format is as the following:\n\n    .. code-block:: python\n\n        {\n            'instances':\n            [\n                {\n                # List of 4 numbers representing the bounding box of the\n                # instance, in (x1, y1, x2, y2) order.\n                'bbox': [x1, y1, x2, y2],\n\n                # Label of image classification.\n                'bbox_label': 1,\n\n                # Used in key point detection.\n                # Can only load the format of [x1, y1, v1,…, xn, yn, vn]. 
v[i]\n                # means the visibility of this keypoint. n must be equal to the\n                # number of keypoint categories.\n                'keypoints': [x1, y1, v1, ..., xn, yn, vn]\n                }\n            ]\n            # Filename of semantic or panoptic segmentation ground truth file.\n            'seg_map_path': 'a/b/c'\n        }\n\n    After this module, the annotation has been changed to the format below:\n\n    .. code-block:: python\n\n        {\n            # In (x1, y1, x2, y2) order, float type. N is the number of bboxes\n            # in np.float32\n            'gt_bboxes': np.ndarray(N, 4)\n             # In np.int64 type.\n            'gt_bboxes_labels': np.ndarray(N, )\n             # In uint8 type.\n            'gt_seg_map': np.ndarray (H, W)\n             # with (x, y, v) order, in np.float32 type.\n            'gt_keypoints': np.ndarray(N, NK, 3)\n        }\n\n    Required Keys:\n\n    - instances\n\n      - bbox (optional)\n      - bbox_label\n      - keypoints (optional)\n\n    - seg_map_path (optional)\n\n    Added Keys:\n\n    - gt_bboxes (np.float32)\n    - gt_bboxes_labels (np.int64)\n    - gt_seg_map (np.uint8)\n    - gt_keypoints (np.float32)\n\n    Args:\n        with_bbox (bool): Whether to parse and load the bbox annotation.\n            Defaults to True.\n        with_label (bool): Whether to parse and load the label annotation.\n            Defaults to True.\n        with_seg (bool): Whether to parse and load the semantic segmentation\n            annotation. Defaults to False.\n        with_keypoints (bool): Whether to parse and load the keypoints\n            annotation. Defaults to False.\n        imdecode_backend (str): The image decoding backend type. The backend\n            argument for :func:`mmcv.imfrombytes`.\n            See :func:`mmcv.imfrombytes` for details.\n            Defaults to 'cv2'.\n        file_client_args (dict, optional): Arguments to instantiate a\n            FileClient. 
See :class:`mmengine.fileio.FileClient` for details.\n            Defaults to None. It will be deprecated in future. Please use\n            ``backend_args`` instead.\n            Deprecated in version 2.0.0rc4.\n        backend_args (dict, optional): Instantiates the corresponding file\n            backend. It may contain `backend` key to specify the file\n            backend. If it contains, the file backend corresponding to this\n            value will be used and initialized with the remaining values,\n            otherwise the corresponding file backend will be selected\n            based on the prefix of the file path. Defaults to None.\n            New in version 2.0.0rc4.\n    \"\"\"\n\n    def __init__(\n        self,\n        with_bbox: bool = True,\n        with_label: bool = True,\n        with_seg: bool = False,\n        with_keypoints: bool = False,\n        imdecode_backend: str = 'cv2',\n        file_client_args: Optional[dict] = None,\n        *,\n        backend_args: Optional[dict] = None,\n    ) -> None:\n        super().__init__()\n        self.with_bbox = with_bbox\n        self.with_label = with_label\n        self.with_seg = with_seg\n        self.with_keypoints = with_keypoints\n        self.imdecode_backend = imdecode_backend\n\n        self.file_client_args: Optional[dict] = None\n        self.backend_args: Optional[dict] = None\n        if file_client_args is not None:\n            warnings.warn(\n                '\"file_client_args\" will be deprecated in future. 
'\n                'Please use \"backend_args\" instead', DeprecationWarning)\n            if backend_args is not None:\n                raise ValueError(\n                    '\"file_client_args\" and \"backend_args\" cannot be set '\n                    'at the same time.')\n\n            self.file_client_args = file_client_args.copy()\n        if backend_args is not None:\n            self.backend_args = backend_args.copy()\n\n    def _load_bboxes(self, results: dict) -> None:\n        \"\"\"Private function to load bounding box annotations.\n\n        Args:\n            results (dict): Result dict from\n                :class:`mmengine.dataset.BaseDataset`.\n\n        Returns:\n            dict: The dict contains loaded bounding box annotations.\n        \"\"\"\n        gt_bboxes = []\n        for instance in results['instances']:\n            gt_bboxes.append(instance['bbox'])\n        results['gt_bboxes'] = np.array(\n            gt_bboxes, dtype=np.float32).reshape(-1, 4)\n\n    def _load_labels(self, results: dict) -> None:\n        \"\"\"Private function to load label annotations.\n\n        Args:\n            results (dict): Result dict from\n                :class:`mmengine.dataset.BaseDataset`.\n\n        Returns:\n            dict: The dict contains loaded label annotations.\n        \"\"\"\n        gt_bboxes_labels = []\n        for instance in results['instances']:\n            gt_bboxes_labels.append(instance['bbox_label'])\n        results['gt_bboxes_labels'] = np.array(\n            gt_bboxes_labels, dtype=np.int64)\n\n    def _load_seg_map(self, results: dict) -> None:\n        \"\"\"Private function to load semantic segmentation annotations.\n\n        Args:\n            results (dict): Result dict from\n                :class:`mmengine.dataset.BaseDataset`.\n\n        Returns:\n            dict: The dict contains loaded semantic segmentation annotations.\n        \"\"\"\n        if self.file_client_args is not None:\n            file_client = 
fileio.FileClient.infer_client(\n                self.file_client_args, results['seg_map_path'])\n            img_bytes = file_client.get(results['seg_map_path'])\n        else:\n            img_bytes = fileio.get(\n                results['seg_map_path'], backend_args=self.backend_args)\n\n        results['gt_seg_map'] = mmcv.imfrombytes(\n            img_bytes, flag='unchanged',\n            backend=self.imdecode_backend).squeeze()\n\n    def _load_kps(self, results: dict) -> None:\n        \"\"\"Private function to load keypoints annotations.\n\n        Args:\n            results (dict): Result dict from\n                :class:`mmengine.dataset.BaseDataset`.\n\n        Returns:\n            dict: The dict contains loaded keypoints annotations.\n        \"\"\"\n        gt_keypoints = []\n        for instance in results['instances']:\n            gt_keypoints.append(instance['keypoints'])\n        results['gt_keypoints'] = np.array(gt_keypoints, np.float32).reshape(\n            (len(gt_keypoints), -1, 3))\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Function to load multiple types annotations.\n\n        Args:\n            results (dict): Result dict from\n                :class:`mmengine.dataset.BaseDataset`.\n\n        Returns:\n            dict: The dict contains loaded bounding box, label and\n            semantic segmentation and keypoints annotations.\n        \"\"\"\n\n        if self.with_bbox:\n            self._load_bboxes(results)\n        if self.with_label:\n            self._load_labels(results)\n        if self.with_seg:\n            self._load_seg_map(results)\n        if self.with_keypoints:\n            self._load_kps(results)\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(with_bbox={self.with_bbox}, '\n        repr_str += f'with_label={self.with_label}, '\n        repr_str += f'with_seg={self.with_seg}, '\n        repr_str += 
f'with_keypoints={self.with_keypoints}, '\n        repr_str += f\"imdecode_backend='{self.imdecode_backend}', \"\n\n        if self.file_client_args is not None:\n            repr_str += f'file_client_args={self.file_client_args})'\n        else:\n            repr_str += f'backend_args={self.backend_args})'\n\n        return repr_str\n"
  },
  {
    "path": "mmcv/transforms/processing.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport random\nimport warnings\nfrom itertools import product\nfrom typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union\n\nimport mmengine\nimport numpy as np\n\nimport mmcv\nfrom mmcv.image.geometric import _scale_size\nfrom .base import BaseTransform\nfrom .builder import TRANSFORMS\nfrom .utils import cache_randomness\nfrom .wrappers import Compose\n\nNumber = Union[int, float]\n\n\n@TRANSFORMS.register_module()\nclass Normalize(BaseTransform):\n    \"\"\"Normalize the image.\n\n    Required Keys:\n\n    - img\n\n    Modified Keys:\n\n    - img\n\n    Added Keys:\n\n    - img_norm_cfg\n\n      - mean\n      - std\n      - to_rgb\n\n\n    Args:\n        mean (sequence): Mean values of 3 channels.\n        std (sequence): Std values of 3 channels.\n        to_rgb (bool): Whether to convert the image from BGR to RGB before\n            normlizing the image. If ``to_rgb=True``, the order of mean and std\n            should be RGB. If ``to_rgb=False``, the order of mean and std\n            should be the same order of the image. 
Defaults to True.\n    \"\"\"\n\n    def __init__(self,\n                 mean: Sequence[Number],\n                 std: Sequence[Number],\n                 to_rgb: bool = True) -> None:\n        self.mean = np.array(mean, dtype=np.float32)\n        self.std = np.array(std, dtype=np.float32)\n        self.to_rgb = to_rgb\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Function to normalize images.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n\n        Returns:\n            dict: Normalized results, key 'img_norm_cfg' key is added in to\n            result dict.\n        \"\"\"\n\n        results['img'] = mmcv.imnormalize(results['img'], self.mean, self.std,\n                                          self.to_rgb)\n        results['img_norm_cfg'] = dict(\n            mean=self.mean, std=self.std, to_rgb=self.to_rgb)\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass Resize(BaseTransform):\n    \"\"\"Resize images & bbox & seg & keypoints.\n\n    This transform resizes the input image according to ``scale`` or\n    ``scale_factor``. Bboxes, seg map and keypoints are then resized with the\n    same scale factor.\n    if ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to\n    resize.\n\n    Required Keys:\n\n    - img\n    - gt_bboxes (optional)\n    - gt_seg_map (optional)\n    - gt_keypoints (optional)\n\n    Modified Keys:\n\n    - img\n    - gt_bboxes\n    - gt_seg_map\n    - gt_keypoints\n    - img_shape\n\n    Added Keys:\n\n    - scale\n    - scale_factor\n    - keep_ratio\n\n    Args:\n        scale (int or tuple): Images scales for resizing. 
Defaults to None\n        scale_factor (float or tuple[float]): Scale factors for resizing.\n            Defaults to None.\n        keep_ratio (bool): Whether to keep the aspect ratio when resizing the\n            image. Defaults to False.\n        clip_object_border (bool): Whether to clip the objects\n            outside the border of the image. In some dataset like MOT17, the gt\n            bboxes are allowed to cross the border of images. Therefore, we\n            don't need to clip the gt bboxes in these cases. Defaults to True.\n        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.\n            These two backends generates slightly different results. Defaults\n            to 'cv2'.\n        interpolation (str): Interpolation method, accepted values are\n            \"nearest\", \"bilinear\", \"bicubic\", \"area\", \"lanczos\" for 'cv2'\n            backend, \"nearest\", \"bilinear\" for 'pillow' backend. Defaults\n            to 'bilinear'.\n    \"\"\"\n\n    def __init__(self,\n                 scale: Optional[Union[int, Tuple[int, int]]] = None,\n                 scale_factor: Optional[Union[float, Tuple[float,\n                                                           float]]] = None,\n                 keep_ratio: bool = False,\n                 clip_object_border: bool = True,\n                 backend: str = 'cv2',\n                 interpolation='bilinear') -> None:\n        assert scale is not None or scale_factor is not None, (\n            '`scale` and'\n            '`scale_factor` can not both be `None`')\n        if scale is None:\n            self.scale = None\n        else:\n            if isinstance(scale, int):\n                self.scale = (scale, scale)\n            else:\n                self.scale = scale\n\n        self.backend = backend\n        self.interpolation = interpolation\n        self.keep_ratio = keep_ratio\n        self.clip_object_border = clip_object_border\n        if scale_factor is None:\n        
    self.scale_factor = None\n        elif isinstance(scale_factor, float):\n            self.scale_factor = (scale_factor, scale_factor)\n        elif isinstance(scale_factor, tuple):\n            assert (len(scale_factor)) == 2\n            self.scale_factor = scale_factor\n        else:\n            raise TypeError(\n                f'expect scale_factor is float or Tuple(float), but'\n                f'get {type(scale_factor)}')\n\n    def _resize_img(self, results: dict) -> None:\n        \"\"\"Resize images with ``results['scale']``.\"\"\"\n\n        if results.get('img', None) is not None:\n            if self.keep_ratio:\n                img, scale_factor = mmcv.imrescale(\n                    results['img'],\n                    results['scale'],\n                    interpolation=self.interpolation,\n                    return_scale=True,\n                    backend=self.backend)\n                # the w_scale and h_scale has minor difference\n                # a real fix should be done in the mmcv.imrescale in the future\n                new_h, new_w = img.shape[:2]\n                h, w = results['img'].shape[:2]\n                w_scale = new_w / w\n                h_scale = new_h / h\n            else:\n                img, w_scale, h_scale = mmcv.imresize(\n                    results['img'],\n                    results['scale'],\n                    interpolation=self.interpolation,\n                    return_scale=True,\n                    backend=self.backend)\n            results['img'] = img\n            results['img_shape'] = img.shape[:2]\n            results['scale_factor'] = (w_scale, h_scale)\n            results['keep_ratio'] = self.keep_ratio\n\n    def _resize_bboxes(self, results: dict) -> None:\n        \"\"\"Resize bounding boxes with ``results['scale_factor']``.\"\"\"\n        if results.get('gt_bboxes', None) is not None:\n            bboxes = results['gt_bboxes'] * np.tile(\n                np.array(results['scale_factor']), 
2)\n            if self.clip_object_border:\n                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0,\n                                          results['img_shape'][1])\n                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0,\n                                          results['img_shape'][0])\n            results['gt_bboxes'] = bboxes\n\n    def _resize_seg(self, results: dict) -> None:\n        \"\"\"Resize semantic segmentation map with ``results['scale']``.\"\"\"\n        if results.get('gt_seg_map', None) is not None:\n            if self.keep_ratio:\n                gt_seg = mmcv.imrescale(\n                    results['gt_seg_map'],\n                    results['scale'],\n                    interpolation='nearest',\n                    backend=self.backend)\n            else:\n                gt_seg = mmcv.imresize(\n                    results['gt_seg_map'],\n                    results['scale'],\n                    interpolation='nearest',\n                    backend=self.backend)\n            results['gt_seg_map'] = gt_seg\n\n    def _resize_keypoints(self, results: dict) -> None:\n        \"\"\"Resize keypoints with ``results['scale_factor']``.\"\"\"\n        if results.get('gt_keypoints', None) is not None:\n            keypoints = results['gt_keypoints']\n\n            keypoints[:, :, :2] = keypoints[:, :, :2] * np.array(\n                results['scale_factor'])\n            if self.clip_object_border:\n                keypoints[:, :, 0] = np.clip(keypoints[:, :, 0], 0,\n                                             results['img_shape'][1])\n                keypoints[:, :, 1] = np.clip(keypoints[:, :, 1], 0,\n                                             results['img_shape'][0])\n            results['gt_keypoints'] = keypoints\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Transform function to resize images, bounding boxes, semantic\n        segmentation map and keypoints.\n\n        Args:\n            results (dict): 
Result dict from loading pipeline.\n        Returns:\n            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',\n            'gt_keypoints', 'scale', 'scale_factor', 'img_shape',\n            and 'keep_ratio' keys are updated in result dict.\n        \"\"\"\n\n        if self.scale:\n            results['scale'] = self.scale\n        else:\n            img_shape = results['img'].shape[:2]\n            results['scale'] = _scale_size(img_shape[::-1],\n                                           self.scale_factor)  # type: ignore\n        self._resize_img(results)\n        self._resize_bboxes(results)\n        self._resize_seg(results)\n        self._resize_keypoints(results)\n        return results\n\n    def __repr__(self):\n        repr_str = self.__class__.__name__\n        repr_str += f'(scale={self.scale}, '\n        repr_str += f'scale_factor={self.scale_factor}, '\n        repr_str += f'keep_ratio={self.keep_ratio}, '\n        repr_str += f'clip_object_border={self.clip_object_border}), '\n        repr_str += f'backend={self.backend}), '\n        repr_str += f'interpolation={self.interpolation})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass Pad(BaseTransform):\n    \"\"\"Pad the image & segmentation map.\n\n    There are three padding modes: (1) pad to a fixed size, (2) pad to the\n    minimum size that is divisible by some number, and (3) pad to square. Also,\n    pad to square and pad to the minimum size can be used at the same time.\n\n    Required Keys:\n\n    - img\n    - gt_bboxes (optional)\n    - gt_seg_map (optional)\n\n    Modified Keys:\n\n    - img\n    - gt_seg_map\n    - img_shape\n\n    Added Keys:\n\n    - pad_shape\n    - pad_fixed_size\n    - pad_size_divisor\n\n    Args:\n        size (tuple, optional): Fixed padding size.\n            Expected padding shape (w, h). Defaults to None.\n        size_divisor (int, optional): The divisor of padded size. 
Defaults to\n            None.\n        pad_to_square (bool): Whether to pad the image into a square.\n            Currently only used for YOLOX. Defaults to False.\n        pad_val (Number | dict[str, Number], optional): Padding value for if\n            the pad_mode is \"constant\". If it is a single number, the value\n            to pad the image is the number and to pad the semantic\n            segmentation map is 255. If it is a dict, it should have the\n            following keys:\n\n            - img: The value to pad the image.\n            - seg: The value to pad the semantic segmentation map.\n\n            Defaults to dict(img=0, seg=255).\n        padding_mode (str): Type of padding. Should be: constant, edge,\n            reflect or symmetric. Defaults to 'constant'.\n\n            - constant: pads with a constant value, this value is specified\n              with pad_val.\n            - edge: pads with the last value at the edge of the image.\n            - reflect: pads with reflection of image without repeating the last\n              value on the edge. For example, padding [1, 2, 3, 4] with 2\n              elements on both sides in reflect mode will result in\n              [3, 2, 1, 2, 3, 4, 3, 2].\n            - symmetric: pads with reflection of image repeating the last value\n              on the edge. 
For example, padding [1, 2, 3, 4] with 2 elements on\n              both sides in symmetric mode will result in\n              [2, 1, 1, 2, 3, 4, 4, 3]\n    \"\"\"\n\n    def __init__(self,\n                 size: Optional[Tuple[int, int]] = None,\n                 size_divisor: Optional[int] = None,\n                 pad_to_square: bool = False,\n                 pad_val: Union[Number, dict] = dict(img=0, seg=255),\n                 padding_mode: str = 'constant') -> None:\n        self.size = size\n        self.size_divisor = size_divisor\n        if isinstance(pad_val, int):\n            pad_val = dict(img=pad_val, seg=255)\n        assert isinstance(pad_val, dict), 'pad_val '\n        self.pad_val = pad_val\n        self.pad_to_square = pad_to_square\n\n        if pad_to_square:\n            assert size is None, \\\n                'The size and size_divisor must be None ' \\\n                'when pad2square is True'\n        else:\n            assert size is not None or size_divisor is not None, \\\n                'only one of size and size_divisor should be valid'\n            assert size is None or size_divisor is None\n        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']\n        self.padding_mode = padding_mode\n\n    def _pad_img(self, results: dict) -> None:\n        \"\"\"Pad images according to ``self.size``.\"\"\"\n        pad_val = self.pad_val.get('img', 0)\n\n        size = None\n        if self.pad_to_square:\n            max_size = max(results['img'].shape[:2])\n            size = (max_size, max_size)\n        if self.size_divisor is not None:\n            if size is None:\n                size = (results['img'].shape[0], results['img'].shape[1])\n            pad_h = int(np.ceil(\n                size[0] / self.size_divisor)) * self.size_divisor\n            pad_w = int(np.ceil(\n                size[1] / self.size_divisor)) * self.size_divisor\n            size = (pad_h, pad_w)\n        elif self.size is not None:\n      
      size = self.size[::-1]\n        if isinstance(pad_val, int) and results['img'].ndim == 3:\n            pad_val = tuple(pad_val for _ in range(results['img'].shape[2]))\n        padded_img = mmcv.impad(\n            results['img'],\n            shape=size,\n            pad_val=pad_val,\n            padding_mode=self.padding_mode)\n\n        results['img'] = padded_img\n        results['pad_shape'] = padded_img.shape\n        results['pad_fixed_size'] = self.size\n        results['pad_size_divisor'] = self.size_divisor\n        results['img_shape'] = padded_img.shape[:2]\n\n    def _pad_seg(self, results: dict) -> None:\n        \"\"\"Pad semantic segmentation map according to\n        ``results['pad_shape']``.\"\"\"\n        if results.get('gt_seg_map', None) is not None:\n            pad_val = self.pad_val.get('seg', 255)\n            if isinstance(pad_val, int) and results['gt_seg_map'].ndim == 3:\n                pad_val = tuple(\n                    pad_val for _ in range(results['gt_seg_map'].shape[2]))\n            results['gt_seg_map'] = mmcv.impad(\n                results['gt_seg_map'],\n                shape=results['pad_shape'][:2],\n                pad_val=pad_val,\n                padding_mode=self.padding_mode)\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Call function to pad images, masks, semantic segmentation maps.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n\n        Returns:\n            dict: Updated result dict.\n        \"\"\"\n        self._pad_img(results)\n        self._pad_seg(results)\n        return results\n\n    def __repr__(self):\n        repr_str = self.__class__.__name__\n        repr_str += f'(size={self.size}, '\n        repr_str += f'size_divisor={self.size_divisor}, '\n        repr_str += f'pad_to_square={self.pad_to_square}, '\n        repr_str += f'pad_val={self.pad_val}), '\n        repr_str += f'padding_mode={self.padding_mode})'\n        return 
repr_str\n\n\n@TRANSFORMS.register_module()\nclass CenterCrop(BaseTransform):\n    \"\"\"Crop the center of the image, segmentation masks, bounding boxes and key\n    points. If the crop area exceeds the original image and ``auto_pad`` is\n    True, the original image will be padded before cropping.\n\n    Required Keys:\n\n    - img\n    - gt_seg_map (optional)\n    - gt_bboxes (optional)\n    - gt_keypoints (optional)\n\n    Modified Keys:\n\n    - img\n    - img_shape\n    - gt_seg_map (optional)\n    - gt_bboxes (optional)\n    - gt_keypoints (optional)\n\n    Added Key:\n\n    - pad_shape\n\n\n    Args:\n        crop_size (Union[int, Tuple[int, int]]):  Expected size after cropping\n            with the format of (w, h). If set to an integer, then cropping\n            width and height are equal to this integer.\n        auto_pad (bool): Whether to pad the image if it's smaller than the\n            ``crop_size``. Defaults to False.\n        pad_cfg (dict): Base config for padding. Refer to ``mmcv.Pad`` for\n            detail. Defaults to ``dict(type='Pad')``.\n        clip_object_border (bool): Whether to clip the objects\n            outside the border of the image. In some dataset like MOT17, the\n            gt bboxes are allowed to cross the border of images. 
Therefore,\n            we don't need to clip the gt bboxes in these cases.\n            Defaults to True.\n    \"\"\"\n\n    def __init__(self,\n                 crop_size: Union[int, Tuple[int, int]],\n                 auto_pad: bool = False,\n                 pad_cfg: dict = dict(type='Pad'),\n                 clip_object_border: bool = True) -> None:\n        super().__init__()\n        assert isinstance(crop_size, int) or (\n            isinstance(crop_size, tuple) and len(crop_size) == 2\n        ), 'The expected crop_size is an integer, or a tuple containing two '\n        'intergers'\n\n        if isinstance(crop_size, int):\n            crop_size = (crop_size, crop_size)\n        assert crop_size[0] > 0 and crop_size[1] > 0\n        self.crop_size = crop_size\n        self.auto_pad = auto_pad\n\n        self.pad_cfg = pad_cfg.copy()\n        # size will be overwritten\n        if 'size' in self.pad_cfg and auto_pad:\n            warnings.warn('``size`` is set in ``pad_cfg``,'\n                          'however this argument will be overwritten'\n                          ' according to crop size and image size')\n\n        self.clip_object_border = clip_object_border\n\n    def _crop_img(self, results: dict, bboxes: np.ndarray) -> None:\n        \"\"\"Crop image.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.\n        \"\"\"\n        if results.get('img', None) is not None:\n            img = mmcv.imcrop(results['img'], bboxes=bboxes)\n            img_shape = img.shape[:2]  # type: ignore\n            results['img'] = img\n            results['img_shape'] = img_shape\n            results['pad_shape'] = img_shape\n\n    def _crop_seg_map(self, results: dict, bboxes: np.ndarray) -> None:\n        \"\"\"Crop semantic segmentation map.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n            bboxes 
(np.ndarray): Shape (4, ), location of cropped bboxes.\n        \"\"\"\n        if results.get('gt_seg_map', None) is not None:\n            img = mmcv.imcrop(results['gt_seg_map'], bboxes=bboxes)\n            results['gt_seg_map'] = img\n\n    def _crop_bboxes(self, results: dict, bboxes: np.ndarray) -> None:\n        \"\"\"Update bounding boxes according to CenterCrop.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.\n        \"\"\"\n        if 'gt_bboxes' in results:\n            offset_w = bboxes[0]\n            offset_h = bboxes[1]\n            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h])\n            # gt_bboxes has shape (num_gts, 4) in (tl_x, tl_y, br_x, br_y)\n            # order.\n            gt_bboxes = results['gt_bboxes'] - bbox_offset\n            if self.clip_object_border:\n                gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0,\n                                             results['img'].shape[1])\n                gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0,\n                                             results['img'].shape[0])\n            results['gt_bboxes'] = gt_bboxes\n\n    def _crop_keypoints(self, results: dict, bboxes: np.ndarray) -> None:\n        \"\"\"Update key points according to CenterCrop. 
Keypoints outside the\n        cropped image will be set invisible.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n            bboxes (np.ndarray): Shape (4, ), location of cropped bboxes.\n        \"\"\"\n        if 'gt_keypoints' in results:\n            offset_w = bboxes[0]\n            offset_h = bboxes[1]\n            keypoints_offset = np.array([offset_w, offset_h, 0])\n            # gt_keypoints has shape (N, NK, 3) in (x, y, visibility) order,\n            # NK = number of points per object\n            gt_keypoints = results['gt_keypoints'] - keypoints_offset\n            # set gt_keypoints outside the result image to invisible\n            height, width = results['img'].shape[:2]\n            valid_pos = (gt_keypoints[:, :, 0] >=\n                         0) * (gt_keypoints[:, :, 0] <\n                               width) * (gt_keypoints[:, :, 1] >= 0) * (\n                                   gt_keypoints[:, :, 1] < height)\n            gt_keypoints[:, :, 2] = np.where(valid_pos, gt_keypoints[:, :, 2],\n                                             0)\n            gt_keypoints[:, :, 0] = np.clip(gt_keypoints[:, :, 0], 0,\n                                            results['img'].shape[1])\n            gt_keypoints[:, :, 1] = np.clip(gt_keypoints[:, :, 1], 0,\n                                            results['img'].shape[0])\n            results['gt_keypoints'] = gt_keypoints\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Apply center crop on results.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n\n        Returns:\n            dict: Results with CenterCropped image and semantic segmentation\n            map.\n        \"\"\"\n        crop_width, crop_height = self.crop_size[0], self.crop_size[1]\n\n        assert 'img' in results, '`img` is not found in results'\n        img = results['img']\n        # img.shape has length 2 for grayscale, length 3 
for color\n        img_height, img_width = img.shape[:2]\n\n        if crop_height > img_height or crop_width > img_width:\n            if self.auto_pad:\n                # pad the area\n                img_height = max(img_height, crop_height)\n                img_width = max(img_width, crop_width)\n                pad_size = (img_width, img_height)\n                _pad_cfg = self.pad_cfg.copy()\n                _pad_cfg.update(dict(size=pad_size))\n                pad_transform = TRANSFORMS.build(_pad_cfg)\n                results = pad_transform(results)\n            else:\n                crop_height = min(crop_height, img_height)\n                crop_width = min(crop_width, img_width)\n\n        y1 = max(0, int(round((img_height - crop_height) / 2.)))\n        x1 = max(0, int(round((img_width - crop_width) / 2.)))\n        y2 = min(img_height, y1 + crop_height) - 1\n        x2 = min(img_width, x1 + crop_width) - 1\n        bboxes = np.array([x1, y1, x2, y2])\n\n        # crop the image\n        self._crop_img(results, bboxes)\n        # crop the gt_seg_map\n        self._crop_seg_map(results, bboxes)\n        # crop the bounding box\n        self._crop_bboxes(results, bboxes)\n        # crop the keypoints\n        self._crop_keypoints(results, bboxes)\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(crop_size = {self.crop_size}'\n        repr_str += f', auto_pad={self.auto_pad}'\n        repr_str += f', pad_cfg={self.pad_cfg}'\n        repr_str += f',clip_object_border = {self.clip_object_border})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomGrayscale(BaseTransform):\n    \"\"\"Randomly convert image to grayscale with a probability.\n\n    Required Key:\n\n    - img\n\n    Modified Key:\n\n    - img\n\n    Added Keys:\n\n    - grayscale\n    - grayscale_weights\n\n    Args:\n        prob (float): Probability that image should be converted to\n            
grayscale. Defaults to 0.1.\n        keep_channels (bool): Whether keep channel number the same as\n            input. Defaults to False.\n        channel_weights (tuple): The grayscale weights of each channel,\n            and the weights will be normalized. For example, (1, 2, 1)\n            will be normalized as (0.25, 0.5, 0.25). Defaults to\n            (1., 1., 1.).\n        color_format (str): Color format set to be any of 'bgr',\n            'rgb', 'hsv'. Note: 'hsv' image will be transformed into 'bgr'\n            format no matter whether it is grayscaled. Defaults to 'bgr'.\n    \"\"\"\n\n    def __init__(self,\n                 prob: float = 0.1,\n                 keep_channels: bool = False,\n                 channel_weights: Sequence[float] = (1., 1., 1.),\n                 color_format: str = 'bgr') -> None:\n        super().__init__()\n        assert 0. <= prob <= 1., ('The range of ``prob`` value is [0., 1.],' +\n                                  f' but got {prob} instead')\n        self.prob = prob\n        self.keep_channels = keep_channels\n        self.channel_weights = channel_weights\n        assert color_format in ['bgr', 'rgb', 'hsv']\n        self.color_format = color_format\n\n    @cache_randomness\n    def _random_prob(self):\n        return random.random()\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Apply random grayscale on results.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n\n        Returns:\n           dict: Results with grayscale image.\n        \"\"\"\n        img = results['img']\n        # convert hsv to bgr\n        if self.color_format == 'hsv':\n            img = mmcv.hsv2bgr(img)\n        img = img[..., None] if img.ndim == 2 else img\n        num_output_channels = img.shape[2]\n        if self._random_prob() < self.prob:\n            if num_output_channels > 1:\n                assert num_output_channels == len(\n                    
self.channel_weights\n                ), 'The length of ``channel_weights`` are supposed to be '\n                f'num_output_channels, but got {len(self.channel_weights)}'\n                ' instead.'\n                normalized_weights = (\n                    np.array(self.channel_weights) / sum(self.channel_weights))\n                img = (normalized_weights * img).sum(axis=2)\n                img = img.astype('uint8')\n                if self.keep_channels:\n                    img = img[:, :, None]\n                    results['img'] = np.dstack(\n                        [img for _ in range(num_output_channels)])\n                else:\n                    results['img'] = img\n                return results\n        img = img.astype('uint8')\n        results['img'] = img\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(prob = {self.prob}'\n        repr_str += f', keep_channels = {self.keep_channels}'\n        repr_str += f', channel_weights = {self.channel_weights}'\n        repr_str += f', color_format = {self.color_format})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass MultiScaleFlipAug(BaseTransform):\n    \"\"\"Test-time augmentation with multiple scales and flipping.\n\n    An example configuration is as followed:\n\n    .. 
code-block::\n\n        dict(\n            type='MultiScaleFlipAug',\n            scales=[(1333, 400), (1333, 800)],\n            allow_flip=True,\n            transforms=[\n                dict(type='Normalize', **img_norm_cfg),\n                dict(type='Pad', size_divisor=1),\n                dict(type='ImageToTensor', keys=['img']),\n                dict(type='Collect', keys=['img'])\n            ])\n\n    ``results`` will be resized using all the sizes in ``scales``.\n    If ``allow_flip`` is True, then flipped results will also be added into\n    output list.\n\n    For the above configuration, there are four combinations of resize\n    and flip:\n\n    - Resize to (1333, 400) + no flip\n    - Resize to (1333, 400) + flip\n    - Resize to (1333, 800) + no flip\n    - Resize to (1333, 800) + flip\n\n    The four results are then transformed with ``transforms`` argument.\n    After that, results are wrapped into lists of the same length as below:\n\n    .. code-block::\n\n        dict(\n            inputs=[...],\n            data_samples=[...]\n        )\n\n    Where the length of ``inputs`` and ``data_samples`` are both 4.\n\n    Required Keys:\n\n    - Depending on the requirements of the ``transforms`` parameter.\n\n    Modified Keys:\n\n    - All output keys of each transform.\n\n    Args:\n        transforms (list[dict]): Transforms to be applied to each resized\n            and flipped data.\n        scales (tuple | list[tuple] | None): Images scales for resizing.\n        scale_factor (float or tuple[float]): Scale factors for resizing.\n            Defaults to None.\n        allow_flip (bool): Whether apply flip augmentation. Defaults to False.\n        flip_direction (str | list[str]): Flip augmentation directions,\n            options are \"horizontal\", \"vertical\" and \"diagonal\". If\n            flip_direction is a list, multiple flip augmentations will be\n            applied. It has no effect when allow_flip == False. 
Defaults to\n            \"horizontal\".\n        resize_cfg (dict): Base config for resizing. Defaults to\n            ``dict(type='Resize', keep_ratio=True)``.\n        flip_cfg (dict): Base config for flipping. Defaults to\n            ``dict(type='RandomFlip')``.\n    \"\"\"\n\n    def __init__(\n        self,\n        transforms: List[dict],\n        scales: Optional[Union[Tuple, List[Tuple]]] = None,\n        scale_factor: Optional[Union[float, List[float]]] = None,\n        allow_flip: bool = False,\n        flip_direction: Union[str, List[str]] = 'horizontal',\n        resize_cfg: dict = dict(type='Resize', keep_ratio=True),\n        flip_cfg: dict = dict(type='RandomFlip')\n    ) -> None:\n        super().__init__()\n        self.transforms = Compose(transforms)  # type: ignore\n\n        if scales is not None:\n            self.scales = scales if isinstance(scales, list) else [scales]\n            self.scale_key = 'scale'\n            assert mmengine.is_list_of(self.scales, tuple)\n        else:\n            # if ``scales`` and ``scale_factor`` both be ``None``\n            if scale_factor is None:\n                self.scales = [1.]  
# type: ignore\n            elif isinstance(scale_factor, list):\n                self.scales = scale_factor  # type: ignore\n            else:\n                self.scales = [scale_factor]  # type: ignore\n\n            self.scale_key = 'scale_factor'\n\n        self.allow_flip = allow_flip\n        self.flip_direction = flip_direction if isinstance(\n            flip_direction, list) else [flip_direction]\n        assert mmengine.is_list_of(self.flip_direction, str)\n        if not self.allow_flip and self.flip_direction != ['horizontal']:\n            warnings.warn(\n                'flip_direction has no effect when flip is set to False')\n        self.resize_cfg = resize_cfg.copy()\n        self.flip_cfg = flip_cfg\n\n    def transform(self, results: dict) -> Dict:\n        \"\"\"Apply test time augment transforms on results.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n\n        Returns:\n            dict: The augmented data, where each value is wrapped\n            into a list.\n        \"\"\"\n\n        data_samples = []\n        inputs = []\n        flip_args = [(False, '')]\n        if self.allow_flip:\n            flip_args += [(True, direction)\n                          for direction in self.flip_direction]\n        for scale in self.scales:\n            for flip, direction in flip_args:\n                _resize_cfg = self.resize_cfg.copy()\n                _resize_cfg.update({self.scale_key: scale})\n                _resize_flip = [_resize_cfg]\n\n                if flip:\n                    _flip_cfg = self.flip_cfg.copy()\n                    _flip_cfg.update(prob=1.0, direction=direction)\n                    _resize_flip.append(_flip_cfg)\n                else:\n                    results['flip'] = False\n                    results['flip_direction'] = None\n\n                resize_flip = Compose(_resize_flip)\n                _results = resize_flip(results.copy())\n                packed_results = 
self.transforms(_results)  # type: ignore\n\n                inputs.append(packed_results['inputs'])  # type: ignore\n                data_samples.append(\n                    packed_results['data_sample'])  # type: ignore\n        return dict(inputs=inputs, data_sample=data_samples)\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(transforms={self.transforms}'\n        repr_str += f', scales={self.scales}'\n        repr_str += f', allow_flip={self.allow_flip}'\n        repr_str += f', flip_direction={self.flip_direction})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass TestTimeAug(BaseTransform):\n    \"\"\"Test-time augmentation transform.\n\n    An example configuration is as followed:\n\n    .. code-block::\n\n        dict(type='TestTimeAug',\n             transforms=[\n                [dict(type='Resize', scale=(1333, 400), keep_ratio=True),\n                 dict(type='Resize', scale=(1333, 800), keep_ratio=True)],\n                [dict(type='RandomFlip', prob=1.),\n                 dict(type='RandomFlip', prob=0.)],\n                [dict(type='PackDetInputs',\n                      meta_keys=('img_id', 'img_path', 'ori_shape',\n                                 'img_shape', 'scale_factor', 'flip',\n                                 'flip_direction'))]])\n\n    ``results`` will be transformed using all transforms defined in\n    ``transforms`` arguments.\n\n    For the above configuration, there are four combinations of resize\n    and flip:\n\n    - Resize to (1333, 400) + no flip\n    - Resize to (1333, 400) + flip\n    - Resize to (1333, 800) + no flip\n    - resize to (1333, 800) + flip\n\n    After that, results are wrapped into lists of the same length as below:\n\n    .. 
code-block::\n\n        dict(\n            inputs=[...],\n            data_samples=[...]\n        )\n\n    The length of ``inputs`` and ``data_samples`` are both 4.\n\n    Required Keys:\n\n    - Depending on the requirements of the ``transforms`` parameter.\n\n    Modified Keys:\n\n    - All output keys of each transform.\n\n    Args:\n        transforms (list[list[dict]]): Transforms to be applied to data sampled\n            from dataset. ``transforms`` is a list of list, and each list\n            element usually represents a series of transforms with the same\n            type and different arguments. Data will be processed by each list\n            elements sequentially. See more information in :meth:`transform`.\n    \"\"\"\n\n    def __init__(self, transforms: list):\n        for i, transform_list in enumerate(transforms):\n            for j, transform in enumerate(transform_list):\n                if isinstance(transform, dict):\n                    transform_list[j] = TRANSFORMS.build(transform)\n                elif callable(transform):\n                    continue\n                else:\n                    raise TypeError(\n                        'transform must be callable or a dict, but got'\n                        f' {type(transform)}')\n            transforms[i] = transform_list\n\n        self.subroutines = [\n            Compose(subroutine) for subroutine in product(*transforms)\n        ]\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Apply all transforms defined in :attr:`transforms` to the results.\n\n        As the example given in :obj:`TestTimeAug`, ``transforms`` consists of\n        2 ``Resize``, 2 ``RandomFlip`` and 1 ``PackDetInputs``.\n        The data sampled from dataset will be processed as follows:\n\n        1. Data will be processed by 2 ``Resize`` and return a list\n           of 2 results.\n        2. 
Each result in list will be further passed to 2\n           ``RandomFlip``, and aggregates into a list of 4 results.\n        3. Each result will be processed by ``PackDetInputs``, and\n           return a list of dict.\n        4. Aggregates the same fields of results, and finally returns\n           a dict. Each value of the dict represents 4 transformed\n           results.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n\n        Returns:\n            dict: The augmented data, where each value is wrapped\n            into a list.\n        \"\"\"\n        results_list = []  # type: ignore\n        for subroutine in self.subroutines:\n            result = subroutine(copy.deepcopy(results))\n            assert isinstance(result, dict), (\n                f'Data processed by {subroutine} must return a dict, but got '\n                f'{result}')\n            assert result is not None, (\n                f'Data processed by {subroutine} in `TestTimeAug` should not '\n                'be None! Please check your validation dataset and the '\n                f'transforms in {subroutine}')\n            results_list.append(result)\n\n        aug_data_dict = {\n            key: [item[key] for item in results_list]  # type: ignore\n            for key in results_list[0]  # type: ignore\n        }\n        return aug_data_dict\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += 'transforms=\\n'\n        for subroutine in self.subroutines:\n            repr_str += f'{repr(subroutine)}\\n'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomChoiceResize(BaseTransform):\n    \"\"\"Resize images & bbox & mask from a list of multiple scales.\n\n    This transform resizes the input image to some scale. Bboxes and masks are\n    then resized with the same scale factor. 
Resize scale will be randomly\n    selected from ``scales``.\n\n    How to choose the target scale to resize the image will follow the rules\n    below:\n\n    - if `scale` is a list of tuple, the target scale is sampled from the list\n      uniformly.\n    - if `scale` is a tuple, the target scale will be set to the tuple.\n\n    Required Keys:\n\n    - img\n    - gt_bboxes (optional)\n    - gt_seg_map (optional)\n    - gt_keypoints (optional)\n\n    Modified Keys:\n\n    - img\n    - img_shape\n    - gt_bboxes (optional)\n    - gt_seg_map (optional)\n    - gt_keypoints (optional)\n\n    Added Keys:\n\n    - scale\n    - scale_factor\n    - scale_idx\n    - keep_ratio\n\n\n    Args:\n        scales (Union[list, Tuple]): Images scales for resizing.\n        resize_type (str): The type of resize class to use. Defaults to\n            \"Resize\".\n        **resize_kwargs: Other keyword arguments for the ``resize_type``.\n\n    Note:\n        By default, the ``resize_type`` is \"Resize\", if it's not overwritten\n        by your registry, it indicates the :class:`mmcv.Resize`. 
And therefore,\n        ``resize_kwargs`` accepts any keyword arguments of it, like\n        ``keep_ratio``, ``interpolation`` and so on.\n\n        If you want to use your custom resize class, the class should accept\n        ``scale`` argument and have ``scale`` attribution which determines the\n        resize shape.\n    \"\"\"\n\n    def __init__(\n        self,\n        scales: Sequence[Union[int, Tuple]],\n        resize_type: str = 'Resize',\n        **resize_kwargs,\n    ) -> None:\n        super().__init__()\n        if isinstance(scales, list):\n            self.scales = scales\n        else:\n            self.scales = [scales]\n        assert mmengine.is_seq_of(self.scales, (tuple, int))\n\n        self.resize_cfg = dict(type=resize_type, **resize_kwargs)\n        # create a empty Resize object\n        self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg})\n\n    @cache_randomness\n    def _random_select(self) -> Tuple[int, int]:\n        \"\"\"Randomly select an scale from given candidates.\n\n        Returns:\n            (tuple, int): Returns a tuple ``(scale, scale_dix)``,\n            where ``scale`` is the selected image scale and\n            ``scale_idx`` is the selected index in the given candidates.\n        \"\"\"\n\n        scale_idx = np.random.randint(len(self.scales))\n        scale = self.scales[scale_idx]\n        return scale, scale_idx\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Apply resize transforms on results from a list of scales.\n\n        Args:\n            results (dict): Result dict contains the data to transform.\n\n        Returns:\n            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',\n            'gt_keypoints', 'scale', 'scale_factor', 'img_shape',\n            and 'keep_ratio' keys are updated in result dict.\n        \"\"\"\n\n        target_scale, scale_idx = self._random_select()\n        self.resize.scale = target_scale\n        results = self.resize(results)\n        
results['scale_idx'] = scale_idx\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(scales={self.scales}'\n        repr_str += f', resize_cfg={self.resize_cfg})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomFlip(BaseTransform):\n    \"\"\"Flip the image & bbox & keypoints & segmentation map. Added or Updated\n    keys: flip, flip_direction, img, gt_bboxes, gt_seg_map, and\n    gt_keypoints. There are 3 flip modes:\n\n    - ``prob`` is float, ``direction`` is string: the image will be\n      ``direction``ly flipped with probability of ``prob`` .\n      E.g., ``prob=0.5``, ``direction='horizontal'``,\n      then image will be horizontally flipped with probability of 0.5.\n\n    - ``prob`` is float, ``direction`` is list of string: the image will\n      be ``direction[i]``ly flipped with probability of\n      ``prob/len(direction)``.\n      E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``,\n      then image will be horizontally flipped with probability of 0.25,\n      vertically with probability of 0.25.\n\n    - ``prob`` is list of float, ``direction`` is list of string:\n      given ``len(prob) == len(direction)``, the image will\n      be ``direction[i]``ly flipped with probability of ``prob[i]``.\n      E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal',\n      'vertical']``, then image will be horizontally flipped with\n      probability of 0.3, vertically with probability of 0.5.\n\n    Required Keys:\n\n    - img\n    - gt_bboxes (optional)\n    - gt_seg_map (optional)\n    - gt_keypoints (optional)\n\n    Modified Keys:\n\n    - img\n    - gt_bboxes (optional)\n    - gt_seg_map (optional)\n    - gt_keypoints (optional)\n\n    Added Keys:\n\n    - flip\n    - flip_direction\n    - swap_seg_labels (optional)\n\n    Args:\n        prob (float | list[float], optional): The flipping probability.\n            Defaults to None.\n        direction(str | 
list[str]): The flipping direction. Options\n            If input is a list, the length must equal ``prob``. Each\n            element in ``prob`` indicates the flip probability of\n            corresponding direction. Defaults to 'horizontal'.\n        swap_seg_labels (list, optional): The label pair need to be swapped\n            for ground truth, like 'left arm' and 'right arm' need to be\n            swapped after horizontal flipping. For example, ``[(1, 5)]``,\n            where 1/5 is the label of the left/right arm. Defaults to None.\n    \"\"\"\n\n    def __init__(self,\n                 prob: Optional[Union[float, Iterable[float]]] = None,\n                 direction: Union[str, Sequence[Optional[str]]] = 'horizontal',\n                 swap_seg_labels: Optional[Sequence] = None) -> None:\n        if isinstance(prob, list):\n            assert mmengine.is_list_of(prob, float)\n            assert 0 <= sum(prob) <= 1\n        elif isinstance(prob, float):\n            assert 0 <= prob <= 1\n        else:\n            raise ValueError(f'probs must be float or list of float, but \\\n                              got `{type(prob)}`.')\n        self.prob = prob\n        self.swap_seg_labels = swap_seg_labels\n\n        valid_directions = ['horizontal', 'vertical', 'diagonal']\n        if isinstance(direction, str):\n            assert direction in valid_directions\n        elif isinstance(direction, list):\n            assert mmengine.is_list_of(direction, str)\n            assert set(direction).issubset(set(valid_directions))\n        else:\n            raise ValueError(f'direction must be either str or list of str, \\\n                               but got `{type(direction)}`.')\n        self.direction = direction\n\n        if isinstance(prob, list):\n            assert len(prob) == len(self.direction)\n\n    def _flip_bbox(self, bboxes: np.ndarray, img_shape: Tuple[int, int],\n                   direction: str) -> np.ndarray:\n        \"\"\"Flip bboxes 
horizontally.\n\n        Args:\n            bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)\n            img_shape (tuple[int]): Image shape (height, width)\n            direction (str): Flip direction. Options are 'horizontal',\n                'vertical', and 'diagonal'.\n\n        Returns:\n            numpy.ndarray: Flipped bounding boxes.\n        \"\"\"\n        assert bboxes.shape[-1] % 4 == 0\n        flipped = bboxes.copy()\n        h, w = img_shape\n        if direction == 'horizontal':\n            flipped[..., 0::4] = w - bboxes[..., 2::4]\n            flipped[..., 2::4] = w - bboxes[..., 0::4]\n        elif direction == 'vertical':\n            flipped[..., 1::4] = h - bboxes[..., 3::4]\n            flipped[..., 3::4] = h - bboxes[..., 1::4]\n        elif direction == 'diagonal':\n            flipped[..., 0::4] = w - bboxes[..., 2::4]\n            flipped[..., 1::4] = h - bboxes[..., 3::4]\n            flipped[..., 2::4] = w - bboxes[..., 0::4]\n            flipped[..., 3::4] = h - bboxes[..., 1::4]\n        else:\n            raise ValueError(\n                f\"Flipping direction must be 'horizontal', 'vertical', \\\n                  or 'diagonal', but got '{direction}'\")\n        return flipped\n\n    def _flip_keypoints(\n        self,\n        keypoints: np.ndarray,\n        img_shape: Tuple[int, int],\n        direction: str,\n    ) -> np.ndarray:\n        \"\"\"Flip keypoints horizontally, vertically or diagonally.\n\n        Args:\n            keypoints (numpy.ndarray): Keypoints, shape (..., 2)\n            img_shape (tuple[int]): Image shape (height, width)\n            direction (str): Flip direction. 
Options are 'horizontal',\n                'vertical', and 'diagonal'.\n\n        Returns:\n            numpy.ndarray: Flipped keypoints.\n        \"\"\"\n\n        meta_info = keypoints[..., 2:]\n        keypoints = keypoints[..., :2]\n        flipped = keypoints.copy()\n        h, w = img_shape\n        if direction == 'horizontal':\n            flipped[..., 0::2] = w - keypoints[..., 0::2]\n        elif direction == 'vertical':\n            flipped[..., 1::2] = h - keypoints[..., 1::2]\n        elif direction == 'diagonal':\n            flipped[..., 0::2] = w - keypoints[..., 0::2]\n            flipped[..., 1::2] = h - keypoints[..., 1::2]\n        else:\n            raise ValueError(\n                f\"Flipping direction must be 'horizontal', 'vertical', \\\n                  or 'diagonal', but got '{direction}'\")\n        flipped = np.concatenate([flipped, meta_info], axis=-1)\n        return flipped\n\n    def _flip_seg_map(self, seg_map: dict, direction: str) -> np.ndarray:\n        \"\"\"Flip segmentation map horizontally, vertically or diagonally.\n\n        Args:\n            seg_map (numpy.ndarray): segmentation map, shape (H, W).\n            direction (str): Flip direction. 
Options are 'horizontal',\n                'vertical'.\n\n        Returns:\n            numpy.ndarray: Flipped segmentation map.\n        \"\"\"\n        seg_map = mmcv.imflip(seg_map, direction=direction)\n        if self.swap_seg_labels is not None:\n            # to handle datasets with left/right annotations\n            # like 'Left-arm' and 'Right-arm' in LIP dataset\n            # Modified from https://github.com/openseg-group/openseg.pytorch/blob/master/lib/datasets/tools/cv2_aug_transforms.py # noqa:E501\n            # Licensed under MIT license\n            temp = seg_map.copy()\n            assert isinstance(self.swap_seg_labels, (tuple, list))\n            for pair in self.swap_seg_labels:\n                assert isinstance(pair, (tuple, list)) and len(pair) == 2, \\\n                    'swap_seg_labels must be a sequence with pair, but got ' \\\n                    f'{self.swap_seg_labels}.'\n                seg_map[temp == pair[0]] = pair[1]\n                seg_map[temp == pair[1]] = pair[0]\n        return seg_map\n\n    @cache_randomness\n    def _choose_direction(self) -> str:\n        \"\"\"Choose the flip direction according to `prob` and `direction`\"\"\"\n        if isinstance(self.direction,\n                      Sequence) and not isinstance(self.direction, str):\n            # None means non-flip\n            direction_list: list = list(self.direction) + [None]\n        elif isinstance(self.direction, str):\n            # None means non-flip\n            direction_list = [self.direction, None]\n\n        if isinstance(self.prob, list):\n            non_prob: float = 1 - sum(self.prob)\n            prob_list = self.prob + [non_prob]\n        elif isinstance(self.prob, float):\n            non_prob = 1. 
- self.prob\n            # exclude non-flip\n            single_ratio = self.prob / (len(direction_list) - 1)\n            prob_list = [single_ratio] * (len(direction_list) - 1) + [non_prob]\n\n        cur_dir = np.random.choice(direction_list, p=prob_list)\n\n        return cur_dir\n\n    def _flip(self, results: dict) -> None:\n        \"\"\"Flip images, bounding boxes, semantic segmentation map and\n        keypoints.\"\"\"\n        # flip image\n        results['img'] = mmcv.imflip(\n            results['img'], direction=results['flip_direction'])\n\n        img_shape = results['img'].shape[:2]\n\n        # flip bboxes\n        if results.get('gt_bboxes', None) is not None:\n            results['gt_bboxes'] = self._flip_bbox(results['gt_bboxes'],\n                                                   img_shape,\n                                                   results['flip_direction'])\n\n        # flip keypoints\n        if results.get('gt_keypoints', None) is not None:\n            results['gt_keypoints'] = self._flip_keypoints(\n                results['gt_keypoints'], img_shape, results['flip_direction'])\n\n        # flip seg map\n        if results.get('gt_seg_map', None) is not None:\n            results['gt_seg_map'] = self._flip_seg_map(\n                results['gt_seg_map'], direction=results['flip_direction'])\n            results['swap_seg_labels'] = self.swap_seg_labels\n\n    def _flip_on_direction(self, results: dict) -> None:\n        \"\"\"Function to flip images, bounding boxes, semantic segmentation map\n        and keypoints.\"\"\"\n        cur_dir = self._choose_direction()\n        if cur_dir is None:\n            results['flip'] = False\n            results['flip_direction'] = None\n        else:\n            results['flip'] = True\n            results['flip_direction'] = cur_dir\n            self._flip(results)\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Transform function to flip images, bounding boxes, semantic\n 
       segmentation map and keypoints.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n\n        Returns:\n            dict: Flipped results, 'img', 'gt_bboxes', 'gt_seg_map',\n            'gt_keypoints', 'flip', and 'flip_direction' keys are\n            updated in result dict.\n        \"\"\"\n        self._flip_on_direction(results)\n\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(prob={self.prob}, '\n        repr_str += f'direction={self.direction})'\n\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomResize(BaseTransform):\n    \"\"\"Random resize images & bbox & keypoints.\n\n    How to choose the target scale to resize the image will follow the rules\n    below:\n\n    - if ``scale`` is a sequence of tuple\n\n    .. math::\n        target\\\\_scale[0] \\\\sim Uniform([scale[0][0], scale[1][0]])\n    .. math::\n        target\\\\_scale[1] \\\\sim Uniform([scale[0][1], scale[1][1]])\n\n    Following the resize order of width and height in cv2, ``scale[i][0]``\n    is for width, and ``scale[i][1]`` is for height.\n\n    - if ``scale`` is a tuple\n\n    .. math::\n        target\\\\_scale[0] \\\\sim Uniform([ratio\\\\_range[0], ratio\\\\_range[1]])\n            * scale[0]\n    .. 
math::\n        target\\\\_scale[1] \\\\sim Uniform([ratio\\\\_range[0], ratio\\\\_range[1]])\n            * scale[1]\n\n    Following the resize order of width and height in cv2, ``scale[0]``\n    is for width, and ``scale[1]`` is for height.\n\n    - if ``keep_ratio`` is True, the minimum value of ``target_scale`` will be\n      used to set the shorter side and the maximum value will be used to\n      set the longer side.\n\n    - if ``keep_ratio`` is False, the value of ``target_scale`` will be used to\n      resize the width and height accordingly.\n\n    Required Keys:\n\n    - img\n    - gt_bboxes\n    - gt_seg_map\n    - gt_keypoints\n\n    Modified Keys:\n\n    - img\n    - gt_bboxes\n    - gt_seg_map\n    - gt_keypoints\n    - img_shape\n\n    Added Keys:\n\n    - scale\n    - scale_factor\n    - keep_ratio\n\n    Args:\n        scale (tuple or Sequence[tuple]): Images scales for resizing.\n            Defaults to None.\n        ratio_range (tuple[float], optional): (min_ratio, max_ratio).\n            Defaults to None.\n        resize_type (str): The type of resize class to use. Defaults to\n            \"Resize\".\n        **resize_kwargs: Other keyword arguments for the ``resize_type``.\n\n    Note:\n        By default, the ``resize_type`` is \"Resize\", if it's not overwritten\n        by your registry, it indicates the :class:`mmcv.Resize`. 
And therefore,\n        ``resize_kwargs`` accepts any keyword arguments of it, like\n        ``keep_ratio``, ``interpolation`` and so on.\n\n        If you want to use your custom resize class, the class should accept\n        ``scale`` argument and have ``scale`` attribution which determines the\n        resize shape.\n    \"\"\"\n\n    def __init__(\n        self,\n        scale: Union[Tuple[int, int], Sequence[Tuple[int, int]]],\n        ratio_range: Optional[Tuple[float, float]] = None,\n        resize_type: str = 'Resize',\n        **resize_kwargs,\n    ) -> None:\n\n        self.scale = scale\n        self.ratio_range = ratio_range\n\n        self.resize_cfg = dict(type=resize_type, **resize_kwargs)\n        # create a empty Reisize object\n        self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg})\n\n    @staticmethod\n    def _random_sample(scales: Sequence[Tuple[int, int]]) -> tuple:\n        \"\"\"Private function to randomly sample a scale from a list of tuples.\n\n        Args:\n            scales (list[tuple]): Images scale range for sampling.\n                There must be two tuples in scales, which specify the lower\n                and upper bound of image scales.\n\n        Returns:\n            tuple: The targeted scale of the image to be resized.\n        \"\"\"\n\n        assert mmengine.is_list_of(scales, tuple) and len(scales) == 2\n        scale_0 = [scales[0][0], scales[1][0]]\n        scale_1 = [scales[0][1], scales[1][1]]\n        edge_0 = np.random.randint(min(scale_0), max(scale_0) + 1)\n        edge_1 = np.random.randint(min(scale_1), max(scale_1) + 1)\n        scale = (edge_0, edge_1)\n        return scale\n\n    @staticmethod\n    def _random_sample_ratio(scale: tuple, ratio_range: Tuple[float,\n                                                              float]) -> tuple:\n        \"\"\"Private function to randomly sample a scale from a tuple.\n\n        A ratio will be randomly sampled from the range specified by\n    
    ``ratio_range``. Then it would be multiplied with ``scale`` to\n        generate sampled scale.\n\n        Args:\n            scale (tuple): Images scale base to multiply with ratio.\n            ratio_range (tuple[float]): The minimum and maximum ratio to scale\n                the ``scale``.\n\n        Returns:\n            tuple: The targeted scale of the image to be resized.\n        \"\"\"\n\n        assert isinstance(scale, tuple) and len(scale) == 2\n        min_ratio, max_ratio = ratio_range\n        assert min_ratio <= max_ratio\n        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio\n        scale = int(scale[0] * ratio), int(scale[1] * ratio)\n        return scale\n\n    @cache_randomness\n    def _random_scale(self) -> tuple:\n        \"\"\"Private function to randomly sample an scale according to the type\n        of ``scale``.\n\n        Returns:\n            tuple: The targeted scale of the image to be resized.\n        \"\"\"\n\n        if mmengine.is_tuple_of(self.scale, int):\n            assert self.ratio_range is not None and len(self.ratio_range) == 2\n            scale = self._random_sample_ratio(\n                self.scale,  # type: ignore\n                self.ratio_range)\n        elif mmengine.is_seq_of(self.scale, tuple):\n            scale = self._random_sample(self.scale)  # type: ignore\n        else:\n            raise NotImplementedError('Do not support sampling function '\n                                      f'for \"{self.scale}\"')\n\n        return scale\n\n    def transform(self, results: dict) -> dict:\n        \"\"\"Transform function to resize images, bounding boxes, semantic\n        segmentation map.\n\n        Args:\n            results (dict): Result dict from loading pipeline.\n\n        Returns:\n            dict: Resized results, ``img``, ``gt_bboxes``, ``gt_semantic_seg``,\n            ``gt_keypoints``, ``scale``, ``scale_factor``, ``img_shape``, and\n            ``keep_ratio`` keys are 
updated in result dict.\n        \"\"\"\n        results['scale'] = self._random_scale()\n        self.resize.scale = results['scale']\n        results = self.resize(results)\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(scale={self.scale}, '\n        repr_str += f'ratio_range={self.ratio_range}, '\n        repr_str += f'resize_cfg={self.resize_cfg})'\n        return repr_str\n"
  },
  {
    "path": "mmcv/transforms/utils.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\nimport copy\nimport functools\nimport inspect\nimport weakref\nfrom collections import defaultdict\nfrom collections.abc import Iterable\nfrom contextlib import contextmanager\nfrom typing import Callable, Union\n\nfrom .base import BaseTransform\n\n\nclass cache_randomness:\n    \"\"\"Decorator that marks the method with random return value(s) in a\n    transform class.\n\n    This decorator is usually used together with the context-manager\n    :func`:cache_random_params`. In this context, a decorated method will\n    cache its return value(s) at the first time of being invoked, and always\n    return the cached values when being invoked again.\n\n    .. note::\n        Only an instance method can be decorated with ``cache_randomness``.\n    \"\"\"\n\n    def __init__(self, func):\n\n        # Check `func` is to be bound as an instance method\n        if not inspect.isfunction(func):\n            raise TypeError('Unsupport callable to decorate with'\n                            '@cache_randomness.')\n        func_args = inspect.getfullargspec(func).args\n        if len(func_args) == 0 or func_args[0] != 'self':\n            raise TypeError(\n                '@cache_randomness should only be used to decorate '\n                'instance methods (the first argument is ``self``).')\n\n        functools.update_wrapper(self, func)\n        self.func = func\n        self.instance_ref = None\n\n    def __set_name__(self, owner, name):\n        # Maintain a record of decorated methods in the class\n        if not hasattr(owner, '_methods_with_randomness'):\n            setattr(owner, '_methods_with_randomness', [])\n\n        # Here `name` equals to `self.__name__`, i.e., the name of the\n        # decorated function, due to the invocation of `update_wrapper` in\n        # `self.__init__()`\n        owner._methods_with_randomness.append(name)\n\n    def __call__(self, *args, **kwargs):\n        # Get the 
transform instance whose method is decorated\n        # by cache_randomness\n        instance = self.instance_ref()\n        name = self.__name__\n\n        # Check the flag ``self._cache_enabled``, which should be\n        # set by the contextmanagers like ``cache_random_params``\n        cache_enabled = getattr(instance, '_cache_enabled', False)\n\n        if cache_enabled:\n            # Initialize the cache of the transform instances. The flag\n            # ``_cache_enabled`` is set by contextmanagers like\n            # ``cache_random_params``.\n            if not hasattr(instance, '_cache'):\n                setattr(instance, '_cache', {})\n\n            if name not in instance._cache:\n                instance._cache[name] = self.func(instance, *args, **kwargs)\n            # Return the cached value\n            return instance._cache[name]\n        else:\n            # Clear cache\n            if hasattr(instance, '_cache'):\n                del instance._cache\n            # Return function output\n            return self.func(instance, *args, **kwargs)\n\n    def __get__(self, obj, cls):\n        self.instance_ref = weakref.ref(obj)\n        # Return a copy to avoid multiple transform instances sharing\n        # one `cache_randomness` instance, which may cause data races\n        # in multithreading cases.\n        return copy.copy(self)\n\n\ndef avoid_cache_randomness(cls):\n    \"\"\"Decorator that marks a data transform class (subclass of\n    :class:`BaseTransform`) prohibited from caching randomness. With this\n    decorator, errors will be raised in the following cases:\n\n        1. A method is defined in the class with the decorator\n    `cache_randomness`;\n        2. 
An instance of the class is invoked with the context\n    `cache_random_params`.\n\n    A typical usage of `avoid_cache_randomness` is to decorate the data\n    transforms with non-cacheable random behaviors (e.g., the random behavior\n    can not be defined in a method, thus can not be decorated with\n    `cache_randomness`). This is for preventing unintentional use of such data\n    transforms within the context of caching randomness, which may lead to\n    unexpected results.\n    \"\"\"\n\n    # Check that cls is a data transform class\n    assert issubclass(cls, BaseTransform)\n\n    # Check that no method is decorated with `cache_randomness` in cls\n    if getattr(cls, '_methods_with_randomness', None):\n        raise RuntimeError(\n            f'Class {cls.__name__} decorated with '\n            '``avoid_cache_randomness`` should not have methods decorated '\n            'with ``cache_randomness`` (invalid methods: '\n            f'{cls._methods_with_randomness})')\n\n    class AvoidCacheRandomness:\n\n        def __get__(self, obj, objtype=None):\n            # Here we check the value in `objtype.__dict__` instead of\n            # directly checking the attribute\n            # `objtype._avoid_cache_randomness`. So if the base class is\n            # decorated with :func:`avoid_cache_randomness`, it will not be\n            # inherited by subclasses.\n            return objtype.__dict__.get('_avoid_cache_randomness', False)\n\n    cls.avoid_cache_randomness = AvoidCacheRandomness()\n    cls._avoid_cache_randomness = True\n\n    return cls\n\n\n@contextmanager\ndef cache_random_params(transforms: Union[BaseTransform, Iterable]):\n    \"\"\"Context-manager that enables the cache of return values of methods\n    decorated with ``cache_randomness`` in transforms.\n\n    In this mode, decorated methods will cache their return values on the\n    first invoking, and always return the cached value afterward. 
This allow\n    to apply random transforms in a deterministic way. For example, apply same\n    transforms on multiple examples. See ``cache_randomness`` for more\n    information.\n\n    Args:\n        transforms (BaseTransform|list[BaseTransform]): The transforms to\n            enable cache.\n    \"\"\"\n\n    # key2method stores the original methods that are replaced by the wrapped\n    # ones. These methods will be restituted when exiting the context.\n    key2method = dict()\n\n    # key2counter stores the usage number of each cache_randomness. This is\n    # used to check that any cache_randomness is invoked once during processing\n    # on data sample.\n    key2counter: dict = defaultdict(int)\n\n    def _add_invoke_counter(obj, method_name):\n        method = getattr(obj, method_name)\n        key = f'{id(obj)}.{method_name}'\n        key2method[key] = method\n\n        @functools.wraps(method)\n        def wrapped(*args, **kwargs):\n            key2counter[key] += 1\n            return method(*args, **kwargs)\n\n        return wrapped\n\n    def _add_invoke_checker(obj, method_name):\n        # check that the method in _methods_with_randomness has been\n        # invoked at most once\n        method = getattr(obj, method_name)\n        key = f'{id(obj)}.{method_name}'\n        key2method[key] = method\n\n        @functools.wraps(method)\n        def wrapped(*args, **kwargs):\n            # clear counter\n            for name in obj._methods_with_randomness:\n                key = f'{id(obj)}.{name}'\n                key2counter[key] = 0\n\n            output = method(*args, **kwargs)\n\n            for name in obj._methods_with_randomness:\n                key = f'{id(obj)}.{name}'\n                if key2counter[key] > 1:\n                    raise RuntimeError(\n                        'The method decorated with ``cache_randomness`` '\n                        'should be invoked at most once during processing '\n                        f'one data sample. 
The method {name} of {obj} has '\n                        f'been invoked {key2counter[key]} times.')\n            return output\n\n        return wrapped\n\n    def _start_cache(t: BaseTransform):\n        # Check if cache is allowed for `t`\n        if getattr(t, 'avoid_cache_randomness', False):\n            raise RuntimeError(\n                f'Class {t.__class__.__name__} decorated with '\n                '``avoid_cache_randomness`` is not allowed to be used with'\n                ' ``cache_random_params`` (e.g. wrapped by '\n                '``ApplyToMultiple`` with ``share_random_params==True``).')\n\n        # Skip transforms w/o random method\n        if not hasattr(t, '_methods_with_randomness'):\n            return\n\n        # Set cache enabled flag\n        setattr(t, '_cache_enabled', True)\n\n        # Store the original method and init the counter\n        if hasattr(t, '_methods_with_randomness'):\n            setattr(t, 'transform', _add_invoke_checker(t, 'transform'))\n            for name in getattr(t, '_methods_with_randomness'):\n                setattr(t, name, _add_invoke_counter(t, name))\n\n    def _end_cache(t: BaseTransform):\n        # Skip transforms w/o random method\n        if not hasattr(t, '_methods_with_randomness'):\n            return\n\n        # Remove cache enabled flag\n        delattr(t, '_cache_enabled')\n        if hasattr(t, '_cache'):\n            delattr(t, '_cache')\n\n        # Restore the original method\n        if hasattr(t, '_methods_with_randomness'):\n            for name in getattr(t, '_methods_with_randomness'):\n                key = f'{id(t)}.{name}'\n                setattr(t, name, key2method[key])\n\n            key_transform = f'{id(t)}.transform'\n            setattr(t, 'transform', key2method[key_transform])\n\n    def _apply(t: Union[BaseTransform, Iterable],\n               func: Callable[[BaseTransform], None]):\n        if isinstance(t, BaseTransform):\n            func(t)\n        if 
isinstance(t, Iterable):\n            for _t in t:\n                _apply(_t, func)\n\n    try:\n        _apply(transforms, _start_cache)\n        yield\n    finally:\n        _apply(transforms, _end_cache)\n"
  },
  {
    "path": "mmcv/transforms/wrappers.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\nfrom typing import Any, Callable, Dict, List, Optional, Sequence, Union\n\nimport mmengine\nimport numpy as np\n\nfrom .base import BaseTransform\nfrom .builder import TRANSFORMS\nfrom .utils import cache_random_params, cache_randomness\n\n# Define type of transform or transform config\nTransform = Union[Dict, Callable[[Dict], Dict]]\n\n# Indicator of keys marked by KeyMapper._map_input, which means ignoring the\n# marked keys in KeyMapper._apply_transform so they will be invisible to\n# wrapped transforms.\n# This can be 2 possible case:\n# 1. The key is required but missing in results\n# 2. The key is manually set as ... (Ellipsis) in ``mapping``, which means\n# the original value in results should be ignored\nIgnoreKey = object()\n\n# Import nullcontext if python>=3.7, otherwise use a simple alternative\n# implementation.\ntry:\n    from contextlib import nullcontext  # type: ignore\nexcept ImportError:\n    from contextlib import contextmanager\n\n    @contextmanager  # type: ignore\n    def nullcontext(resource=None):\n        try:\n            yield resource\n        finally:\n            pass\n\n\n@TRANSFORMS.register_module()\nclass Compose(BaseTransform):\n    \"\"\"Compose multiple transforms sequentially.\n\n    Args:\n        transforms (list[dict | callable]): Sequence of transform object or\n            config dict to be composed.\n\n    Examples:\n        >>> pipeline = [\n        >>>     dict(type='Compose',\n        >>>         transforms=[\n        >>>             dict(type='LoadImageFromFile'),\n        >>>             dict(type='Normalize')\n        >>>         ]\n        >>>     )\n        >>> ]\n    \"\"\"\n\n    def __init__(self, transforms: Union[Transform, Sequence[Transform]]):\n        super().__init__()\n\n        if not isinstance(transforms, Sequence):\n            transforms = [transforms]\n        self.transforms: List = []\n        for transform in transforms:\n     
       if isinstance(transform, dict):\n                transform = TRANSFORMS.build(transform)\n                self.transforms.append(transform)\n            elif callable(transform):\n                self.transforms.append(transform)\n            else:\n                raise TypeError('transform must be callable or a dict, but got'\n                                f' {type(transform)}')\n\n    def __iter__(self):\n        \"\"\"Allow easy iteration over the transform sequence.\"\"\"\n        return iter(self.transforms)\n\n    def transform(self, results: Dict) -> Optional[Dict]:\n        \"\"\"Call function to apply transforms sequentially.\n\n        Args:\n            results (dict): A result dict contains the results to transform.\n\n        Returns:\n            dict or None: Transformed results.\n        \"\"\"\n        for t in self.transforms:\n            results = t(results)  # type: ignore\n            if results is None:\n                return None\n        return results\n\n    def __repr__(self):\n        \"\"\"Compute the string representation.\"\"\"\n        format_string = self.__class__.__name__ + '('\n        for t in self.transforms:\n            format_string += f'\\n    {t}'\n        format_string += '\\n)'\n        return format_string\n\n\n@TRANSFORMS.register_module()\nclass KeyMapper(BaseTransform):\n    \"\"\"A transform wrapper to map and reorganize the input/output of the\n    wrapped transforms (or sub-pipeline).\n\n    Args:\n        transforms (list[dict | callable], optional): Sequence of transform\n            object or config dict to be wrapped.\n        mapping (dict): A dict that defines the input key mapping.\n            The keys corresponds to the inner key (i.e., kwargs of the\n            ``transform`` method), and should be string type. 
The values\n            corresponds to the outer keys (i.e., the keys of the\n            data/results), and should have a type of string, list or dict.\n            None means not applying input mapping. Default: None.\n        remapping (dict): A dict that defines the output key mapping.\n            The keys and values have the same meanings and rules as in the\n            ``mapping``. Default: None.\n        auto_remap (bool, optional): If True, an inverse of the mapping will\n            be used as the remapping. If auto_remap is not given, it will be\n            automatically set True if 'remapping' is not given, and vice\n            versa. Default: None.\n        allow_nonexist_keys (bool): If False, the outer keys in the mapping\n            must exist in the input data, or an exception will be raised.\n            Default: False.\n\n    Examples:\n        >>> # Example 1: KeyMapper 'gt_img' to 'img'\n        >>> pipeline = [\n        >>>     # Use KeyMapper to convert outer (original) field name\n        >>>     # 'gt_img' to inner (used by inner transforms) field name\n        >>>     # 'img'\n        >>>     dict(type='KeyMapper',\n        >>>         mapping={'img': 'gt_img'},\n        >>>         # auto_remap=True means output key mapping is the revert of\n        >>>         # the input key mapping, e.g. 
inner 'img' will be mapped\n        >>>         # back to outer 'gt_img'\n        >>>         auto_remap=True,\n        >>>         transforms=[\n        >>>             # In all transforms' implementation just use 'img'\n        >>>             # as a standard field name\n        >>>             dict(type='Crop', crop_size=(384, 384)),\n        >>>             dict(type='Normalize'),\n        >>>         ])\n        >>> ]\n\n        >>> # Example 2: Collect and structure multiple items\n        >>> pipeline = [\n        >>>     # The inner field 'imgs' will be a dict with keys 'img_src'\n        >>>     # and 'img_tar', whose values are outer fields 'img1' and\n        >>>     # 'img2' respectively.\n        >>>     dict(type='KeyMapper',\n        >>>         dict(\n        >>>             type='KeyMapper',\n        >>>             mapping=dict(\n        >>>                 imgs=dict(\n        >>>                     img_src='img1',\n        >>>                     img_tar='img2')),\n        >>>         transforms=...)\n        >>> ]\n\n        >>> # Example 3: Manually set ignored keys by \"...\"\n        >>> pipeline = [\n        >>>     ...\n        >>>     dict(type='KeyMapper',\n        >>>         mapping={\n        >>>             # map outer key \"gt_img\" to inner key \"img\"\n        >>>             'img': 'gt_img',\n        >>>             # ignore outer key \"mask\"\n        >>>             'mask': ...,\n        >>>         },\n        >>>         transforms=[\n        >>>             dict(type='RandomFlip'),\n        >>>         ])\n        >>>     ...\n        >>> ]\n    \"\"\"\n\n    def __init__(self,\n                 transforms: Union[Transform, List[Transform], None] = None,\n                 mapping: Optional[Dict] = None,\n                 remapping: Optional[Dict] = None,\n                 auto_remap: Optional[bool] = None,\n                 allow_nonexist_keys: bool = False):\n\n        super().__init__()\n\n        self.allow_nonexist_keys 
= allow_nonexist_keys\n        self.mapping = mapping\n\n        if auto_remap is None:\n            auto_remap = remapping is None\n        self.auto_remap = auto_remap\n\n        if self.auto_remap:\n            if remapping is not None:\n                raise ValueError('KeyMapper: ``remapping`` must be None if'\n                                 '`auto_remap` is set True.')\n            self.remapping = mapping\n        else:\n            self.remapping = remapping\n\n        if transforms is None:\n            transforms = []\n        self.transforms = Compose(transforms)\n\n    def __iter__(self):\n        \"\"\"Allow easy iteration over the transform sequence.\"\"\"\n        return iter(self.transforms)\n\n    def _map_input(self, data: Dict,\n                   mapping: Optional[Dict]) -> Dict[str, Any]:\n        \"\"\"KeyMapper inputs for the wrapped transforms by gathering and\n        renaming data items according to the mapping.\n\n        Args:\n            data (dict): The original input data\n            mapping (dict, optional): The input key mapping. See the document\n                of ``mmcv.transforms.wrappers.KeyMapper`` for details. In\n                set None, return the input data directly.\n\n        Returns:\n            dict: The input data with remapped keys. 
This will be the actual\n                input of the wrapped pipeline.\n        \"\"\"\n\n        if mapping is None:\n            return data.copy()\n\n        def _map(data, m):\n            if isinstance(m, dict):\n                # m is a dict {inner_key:outer_key, ...}\n                return {k_in: _map(data, k_out) for k_in, k_out in m.items()}\n            if isinstance(m, (tuple, list)):\n                # m is a list or tuple [outer_key1, outer_key2, ...]\n                # This is the case when we collect items from the original\n                # data to form a list or tuple to feed to the wrapped\n                # transforms.\n                return m.__class__(_map(data, e) for e in m)\n\n            # allow manually mark a key to be ignored by ...\n            if m is ...:\n                return IgnoreKey\n\n            # m is an outer_key\n            if self.allow_nonexist_keys:\n                return data.get(m, IgnoreKey)\n            else:\n                return data.get(m)\n\n        collected = _map(data, mapping)\n\n        # Retain unmapped items\n        inputs = data.copy()\n        inputs.update(collected)\n\n        return inputs\n\n    def _map_output(self, data: Dict,\n                    remapping: Optional[Dict]) -> Dict[str, Any]:\n        \"\"\"KeyMapper outputs from the wrapped transforms by gathering and\n        renaming data items according to the remapping.\n\n        Args:\n            data (dict): The output of the wrapped pipeline.\n            remapping (dict, optional): The output key mapping. See the\n                document of ``mmcv.transforms.wrappers.KeyMapper`` for\n                details. 
If ``remapping is None``, no key mapping will be\n                applied but only remove the special token ``IgnoreKey``.\n\n        Returns:\n            dict: The output with remapped keys.\n        \"\"\"\n\n        # Remove ``IgnoreKey``\n        if remapping is None:\n            return {k: v for k, v in data.items() if v is not IgnoreKey}\n\n        def _map(data, m):\n            if isinstance(m, dict):\n                assert isinstance(data, dict)\n                results = {}\n                for k_in, k_out in m.items():\n                    assert k_in in data\n                    results.update(_map(data[k_in], k_out))\n                return results\n            if isinstance(m, (list, tuple)):\n                assert isinstance(data, (list, tuple))\n                assert len(data) == len(m)\n                results = {}\n                for m_i, d_i in zip(m, data):\n                    results.update(_map(d_i, m_i))\n                return results\n\n            # ``m is ...`` means the key is marked ignored, in which case the\n            # inner results will not affect the outer results in remapping.\n            # Another case that will have ``data is IgnoreKey`` is that the\n            # key is missing in the inputs. In this case, if the inner key is\n            # created by the wrapped transforms, it will be remapped to the\n            # corresponding outer key during remapping.\n            if m is ... or data is IgnoreKey:\n                return {}\n\n            return {m: data}\n\n        # Note that unmapped items are not retained, which is different from\n        # the behavior in _map_input. 
This is to avoid original data items\n        # being overwritten by intermediate namesakes\n        return _map(data, remapping)\n\n    def _apply_transforms(self, inputs: Dict) -> Dict:\n        \"\"\"Apply ``self.transforms``.\n\n        Note that the special token ``IgnoreKey`` will be invisible to\n        ``self.transforms``, but not removed in this method. It will be\n        eventually removed in :func:``self._map_output``.\n        \"\"\"\n        results = inputs.copy()\n        inputs = {k: v for k, v in inputs.items() if v is not IgnoreKey}\n        outputs = self.transforms(inputs)\n\n        if outputs is None:\n            raise ValueError(\n                f'Transforms wrapped by {self.__class__.__name__} should '\n                'not return None.')\n\n        results.update(outputs)  # type: ignore\n        return results\n\n    def transform(self, results: Dict) -> Dict:\n        \"\"\"Apply mapping, wrapped transforms and remapping.\"\"\"\n\n        # Apply mapping\n        inputs = self._map_input(results, self.mapping)\n        # Apply wrapped transforms\n        outputs = self._apply_transforms(inputs)\n        # Apply remapping\n        outputs = self._map_output(outputs, self.remapping)\n\n        results.update(outputs)  # type: ignore\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(transforms = {self.transforms}'\n        repr_str += f', mapping = {self.mapping}'\n        repr_str += f', remapping = {self.remapping}'\n        repr_str += f', auto_remap = {self.auto_remap}'\n        repr_str += f', allow_nonexist_keys = {self.allow_nonexist_keys})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass TransformBroadcaster(KeyMapper):\n    \"\"\"A transform wrapper to apply the wrapped transforms to multiple data\n    items. 
For example, apply Resize to multiple images.\n\n    Args:\n        transforms (list[dict | callable]): Sequence of transform object or\n            config dict to be wrapped.\n        mapping (dict): A dict that defines the input key mapping.\n            Note that to apply the transforms to multiple data items, the\n            outer keys of the target items should be remapped as a list with\n            the standard inner key (The key required by the wrapped transform).\n            See the following example and the document of\n            ``mmcv.transforms.wrappers.KeyMapper`` for details.\n        remapping (dict): A dict that defines the output key mapping.\n            The keys and values have the same meanings and rules as in the\n            ``mapping``. Default: None.\n        auto_remap (bool, optional): If True, an inverse of the mapping will\n            be used as the remapping. If auto_remap is not given, it will be\n            automatically set True if 'remapping' is not given, and vice\n            versa. Default: None.\n        allow_nonexist_keys (bool): If False, the outer keys in the mapping\n            must exist in the input data, or an exception will be raised.\n            Default: False.\n        share_random_params (bool): If True, the random transform\n            (e.g., RandomFlip) will be conducted in a deterministic way and\n            have the same behavior on all data items. For example, to randomly\n            flip either both input image and ground-truth image, or none.\n            Default: False.\n\n    .. note::\n        To apply the transforms to each elements of a list or tuple, instead\n        of separating data items, you can map the outer key of the target\n        sequence to the standard inner key. 
See example 2.\n\n    Examples:\n        >>> # Example 1: Broadcast to enumerated keys, each contains a single\n        >>> # data element\n        >>> pipeline = [\n        >>>     dict(type='LoadImageFromFile', key='lq'),  # low-quality img\n        >>>     dict(type='LoadImageFromFile', key='gt'),  # ground-truth img\n        >>>     # TransformBroadcaster maps multiple outer fields to standard\n        >>>     # the inner field and process them with wrapped transforms\n        >>>     # respectively\n        >>>     dict(type='TransformBroadcaster',\n        >>>         # case 1: from multiple outer fields\n        >>>         mapping={'img': ['lq', 'gt']},\n        >>>         auto_remap=True,\n        >>>         # share_random_params=True means using identical random\n        >>>         # parameters in every processing\n        >>>         share_random_params=True,\n        >>>         transforms=[\n        >>>             dict(type='Crop', crop_size=(384, 384)),\n        >>>             dict(type='Normalize'),\n        >>>         ])\n        >>> ]\n\n        >>> # Example 2: Broadcast to keys that contain data sequences\n        >>> pipeline = [\n        >>>     dict(type='LoadImageFromFile', key='lq'),  # low-quality img\n        >>>     dict(type='LoadImageFromFile', key='gt'),  # ground-truth img\n        >>>     # TransformBroadcaster maps multiple outer fields to standard\n        >>>     # the inner field and process them with wrapped transforms\n        >>>     # respectively\n        >>>     dict(type='TransformBroadcaster',\n        >>>         # case 2: from one outer field that contains multiple\n        >>>         # data elements (e.g. 
a list)\n        >>>         # mapping={'img': 'images'},\n        >>>         auto_remap=True,\n        >>>         share_random_param=True,\n        >>>         transforms=[\n        >>>             dict(type='Crop', crop_size=(384, 384)),\n        >>>             dict(type='Normalize'),\n        >>>         ])\n        >>> ]\n\n        >>> Example 3: Set ignored keys in broadcasting\n        >>> pipeline = [\n        >>>        dict(type='TransformBroadcaster',\n        >>>            # Broadcast the wrapped transforms to multiple images\n        >>>            # 'lq' and 'gt, but only update 'img_shape' once\n        >>>            mapping={\n        >>>                'img': ['lq', 'gt'],\n        >>>                'img_shape': ['img_shape', ...],\n        >>>             },\n        >>>            auto_remap=True,\n        >>>            share_random_params=True,\n        >>>            transforms=[\n        >>>                # `RandomCrop` will modify the field \"img\",\n        >>>                # and optionally update \"img_shape\" if it exists\n        >>>                dict(type='RandomCrop'),\n        >>>            ])\n        >>>    ]\n    \"\"\"\n\n    def __init__(self,\n                 transforms: List[Union[Dict, Callable[[Dict], Dict]]],\n                 mapping: Optional[Dict] = None,\n                 remapping: Optional[Dict] = None,\n                 auto_remap: Optional[bool] = None,\n                 allow_nonexist_keys: bool = False,\n                 share_random_params: bool = False):\n        super().__init__(transforms, mapping, remapping, auto_remap,\n                         allow_nonexist_keys)\n\n        self.share_random_params = share_random_params\n\n    def scatter_sequence(self, data: Dict) -> List[Dict]:\n        \"\"\"Scatter the broadcasting targets to a list of inputs of the wrapped\n        transforms.\"\"\"\n\n        # infer split number from input\n        seq_len = 0\n        key_rep = None\n\n        if 
self.mapping:\n            keys = self.mapping.keys()\n        else:\n            keys = data.keys()\n\n        for key in keys:\n            assert isinstance(data[key], Sequence)\n            if seq_len:\n                if len(data[key]) != seq_len:\n                    raise ValueError('Got inconsistent sequence length: '\n                                     f'{seq_len} ({key_rep}) vs. '\n                                     f'{len(data[key])} ({key})')\n            else:\n                seq_len = len(data[key])\n                key_rep = key\n\n        assert seq_len > 0, 'Fail to get the number of broadcasting targets'\n\n        scatters = []\n        for i in range(seq_len):  # type: ignore\n            scatter = data.copy()\n            for key in keys:\n                scatter[key] = data[key][i]\n            scatters.append(scatter)\n        return scatters\n\n    def transform(self, results: Dict):\n        \"\"\"Broadcast wrapped transforms to multiple targets.\"\"\"\n\n        # Apply input remapping\n        inputs = self._map_input(results, self.mapping)\n\n        # Scatter sequential inputs into a list\n        input_scatters = self.scatter_sequence(inputs)\n\n        # Control random parameter sharing with a context manager\n        if self.share_random_params:\n            # The context manager :func`:cache_random_params` will let\n            # cacheable method of the transforms cache their outputs. 
Thus\n            # the random parameters will only generated once and shared\n            # by all data items.\n            ctx = cache_random_params  # type: ignore\n        else:\n            ctx = nullcontext  # type: ignore\n\n        with ctx(self.transforms):\n            output_scatters = [\n                self._apply_transforms(_input) for _input in input_scatters\n            ]\n\n        # Collate output scatters (list of dict to dict of list)\n        outputs = {\n            key: [_output[key] for _output in output_scatters]\n            for key in output_scatters[0]\n        }\n\n        # Apply remapping\n        outputs = self._map_output(outputs, self.remapping)\n\n        results.update(outputs)\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(transforms = {self.transforms}'\n        repr_str += f', mapping = {self.mapping}'\n        repr_str += f', remapping = {self.remapping}'\n        repr_str += f', auto_remap = {self.auto_remap}'\n        repr_str += f', allow_nonexist_keys = {self.allow_nonexist_keys}'\n        repr_str += f', share_random_params = {self.share_random_params})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomChoice(BaseTransform):\n    \"\"\"Process data with a randomly chosen transform from given candidates.\n\n    Args:\n        transforms (list[list]): A list of transform candidates, each is a\n            sequence of transforms.\n        prob (list[float], optional): The probabilities associated\n            with each pipeline. The length should be equal to the pipeline\n            number and the sum should be 1. 
If not given, a uniform\n            distribution will be assumed.\n\n    Examples:\n        >>> # config\n        >>> pipeline = [\n        >>>     dict(type='RandomChoice',\n        >>>         transforms=[\n        >>>             [dict(type='RandomHorizontalFlip')],  # subpipeline 1\n        >>>             [dict(type='RandomRotate')],  # subpipeline 2\n        >>>         ]\n        >>>     )\n        >>> ]\n    \"\"\"\n\n    def __init__(self,\n                 transforms: List[Union[Transform, List[Transform]]],\n                 prob: Optional[List[float]] = None):\n\n        super().__init__()\n\n        if prob is not None:\n            assert mmengine.is_seq_of(prob, float)\n            assert len(transforms) == len(prob), \\\n                '``transforms`` and ``prob`` must have same lengths. ' \\\n                f'Got {len(transforms)} vs {len(prob)}.'\n            assert sum(prob) == 1\n\n        self.prob = prob\n        self.transforms = [Compose(transforms) for transforms in transforms]\n\n    def __iter__(self):\n        return iter(self.transforms)\n\n    @cache_randomness\n    def random_pipeline_index(self) -> int:\n        \"\"\"Return a random transform index.\"\"\"\n        indices = np.arange(len(self.transforms))\n        return np.random.choice(indices, p=self.prob)\n\n    def transform(self, results: Dict) -> Optional[Dict]:\n        \"\"\"Randomly choose a transform to apply.\"\"\"\n        idx = self.random_pipeline_index()\n        return self.transforms[idx](results)\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(transforms = {self.transforms}'\n        repr_str += f'prob = {self.prob})'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomApply(BaseTransform):\n    \"\"\"Apply transforms randomly with a given probability.\n\n    Args:\n        transforms (list[dict | callable]): The transform or transform list\n            to randomly apply.\n        prob 
(float): The probability to apply transforms. Default: 0.5\n\n    Examples:\n        >>> # config\n        >>> pipeline = [\n        >>>     dict(type='RandomApply',\n        >>>         transforms=[dict(type='HorizontalFlip')],\n        >>>         prob=0.3)\n        >>> ]\n    \"\"\"\n\n    def __init__(self,\n                 transforms: Union[Transform, List[Transform]],\n                 prob: float = 0.5):\n\n        super().__init__()\n        self.prob = prob\n        self.transforms = Compose(transforms)\n\n    def __iter__(self):\n        return iter(self.transforms)\n\n    @cache_randomness\n    def random_apply(self) -> bool:\n        \"\"\"Return a random bool value indicating whether to apply the\n        transform.\"\"\"\n        return np.random.rand() < self.prob\n\n    def transform(self, results: Dict) -> Optional[Dict]:\n        \"\"\"Randomly apply the transform.\"\"\"\n        if self.random_apply():\n            return self.transforms(results)  # type: ignore\n        else:\n            return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'(transforms = {self.transforms}'\n        repr_str += f', prob = {self.prob})'\n        return repr_str\n"
  },
  {
    "path": "mmcv/utils/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .device_type import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE,\n                          IS_MPS_AVAILABLE, IS_MUSA_AVAILABLE,\n                          IS_NPU_AVAILABLE)\nfrom .env import collect_env\nfrom .parrots_jit import jit, skip_no_elena\n\n__all__ = [\n    'IS_MLU_AVAILABLE', 'IS_MPS_AVAILABLE', 'IS_CUDA_AVAILABLE',\n    'IS_NPU_AVAILABLE', 'IS_MUSA_AVAILABLE', 'collect_env', 'jit',\n    'skip_no_elena'\n]\n"
  },
  {
    "path": "mmcv/utils/device_type.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom mmengine.device import (is_cuda_available, is_mlu_available,\n                             is_mps_available, is_musa_available,\n                             is_npu_available)\n\nIS_MLU_AVAILABLE = is_mlu_available()\nIS_MPS_AVAILABLE = is_mps_available()\nIS_CUDA_AVAILABLE = is_cuda_available()\nIS_NPU_AVAILABLE = is_npu_available()\nIS_MUSA_AVAILABLE = is_musa_available()\n"
  },
  {
    "path": "mmcv/utils/env.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\"\"\"This file holding some environment constant for sharing by other files.\"\"\"\n\nimport os.path as osp\nimport subprocess\n\nimport torch\nfrom mmengine.utils.dl_utils import collect_env as mmengine_collect_env\n\nimport mmcv\n\n\ndef collect_env():\n    \"\"\"Collect the information of the running environments.\n\n    Returns:\n        dict: The environment information. The following fields are contained.\n\n            - sys.platform: The variable of ``sys.platform``.\n            - Python: Python version.\n            - CUDA available: Bool, indicating if CUDA is available.\n            - GPU devices: Device type of each GPU.\n            - CUDA_HOME (optional): The env var ``CUDA_HOME``.\n            - NVCC (optional): NVCC version.\n            - GCC: GCC version, \"n/a\" if GCC is not installed.\n            - MSVC: Microsoft Virtual C++ Compiler version, Windows only.\n            - PyTorch: PyTorch version.\n            - PyTorch compiling details: The output of \\\n                ``torch.__config__.show()``.\n            - TorchVision (optional): TorchVision version.\n            - OpenCV: OpenCV version.\n            - MMEngine: MMEngine version.\n            - MMCV: MMCV version.\n            - MMCV Compiler: The GCC version for compiling MMCV ops.\n            - MMCV CUDA Compiler: The CUDA version for compiling MMCV ops.\n    \"\"\"\n    env_info = mmengine_collect_env()\n\n    # MMEngine does not add the hipcc compiler information when collecting\n    # environment information, so it is added here. 
When MMEngine v0.3.0 is\n    # released, the code here can be removed.\n    cuda_available = torch.cuda.is_available()\n    if cuda_available and env_info.get('NVCC') == 'Not Available':\n        CUDA_HOME = env_info['CUDA_HOME']\n        if CUDA_HOME is not None and osp.isdir(CUDA_HOME):\n            if CUDA_HOME == '/opt/rocm':\n                try:\n                    nvcc = osp.join(CUDA_HOME, 'hip/bin/hipcc')\n                    nvcc = subprocess.check_output(\n                        f'\"{nvcc}\" --version', shell=True)\n                    nvcc = nvcc.decode('utf-8').strip()\n                    release = nvcc.rfind('HIP version:')\n                    build = nvcc.rfind('')\n                    nvcc = nvcc[release:build].strip()\n                except subprocess.SubprocessError:\n                    nvcc = 'Not Available'\n            else:\n                try:\n                    nvcc = osp.join(CUDA_HOME, 'bin/nvcc')\n                    nvcc = subprocess.check_output(f'\"{nvcc}\" -V', shell=True)\n                    nvcc = nvcc.decode('utf-8').strip()\n                    release = nvcc.rfind('Cuda compilation tools')\n                    build = nvcc.rfind('Build ')\n                    nvcc = nvcc[release:build].strip()\n                except subprocess.SubprocessError:\n                    nvcc = 'Not Available'\n            env_info['NVCC'] = nvcc\n\n    env_info['MMCV'] = mmcv.__version__\n\n    try:\n        from mmcv.ops import get_compiler_version, get_compiling_cuda_version\n    except ModuleNotFoundError:\n        env_info['MMCV Compiler'] = 'n/a'\n        env_info['MMCV CUDA Compiler'] = 'n/a'\n    else:\n        env_info['MMCV Compiler'] = get_compiler_version()\n        env_info['MMCV CUDA Compiler'] = get_compiling_cuda_version()\n\n    return env_info\n"
  },
  {
    "path": "mmcv/utils/ext_loader.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport importlib\nimport os\nimport pkgutil\nimport warnings\nfrom collections import namedtuple\n\nimport torch\n\nif torch.__version__ != 'parrots':\n\n    def load_ext(name, funcs):\n        ext = importlib.import_module('mmcv.' + name)\n        for fun in funcs:\n            assert hasattr(ext, fun), f'{fun} miss in module {name}'\n        return ext\nelse:\n    from parrots import extension\n    from parrots.base import ParrotsException\n\n    has_return_value_ops = [\n        'nms',\n        'softnms',\n        'nms_match',\n        'nms_rotated',\n        'top_pool_forward',\n        'top_pool_backward',\n        'bottom_pool_forward',\n        'bottom_pool_backward',\n        'left_pool_forward',\n        'left_pool_backward',\n        'right_pool_forward',\n        'right_pool_backward',\n        'fused_bias_leakyrelu',\n        'upfirdn2d',\n        'ms_deform_attn_forward',\n        'pixel_group',\n        'contour_expand',\n        'diff_iou_rotated_sort_vertices_forward',\n    ]\n\n    def get_fake_func(name, e):\n\n        def fake_func(*args, **kwargs):\n            warnings.warn(f'{name} is not supported in parrots now')\n            raise e\n\n        return fake_func\n\n    def load_ext(name, funcs):\n        ExtModule = namedtuple('ExtModule', funcs)\n        ext_list = []\n        lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))\n        for fun in funcs:\n            try:\n                ext_fun = extension.load(fun, name, lib_dir=lib_root)\n            except ParrotsException as e:\n                if 'No element registered' not in e.message:\n                    warnings.warn(e.message)\n                ext_fun = get_fake_func(fun, e)\n                ext_list.append(ext_fun)\n            else:\n                if fun in has_return_value_ops:\n                    ext_list.append(ext_fun.op)\n                else:\n                    
ext_list.append(ext_fun.op_)\n        return ExtModule(*ext_list)\n\n\ndef check_ops_exist() -> bool:\n    ext_loader = pkgutil.find_loader('mmcv._ext')\n    return ext_loader is not None\n"
  },
  {
    "path": "mmcv/utils/parrots_jit.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\n\nfrom mmengine.utils.dl_utils.parrots_wrapper import TORCH_VERSION\n\nparrots_jit_option = os.getenv('PARROTS_JIT_OPTION')\n\nif TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON':\n    from parrots.jit import pat as jit\nelse:\n\n    def jit(func=None,\n            check_input=None,\n            full_shape=True,\n            derivate=False,\n            coderize=False,\n            optimize=False):\n\n        def wrapper(func):\n\n            def wrapper_inner(*args, **kargs):\n                return func(*args, **kargs)\n\n            return wrapper_inner\n\n        if func is None:\n            return wrapper\n        else:\n            return func\n\n\nif TORCH_VERSION == 'parrots':\n    from parrots.utils.tester import skip_no_elena\nelse:\n\n    def skip_no_elena(func):\n\n        def wrapper(*args, **kargs):\n            return func(*args, **kargs)\n\n        return wrapper\n"
  },
  {
    "path": "mmcv/version.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n__version__ = '2.2.0'\n\n\ndef parse_version_info(version_str: str, length: int = 4) -> tuple:\n    \"\"\"Parse a version string into a tuple.\n\n    Args:\n        version_str (str): The version string.\n        length (int): The maximum number of version levels. Default: 4.\n\n    Returns:\n        tuple[int | str]: The version info, e.g., \"1.3.0\" is parsed into\n            (1, 3, 0, 0, 0, 0), and \"2.0.0rc1\" is parsed into\n            (2, 0, 0, 0, 'rc', 1) (when length is set to 4).\n    \"\"\"\n    from packaging.version import parse\n    version = parse(version_str)\n    assert version.release, f'failed to parse version {version_str}'\n    release = list(version.release)\n    release = release[:length]\n    if len(release) < length:\n        release = release + [0] * (length - len(release))\n    if version.is_prerelease:\n        release.extend(list(version.pre))  # type: ignore\n    elif version.is_postrelease:\n        release.extend(list(version.post))  # type: ignore\n    else:\n        release.extend([0, 0])\n    return tuple(release)\n\n\nversion_info = tuple(int(x) for x in __version__.split('.')[:3])\n\n__all__ = ['__version__', 'version_info', 'parse_version_info']\n"
  },
  {
    "path": "mmcv/video/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .io import Cache, VideoReader, frames2video\nfrom .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread,\n                      flowwrite, quantize_flow, sparse_flow_from_bytes)\nfrom .processing import concat_video, convert_video, cut_video, resize_video\n\n__all__ = [\n    'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video',\n    'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow',\n    'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes'\n]\n"
  },
  {
    "path": "mmcv/video/io.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os.path as osp\nfrom collections import OrderedDict\n\nimport cv2\nfrom cv2 import (CAP_PROP_FOURCC, CAP_PROP_FPS, CAP_PROP_FRAME_COUNT,\n                 CAP_PROP_FRAME_HEIGHT, CAP_PROP_FRAME_WIDTH,\n                 CAP_PROP_POS_FRAMES, VideoWriter_fourcc)\nfrom mmengine.utils import (check_file_exist, mkdir_or_exist, scandir,\n                            track_progress)\n\n\nclass Cache:\n\n    def __init__(self, capacity):\n        self._cache = OrderedDict()\n        self._capacity = int(capacity)\n        if capacity <= 0:\n            raise ValueError('capacity must be a positive integer')\n\n    @property\n    def capacity(self):\n        return self._capacity\n\n    @property\n    def size(self):\n        return len(self._cache)\n\n    def put(self, key, val):\n        if key in self._cache:\n            return\n        if len(self._cache) >= self.capacity:\n            self._cache.popitem(last=False)\n        self._cache[key] = val\n\n    def get(self, key, default=None):\n        val = self._cache[key] if key in self._cache else default\n        return val\n\n\nclass VideoReader:\n    \"\"\"Video class with similar usage to a list object.\n\n    This video wrapper class provides convenient apis to access frames.\n    There exists an issue of OpenCV's VideoCapture class that jumping to a\n    certain frame may be inaccurate. It is fixed in this class by checking\n    the position after jumping each time.\n    Cache is used when decoding videos. 
So if the same frame is visited for\n    the second time, there is no need to decode again if it is stored in the\n    cache.\n\n    Examples:\n        >>> import mmcv\n        >>> v = mmcv.VideoReader('sample.mp4')\n        >>> len(v)  # get the total frame number with `len()`\n        120\n        >>> for img in v:  # v is iterable\n        >>>     mmcv.imshow(img)\n        >>> v[5]  # get the 6th frame\n    \"\"\"\n\n    def __init__(self, filename, cache_capacity=10):\n        # Check whether the video path is a url\n        if not filename.startswith(('https://', 'http://')):\n            check_file_exist(filename, 'Video file not found: ' + filename)\n        self._vcap = cv2.VideoCapture(filename)\n        assert cache_capacity > 0\n        self._cache = Cache(cache_capacity)\n        self._position = 0\n        # get basic info\n        self._width = int(self._vcap.get(CAP_PROP_FRAME_WIDTH))\n        self._height = int(self._vcap.get(CAP_PROP_FRAME_HEIGHT))\n        self._fps = self._vcap.get(CAP_PROP_FPS)\n        self._frame_cnt = int(self._vcap.get(CAP_PROP_FRAME_COUNT))\n        self._fourcc = self._vcap.get(CAP_PROP_FOURCC)\n\n    @property\n    def vcap(self):\n        \"\"\":obj:`cv2.VideoCapture`: The raw VideoCapture object.\"\"\"\n        return self._vcap\n\n    @property\n    def opened(self):\n        \"\"\"bool: Indicate whether the video is opened.\"\"\"\n        return self._vcap.isOpened()\n\n    @property\n    def width(self):\n        \"\"\"int: Width of video frames.\"\"\"\n        return self._width\n\n    @property\n    def height(self):\n        \"\"\"int: Height of video frames.\"\"\"\n        return self._height\n\n    @property\n    def resolution(self):\n        \"\"\"tuple: Video resolution (width, height).\"\"\"\n        return (self._width, self._height)\n\n    @property\n    def fps(self):\n        \"\"\"float: FPS of the video.\"\"\"\n        return self._fps\n\n    @property\n    def frame_cnt(self):\n        \"\"\"int: 
Total frames of the video.\"\"\"\n        return self._frame_cnt\n\n    @property\n    def fourcc(self):\n        \"\"\"str: \"Four character code\" of the video.\"\"\"\n        return self._fourcc\n\n    @property\n    def position(self):\n        \"\"\"int: Current cursor position, indicating frame decoded.\"\"\"\n        return self._position\n\n    def _get_real_position(self):\n        return int(round(self._vcap.get(CAP_PROP_POS_FRAMES)))\n\n    def _set_real_position(self, frame_id):\n        self._vcap.set(CAP_PROP_POS_FRAMES, frame_id)\n        pos = self._get_real_position()\n        for _ in range(frame_id - pos):\n            self._vcap.read()\n        self._position = frame_id\n\n    def read(self):\n        \"\"\"Read the next frame.\n\n        If the next frame have been decoded before and in the cache, then\n        return it directly, otherwise decode, cache and return it.\n\n        Returns:\n            ndarray or None: Return the frame if successful, otherwise None.\n        \"\"\"\n        # pos = self._position\n        if self._cache:\n            img = self._cache.get(self._position)\n            if img is not None:\n                ret = True\n            else:\n                if self._position != self._get_real_position():\n                    self._set_real_position(self._position)\n                ret, img = self._vcap.read()\n                if ret:\n                    self._cache.put(self._position, img)\n        else:\n            ret, img = self._vcap.read()\n        if ret:\n            self._position += 1\n        return img\n\n    def get_frame(self, frame_id):\n        \"\"\"Get frame by index.\n\n        Args:\n            frame_id (int): Index of the expected frame, 0-based.\n\n        Returns:\n            ndarray or None: Return the frame if successful, otherwise None.\n        \"\"\"\n        if frame_id < 0 or frame_id >= self._frame_cnt:\n            raise IndexError(\n                f'\"frame_id\" must be between 0 and 
{self._frame_cnt - 1}')\n        if frame_id == self._position:\n            return self.read()\n        if self._cache:\n            img = self._cache.get(frame_id)\n            if img is not None:\n                self._position = frame_id + 1\n                return img\n        self._set_real_position(frame_id)\n        ret, img = self._vcap.read()\n        if ret:\n            if self._cache:\n                self._cache.put(self._position, img)\n            self._position += 1\n        return img\n\n    def current_frame(self):\n        \"\"\"Get the current frame (frame that is just visited).\n\n        Returns:\n            ndarray or None: If the video is fresh, return None, otherwise\n            return the frame.\n        \"\"\"\n        if self._position == 0:\n            return None\n        return self._cache.get(self._position - 1)\n\n    def cvt2frames(self,\n                   frame_dir,\n                   file_start=0,\n                   filename_tmpl='{:06d}.jpg',\n                   start=0,\n                   max_num=0,\n                   show_progress=True):\n        \"\"\"Convert a video to frame images.\n\n        Args:\n            frame_dir (str): Output directory to store all the frame images.\n            file_start (int): Filenames will start from the specified number.\n            filename_tmpl (str): Filename template with the index as the\n                placeholder.\n            start (int): The starting frame index.\n            max_num (int): Maximum number of frames to be written.\n            show_progress (bool): Whether to show a progress bar.\n        \"\"\"\n        mkdir_or_exist(frame_dir)\n        if max_num == 0:\n            task_num = self.frame_cnt - start\n        else:\n            task_num = min(self.frame_cnt - start, max_num)\n        if task_num <= 0:\n            raise ValueError('start must be less than total frame number')\n        if start > 0:\n            self._set_real_position(start)\n\n        def 
write_frame(file_idx):\n            img = self.read()\n            if img is None:\n                return\n            filename = osp.join(frame_dir, filename_tmpl.format(file_idx))\n            cv2.imwrite(filename, img)\n\n        if show_progress:\n            track_progress(write_frame, range(file_start,\n                                              file_start + task_num))\n        else:\n            for i in range(task_num):\n                write_frame(file_start + i)\n\n    def __len__(self):\n        return self.frame_cnt\n\n    def __getitem__(self, index):\n        if isinstance(index, slice):\n            return [\n                self.get_frame(i)\n                for i in range(*index.indices(self.frame_cnt))\n            ]\n        # support negative indexing\n        if index < 0:\n            index += self.frame_cnt\n            if index < 0:\n                raise IndexError('index out of range')\n        return self.get_frame(index)\n\n    def __iter__(self):\n        self._set_real_position(0)\n        return self\n\n    def __next__(self):\n        img = self.read()\n        if img is not None:\n            return img\n        else:\n            raise StopIteration\n\n    next = __next__\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        self._vcap.release()\n\n\ndef frames2video(frame_dir: str,\n                 video_file: str,\n                 fps: float = 30,\n                 fourcc: str = 'XVID',\n                 filename_tmpl: str = '{:06d}.jpg',\n                 start: int = 0,\n                 end: int = 0,\n                 show_progress: bool = True) -> None:\n    \"\"\"Read the frame images from a directory and join them as a video.\n\n    Args:\n        frame_dir (str): The directory containing video frames.\n        video_file (str): Output filename.\n        fps (float): FPS of the output video.\n        fourcc (str): Fourcc of the output video, this should be 
compatible\n            with the output file type.\n        filename_tmpl (str): Filename template with the index as the variable.\n        start (int): Starting frame index.\n        end (int): Ending frame index.\n        show_progress (bool): Whether to show a progress bar.\n    \"\"\"\n    if end == 0:\n        ext = filename_tmpl.split('.')[-1]\n        end = len([name for name in scandir(frame_dir, ext)])\n    first_file = osp.join(frame_dir, filename_tmpl.format(start))\n    check_file_exist(first_file, 'The start frame not found: ' + first_file)\n    img = cv2.imread(first_file)\n    height, width = img.shape[:2]\n    resolution = (width, height)\n    vwriter = cv2.VideoWriter(video_file, VideoWriter_fourcc(*fourcc), fps,\n                              resolution)\n\n    def write_frame(file_idx):\n        filename = osp.join(frame_dir, filename_tmpl.format(file_idx))\n        img = cv2.imread(filename)\n        vwriter.write(img)\n\n    if show_progress:\n        track_progress(write_frame, range(start, end))\n    else:\n        for i in range(start, end):\n            write_frame(i)\n    vwriter.release()\n"
  },
  {
    "path": "mmcv/video/optflow.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nfrom typing import Tuple, Union\n\nimport cv2\nimport numpy as np\nfrom mmengine.utils import is_str\n\nfrom mmcv.arraymisc import dequantize, quantize\nfrom mmcv.image import imread, imwrite\n\n\ndef flowread(flow_or_path: Union[np.ndarray, str],\n             quantize: bool = False,\n             concat_axis: int = 0,\n             *args,\n             **kwargs) -> np.ndarray:\n    \"\"\"Read an optical flow map.\n\n    Args:\n        flow_or_path (ndarray or str): A flow map or filepath.\n        quantize (bool): whether to read quantized pair, if set to True,\n            remaining args will be passed to :func:`dequantize_flow`.\n        concat_axis (int): The axis that dx and dy are concatenated,\n            can be either 0 or 1. Ignored if quantize is False.\n\n    Returns:\n        ndarray: Optical flow represented as a (h, w, 2) numpy array\n    \"\"\"\n    if isinstance(flow_or_path, np.ndarray):\n        if (flow_or_path.ndim != 3) or (flow_or_path.shape[-1] != 2):\n            raise ValueError(f'Invalid flow with shape {flow_or_path.shape}')\n        return flow_or_path\n    elif not is_str(flow_or_path):\n        raise TypeError(f'\"flow_or_path\" must be a filename or numpy array, '\n                        f'not {type(flow_or_path)}')\n\n    if not quantize:\n        with open(flow_or_path, 'rb') as f:\n            try:\n                header = f.read(4).decode('utf-8')\n            except Exception:\n                raise OSError(f'Invalid flow file: {flow_or_path}')\n            else:\n                if header != 'PIEH':\n                    raise OSError(f'Invalid flow file: {flow_or_path}, '\n                                  'header does not contain PIEH')\n\n            w = np.fromfile(f, np.int32, 1).squeeze()\n            h = np.fromfile(f, np.int32, 1).squeeze()\n            flow = np.fromfile(f, np.float32, w * h * 2).reshape((h, w, 2))\n    else:\n        
assert concat_axis in [0, 1]\n        cat_flow = imread(flow_or_path, flag='unchanged')\n        if cat_flow.ndim != 2:\n            raise OSError(\n                f'{flow_or_path} is not a valid quantized flow file, '\n                f'its dimension is {cat_flow.ndim}.')\n        assert cat_flow.shape[concat_axis] % 2 == 0\n        dx, dy = np.split(cat_flow, 2, axis=concat_axis)\n        flow = dequantize_flow(dx, dy, *args, **kwargs)\n\n    return flow.astype(np.float32)\n\n\ndef flowwrite(flow: np.ndarray,\n              filename: str,\n              quantize: bool = False,\n              concat_axis: int = 0,\n              *args,\n              **kwargs) -> None:\n    \"\"\"Write optical flow to file.\n\n    If the flow is not quantized, it will be saved as a .flo file losslessly,\n    otherwise a jpeg image which is lossy but of much smaller size. (dx and dy\n    will be concatenated horizontally into a single image if quantize is True.)\n\n    Args:\n        flow (ndarray): (h, w, 2) array of optical flow.\n        filename (str): Output filepath.\n        quantize (bool): Whether to quantize the flow and save it to 2 jpeg\n            images. If set to True, remaining args will be passed to\n            :func:`quantize_flow`.\n        concat_axis (int): The axis that dx and dy are concatenated,\n            can be either 0 or 1. 
Ignored if quantize is False.\n    \"\"\"\n    if not quantize:\n        with open(filename, 'wb') as f:\n            f.write(b'PIEH')\n            np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)\n            flow = flow.astype(np.float32)\n            flow.tofile(f)\n            f.flush()\n    else:\n        assert concat_axis in [0, 1]\n        dx, dy = quantize_flow(flow, *args, **kwargs)\n        dxdy = np.concatenate((dx, dy), axis=concat_axis)\n        imwrite(dxdy, filename)\n\n\ndef quantize_flow(flow: np.ndarray,\n                  max_val: float = 0.02,\n                  norm: bool = True) -> tuple:\n    \"\"\"Quantize flow to [0, 255].\n\n    After this step, the size of flow will be much smaller, and can be\n    dumped as jpeg images.\n\n    Args:\n        flow (ndarray): (h, w, 2) array of optical flow.\n        max_val (float): Maximum value of flow, values beyond\n                        [-max_val, max_val] will be truncated.\n        norm (bool): Whether to divide flow values by image width/height.\n\n    Returns:\n        tuple[ndarray]: Quantized dx and dy.\n    \"\"\"\n    h, w, _ = flow.shape\n    dx = flow[..., 0]\n    dy = flow[..., 1]\n    if norm:\n        dx = dx / w  # avoid inplace operations\n        dy = dy / h\n    # use 255 levels instead of 256 to make sure 0 is 0 after dequantization.\n    flow_comps = [\n        quantize(d, -max_val, max_val, 255, np.uint8) for d in [dx, dy]\n    ]\n    return tuple(flow_comps)\n\n\ndef dequantize_flow(dx: np.ndarray,\n                    dy: np.ndarray,\n                    max_val: float = 0.02,\n                    denorm: bool = True) -> np.ndarray:\n    \"\"\"Recover from quantized flow.\n\n    Args:\n        dx (ndarray): Quantized dx.\n        dy (ndarray): Quantized dy.\n        max_val (float): Maximum value used when quantizing.\n        denorm (bool): Whether to multiply flow values with width/height.\n\n    Returns:\n        ndarray: Dequantized flow.\n    \"\"\"\n   
 assert dx.shape == dy.shape\n    assert dx.ndim == 2 or (dx.ndim == 3 and dx.shape[-1] == 1)\n\n    dx, dy = (dequantize(d, -max_val, max_val, 255) for d in [dx, dy])\n\n    if denorm:\n        dx *= dx.shape[1]  # type: ignore\n        dy *= dx.shape[0]  # type: ignore\n    flow = np.dstack((dx, dy))\n    return flow\n\n\ndef flow_warp(img: np.ndarray,\n              flow: np.ndarray,\n              filling_value: int = 0,\n              interpolate_mode: str = 'nearest') -> np.ndarray:\n    \"\"\"Use flow to warp img.\n\n    Args:\n        img (ndarray): Image to be warped.\n        flow (ndarray): Optical Flow.\n        filling_value (int): The missing pixels will be set with filling_value.\n        interpolate_mode (str): bilinear -> Bilinear Interpolation;\n                                nearest -> Nearest Neighbor.\n\n    Returns:\n        ndarray: Warped image with the same shape of img\n    \"\"\"\n    warnings.warn('This function is just for prototyping and cannot '\n                  'guarantee the computational efficiency.')\n    assert flow.ndim == 3, 'Flow must be in 3D arrays.'\n    height = flow.shape[0]\n    width = flow.shape[1]\n    channels = img.shape[2]\n\n    output = np.ones(\n        (height, width, channels), dtype=img.dtype) * filling_value\n\n    grid = np.indices((height, width)).swapaxes(0, 1).swapaxes(1, 2)\n    dx = grid[:, :, 0] + flow[:, :, 1]\n    dy = grid[:, :, 1] + flow[:, :, 0]\n    sx = np.floor(dx).astype(int)\n    sy = np.floor(dy).astype(int)\n    valid = (sx >= 0) & (sx < height - 1) & (sy >= 0) & (sy < width - 1)\n\n    if interpolate_mode == 'nearest':\n        output[valid, :] = img[dx[valid].round().astype(int),\n                               dy[valid].round().astype(int), :]\n    elif interpolate_mode == 'bilinear':\n        # dirty walkround for integer positions\n        eps_ = 1e-6\n        dx, dy = dx + eps_, dy + eps_\n        left_top_ = img[np.floor(dx[valid]).astype(int),\n                        
np.floor(dy[valid]).astype(int), :] * (\n                            np.ceil(dx[valid]) - dx[valid])[:, None] * (\n                                np.ceil(dy[valid]) - dy[valid])[:, None]\n        left_down_ = img[np.ceil(dx[valid]).astype(int),\n                         np.floor(dy[valid]).astype(int), :] * (\n                             dx[valid] - np.floor(dx[valid]))[:, None] * (\n                                 np.ceil(dy[valid]) - dy[valid])[:, None]\n        right_top_ = img[np.floor(dx[valid]).astype(int),\n                         np.ceil(dy[valid]).astype(int), :] * (\n                             np.ceil(dx[valid]) - dx[valid])[:, None] * (\n                                 dy[valid] - np.floor(dy[valid]))[:, None]\n        right_down_ = img[np.ceil(dx[valid]).astype(int),\n                          np.ceil(dy[valid]).astype(int), :] * (\n                              dx[valid] - np.floor(dx[valid]))[:, None] * (\n                                  dy[valid] - np.floor(dy[valid]))[:, None]\n        output[valid, :] = left_top_ + left_down_ + right_top_ + right_down_\n    else:\n        raise NotImplementedError(\n            'We only support interpolation modes of nearest and bilinear, '\n            f'but got {interpolate_mode}.')\n    return output.astype(img.dtype)\n\n\ndef flow_from_bytes(content: bytes) -> np.ndarray:\n    \"\"\"Read dense optical flow from bytes.\n\n    .. 
note::\n        This flow-loading function works for FlyingChairs, FlyingThings3D,\n        Sintel, FlyingChairsOcc datasets, but cannot load the data from\n        ChairsSDHom.\n\n    Args:\n        content (bytes): Optical flow bytes got from files or other streams.\n\n    Returns:\n        ndarray: Loaded optical flow with the shape (H, W, 2).\n    \"\"\"\n\n    # header in first 4 bytes\n    header = content[:4]\n    if header.decode('utf-8') != 'PIEH':\n        raise Exception('Flow file header does not contain PIEH')\n    # width in second 4 bytes\n    width = np.frombuffer(content[4:], np.int32, 1).squeeze()\n    # height in third 4 bytes\n    height = np.frombuffer(content[8:], np.int32, 1).squeeze()\n    # after first 12 bytes, all bytes are flow\n    flow = np.frombuffer(content[12:], np.float32, width * height * 2).reshape(\n        (height, width, 2))\n\n    return flow\n\n\ndef sparse_flow_from_bytes(content: bytes) -> Tuple[np.ndarray, np.ndarray]:\n    \"\"\"Read the optical flow in KITTI datasets from bytes.\n\n    This function is modified from RAFT's loader for the `KITTI datasets\n    <https://github.com/princeton-vl/RAFT/blob/224320502d66c356d88e6c712f38129e60661e80/core/utils/frame_utils.py#L102>`_.\n\n    Args:\n        content (bytes): Optical flow bytes got from files or other streams.\n\n    Returns:\n        Tuple(ndarray, ndarray): Loaded optical flow with the shape (H, W, 2)\n        and flow valid mask with the shape (H, W).\n    \"\"\"  # noqa\n\n    content = np.frombuffer(content, np.uint8)\n    flow = cv2.imdecode(content, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR)\n    flow = flow[:, :, ::-1].astype(np.float32)\n    # flow shape (H, W, 2) valid shape (H, W)\n    flow, valid = flow[:, :, :2], flow[:, :, 2]\n    flow = (flow - 2**15) / 64.0\n    return flow, valid\n"
  },
  {
    "path": "mmcv/video/processing.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nimport os.path as osp\nimport subprocess\nimport tempfile\nfrom typing import List, Optional, Union\n\nfrom mmengine.utils import requires_executable\n\n\n@requires_executable('ffmpeg')\ndef convert_video(in_file: str,\n                  out_file: str,\n                  print_cmd: bool = False,\n                  pre_options: str = '',\n                  **kwargs) -> None:\n    \"\"\"Convert a video with ffmpeg.\n\n    This provides a general api to ffmpeg, the executed command is::\n\n        `ffmpeg -y <pre_options> -i <in_file> <options> <out_file>`\n\n    Options(kwargs) are mapped to ffmpeg commands with the following rules:\n\n    - key=val: \"-key val\"\n    - key=True: \"-key\"\n    - key=False: \"\"\n\n    Args:\n        in_file (str): Input video filename.\n        out_file (str): Output video filename.\n        pre_options (str): Options appears before \"-i <in_file>\".\n        print_cmd (bool): Whether to print the final ffmpeg command.\n    \"\"\"\n    options = []\n    for k, v in kwargs.items():\n        if isinstance(v, bool):\n            if v:\n                options.append(f'-{k}')\n        elif k == 'log_level':\n            assert v in [\n                'quiet', 'panic', 'fatal', 'error', 'warning', 'info',\n                'verbose', 'debug', 'trace'\n            ]\n            options.append(f'-loglevel {v}')\n        else:\n            options.append(f'-{k} {v}')\n    cmd = f'ffmpeg -y {pre_options} -i {in_file} {\" \".join(options)} ' \\\n          f'{out_file}'\n    if print_cmd:\n        print(cmd)\n    subprocess.call(cmd, shell=True)\n\n\n@requires_executable('ffmpeg')\ndef resize_video(in_file: str,\n                 out_file: str,\n                 size: Optional[tuple] = None,\n                 ratio: Union[tuple, float, None] = None,\n                 keep_ar: bool = False,\n                 log_level: str = 'info',\n                 print_cmd: bool = 
False) -> None:\n    \"\"\"Resize a video.\n\n    Args:\n        in_file (str): Input video filename.\n        out_file (str): Output video filename.\n        size (tuple): Expected size (w, h), eg, (320, 240) or (320, -1).\n        ratio (tuple or float): Expected resize ratio, (2, 0.5) means\n            (w*2, h*0.5).\n        keep_ar (bool): Whether to keep original aspect ratio.\n        log_level (str): Logging level of ffmpeg.\n        print_cmd (bool): Whether to print the final ffmpeg command.\n    \"\"\"\n    if size is None and ratio is None:\n        raise ValueError('expected size or ratio must be specified')\n    if size is not None and ratio is not None:\n        raise ValueError('size and ratio cannot be specified at the same time')\n    options = {'log_level': log_level}\n    if size:\n        if not keep_ar:\n            options['vf'] = f'scale={size[0]}:{size[1]}'\n        else:\n            options['vf'] = f'scale=w={size[0]}:h={size[1]}:' \\\n                            'force_original_aspect_ratio=decrease'\n    else:\n        if not isinstance(ratio, tuple):\n            ratio = (ratio, ratio)\n        options['vf'] = f'scale=\"trunc(iw*{ratio[0]}):trunc(ih*{ratio[1]})\"'\n    convert_video(in_file, out_file, print_cmd, **options)\n\n\n@requires_executable('ffmpeg')\ndef cut_video(in_file: str,\n              out_file: str,\n              start: Optional[float] = None,\n              end: Optional[float] = None,\n              vcodec: Optional[str] = None,\n              acodec: Optional[str] = None,\n              log_level: str = 'info',\n              print_cmd: bool = False) -> None:\n    \"\"\"Cut a clip from a video.\n\n    Args:\n        in_file (str): Input video filename.\n        out_file (str): Output video filename.\n        start (None or float): Start time (in seconds).\n        end (None or float): End time (in seconds).\n        vcodec (None or str): Output video codec, None for unchanged.\n        acodec (None or str): Output 
audio codec, None for unchanged.\n        log_level (str): Logging level of ffmpeg.\n        print_cmd (bool): Whether to print the final ffmpeg command.\n    \"\"\"\n    options = {'log_level': log_level}\n    if vcodec is None:\n        options['vcodec'] = 'copy'\n    if acodec is None:\n        options['acodec'] = 'copy'\n    if start:\n        options['ss'] = start  # type: ignore\n    else:\n        start = 0\n    if end:\n        options['t'] = end - start  # type: ignore\n    convert_video(in_file, out_file, print_cmd, **options)\n\n\n@requires_executable('ffmpeg')\ndef concat_video(video_list: List,\n                 out_file: str,\n                 vcodec: Optional[str] = None,\n                 acodec: Optional[str] = None,\n                 log_level: str = 'info',\n                 print_cmd: bool = False) -> None:\n    \"\"\"Concatenate multiple videos into a single one.\n\n    Args:\n        video_list (list): A list of video filenames\n        out_file (str): Output video filename\n        vcodec (None or str): Output video codec, None for unchanged\n        acodec (None or str): Output audio codec, None for unchanged\n        log_level (str): Logging level of ffmpeg.\n        print_cmd (bool): Whether to print the final ffmpeg command.\n    \"\"\"\n    tmp_filehandler, tmp_filename = tempfile.mkstemp(suffix='.txt', text=True)\n    with open(tmp_filename, 'w') as f:\n        for filename in video_list:\n            f.write(f'file {osp.abspath(filename)}\\n')\n    options = {'log_level': log_level}\n    if vcodec is None:\n        options['vcodec'] = 'copy'\n    if acodec is None:\n        options['acodec'] = 'copy'\n    convert_video(\n        tmp_filename,\n        out_file,\n        print_cmd,\n        pre_options='-f concat -safe 0',\n        **options)\n    os.close(tmp_filehandler)\n    os.remove(tmp_filename)\n"
  },
  {
    "path": "mmcv/visualization/__init__.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom .color import Color, color_val\nfrom .image import imshow, imshow_bboxes, imshow_det_bboxes\nfrom .optflow import flow2rgb, flowshow, make_color_wheel\n\n__all__ = [\n    'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes',\n    'flowshow', 'flow2rgb', 'make_color_wheel'\n]\n"
  },
  {
    "path": "mmcv/visualization/color.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom enum import Enum\nfrom typing import Union\n\nimport numpy as np\nfrom mmengine.utils import is_str\n\n\nclass Color(Enum):\n    \"\"\"An enum that defines common colors.\n\n    Contains red, green, blue, cyan, yellow, magenta, white and black.\n    \"\"\"\n    red = (0, 0, 255)\n    green = (0, 255, 0)\n    blue = (255, 0, 0)\n    cyan = (255, 255, 0)\n    yellow = (0, 255, 255)\n    magenta = (255, 0, 255)\n    white = (255, 255, 255)\n    black = (0, 0, 0)\n\n\ndef color_val(color: Union[Color, str, tuple, int, np.ndarray]) -> tuple:\n    \"\"\"Convert various input to color tuples.\n\n    Args:\n        color (:obj:`Color`/str/tuple/int/ndarray): Color inputs\n\n    Returns:\n        tuple[int]: A tuple of 3 integers indicating BGR channels.\n    \"\"\"\n    if is_str(color):\n        return Color[color].value  # type: ignore\n    elif isinstance(color, Color):\n        return color.value\n    elif isinstance(color, tuple):\n        assert len(color) == 3\n        for channel in color:\n            assert 0 <= channel <= 255\n        return color\n    elif isinstance(color, int):\n        assert 0 <= color <= 255\n        return color, color, color\n    elif isinstance(color, np.ndarray):\n        assert color.ndim == 1 and color.size == 3\n        assert np.all((color >= 0) & (color <= 255))\n        color = color.astype(np.uint8)\n        return tuple(color)\n    else:\n        raise TypeError(f'Invalid type for color: {type(color)}')\n"
  },
  {
    "path": "mmcv/visualization/image.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import List, Optional, Union\n\nimport cv2\nimport numpy as np\n\nfrom mmcv.image import imread, imwrite\nfrom .color import Color, color_val\n\n# a type alias declares the optional types of color argument\nColorType = Union[Color, str, tuple, int, np.ndarray]\n\n\ndef imshow(img: Union[str, np.ndarray],\n           win_name: str = '',\n           wait_time: int = 0):\n    \"\"\"Show an image.\n\n    Args:\n        img (str or ndarray): The image to be displayed.\n        win_name (str): The window name.\n        wait_time (int): Value of waitKey param.\n    \"\"\"\n    cv2.imshow(win_name, imread(img))\n    if wait_time == 0:  # prevent from hanging if windows was closed\n        while True:\n            ret = cv2.waitKey(1)\n\n            closed = cv2.getWindowProperty(win_name, cv2.WND_PROP_VISIBLE) < 1\n            # if user closed window or if some key pressed\n            if closed or ret != -1:\n                break\n    else:\n        ret = cv2.waitKey(wait_time)\n\n\ndef imshow_bboxes(img: Union[str, np.ndarray],\n                  bboxes: Union[list, np.ndarray],\n                  colors: ColorType = 'green',\n                  top_k: int = -1,\n                  thickness: int = 1,\n                  show: bool = True,\n                  win_name: str = '',\n                  wait_time: int = 0,\n                  out_file: Optional[str] = None):\n    \"\"\"Draw bboxes on an image.\n\n    Args:\n        img (str or ndarray): The image to be displayed.\n        bboxes (list or ndarray): A list of ndarray of shape (k, 4).\n        colors (Color or str or tuple or int or ndarray): A list of colors.\n        top_k (int): Plot the first k bboxes only if set positive.\n        thickness (int): Thickness of lines.\n        show (bool): Whether to show the image.\n        win_name (str): The window name.\n        wait_time (int): Value of waitKey param.\n        out_file (str, 
optional): The filename to write the image.\n\n    Returns:\n        ndarray: The image with bboxes drawn on it.\n    \"\"\"\n    img = imread(img)\n    img = np.ascontiguousarray(img)\n\n    if isinstance(bboxes, np.ndarray):\n        bboxes = [bboxes]\n    if not isinstance(colors, list):\n        colors = [colors for _ in range(len(bboxes))]\n    colors = [color_val(c) for c in colors]\n    assert len(bboxes) == len(colors)\n\n    for i, _bboxes in enumerate(bboxes):\n        _bboxes = _bboxes.astype(np.int32)\n        if top_k <= 0:\n            _top_k = _bboxes.shape[0]\n        else:\n            _top_k = min(top_k, _bboxes.shape[0])\n        for j in range(_top_k):\n            left_top = (_bboxes[j, 0], _bboxes[j, 1])\n            right_bottom = (_bboxes[j, 2], _bboxes[j, 3])\n            cv2.rectangle(\n                img, left_top, right_bottom, colors[i], thickness=thickness)\n\n    if show:\n        imshow(img, win_name, wait_time)\n    if out_file is not None:\n        imwrite(img, out_file)\n    return img\n\n\ndef imshow_det_bboxes(img: Union[str, np.ndarray],\n                      bboxes: np.ndarray,\n                      labels: np.ndarray,\n                      class_names: Optional[List[str]] = None,\n                      score_thr: float = 0,\n                      bbox_color: ColorType = 'green',\n                      text_color: ColorType = 'green',\n                      thickness: int = 1,\n                      font_scale: float = 0.5,\n                      show: bool = True,\n                      win_name: str = '',\n                      wait_time: int = 0,\n                      out_file: Optional[str] = None):\n    \"\"\"Draw bboxes and class labels (with scores) on an image.\n\n    Args:\n        img (str or ndarray): The image to be displayed.\n        bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or\n            (n, 5).\n        labels (ndarray): Labels of bboxes.\n        class_names (list[str]): Names of 
each class.\n        score_thr (float): Minimum score of bboxes to be shown.\n        bbox_color (Color or str or tuple or int or ndarray): Color\n            of bbox lines.\n        text_color (Color or str or tuple or int or ndarray): Color\n            of texts.\n        thickness (int): Thickness of lines.\n        font_scale (float): Font scale of texts.\n        show (bool): Whether to show the image.\n        win_name (str): The window name.\n        wait_time (int): Value of waitKey param.\n        out_file (str or None): The filename to write the image.\n\n    Returns:\n        ndarray: The image with bboxes drawn on it.\n    \"\"\"\n    assert bboxes.ndim == 2\n    assert labels.ndim == 1\n    assert bboxes.shape[0] == labels.shape[0]\n    assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5\n    img = imread(img)\n    img = np.ascontiguousarray(img)\n\n    if score_thr > 0:\n        assert bboxes.shape[1] == 5\n        scores = bboxes[:, -1]\n        inds = scores > score_thr\n        bboxes = bboxes[inds, :]\n        labels = labels[inds]\n\n    bbox_color = color_val(bbox_color)\n    text_color = color_val(text_color)\n\n    for bbox, label in zip(bboxes, labels):\n        bbox_int = bbox.astype(np.int32)\n        left_top = (bbox_int[0], bbox_int[1])\n        right_bottom = (bbox_int[2], bbox_int[3])\n        cv2.rectangle(\n            img, left_top, right_bottom, bbox_color, thickness=thickness)\n        label_text = class_names[\n            label] if class_names is not None else f'cls {label}'\n        if len(bbox) > 4:\n            label_text += f'|{bbox[-1]:.02f}'\n        cv2.putText(img, label_text, (bbox_int[0], bbox_int[1] - 2),\n                    cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color)\n\n    if show:\n        imshow(img, win_name, wait_time)\n    if out_file is not None:\n        imwrite(img, out_file)\n    return img\n"
  },
  {
    "path": "mmcv/visualization/optflow.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom typing import Optional, Union\n\nimport numpy as np\n\nfrom mmcv.image import rgb2bgr\nfrom mmcv.video import flowread\nfrom .image import imshow\n\n\ndef flowshow(flow: Union[np.ndarray, str],\n             win_name: str = '',\n             wait_time: int = 0) -> None:\n    \"\"\"Show optical flow.\n\n    Args:\n        flow (ndarray or str): The optical flow to be displayed.\n        win_name (str): The window name.\n        wait_time (int): Value of waitKey param.\n    \"\"\"\n    flow = flowread(flow)\n    flow_img = flow2rgb(flow)\n    imshow(rgb2bgr(flow_img), win_name, wait_time)\n\n\ndef flow2rgb(flow: np.ndarray,\n             color_wheel: Optional[np.ndarray] = None,\n             unknown_thr: float = 1e6) -> np.ndarray:\n    \"\"\"Convert flow map to RGB image.\n\n    Args:\n        flow (ndarray): Array of optical flow.\n        color_wheel (ndarray or None): Color wheel used to map flow field to\n            RGB colorspace. 
Default color wheel will be used if not specified.\n        unknown_thr (float): Values above this threshold will be marked as\n            unknown and thus ignored.\n\n    Returns:\n        ndarray: RGB image that can be visualized.\n    \"\"\"\n    assert flow.ndim == 3 and flow.shape[-1] == 2\n    if color_wheel is None:\n        color_wheel = make_color_wheel()\n    assert color_wheel.ndim == 2 and color_wheel.shape[1] == 3\n    num_bins = color_wheel.shape[0]\n\n    dx = flow[:, :, 0].copy()\n    dy = flow[:, :, 1].copy()\n\n    ignore_inds = (\n        np.isnan(dx) | np.isnan(dy) | (np.abs(dx) > unknown_thr) |\n        (np.abs(dy) > unknown_thr))\n    dx[ignore_inds] = 0\n    dy[ignore_inds] = 0\n\n    rad = np.sqrt(dx**2 + dy**2)\n    if np.any(rad > np.finfo(float).eps):\n        max_rad = np.max(rad)\n        dx /= max_rad\n        dy /= max_rad\n\n    rad = np.sqrt(dx**2 + dy**2)\n    angle = np.arctan2(-dy, -dx) / np.pi\n\n    bin_real = (angle + 1) / 2 * (num_bins - 1)\n    bin_left = np.floor(bin_real).astype(int)\n    bin_right = (bin_left + 1) % num_bins\n    w = (bin_real - bin_left.astype(np.float32))[..., None]\n    flow_img = (1 -\n                w) * color_wheel[bin_left, :] + w * color_wheel[bin_right, :]\n    small_ind = rad <= 1\n    flow_img[small_ind] = 1 - rad[small_ind, None] * (1 - flow_img[small_ind])\n    flow_img[np.logical_not(small_ind)] *= 0.75\n\n    flow_img[ignore_inds, :] = 0\n\n    return flow_img\n\n\ndef make_color_wheel(bins: Optional[Union[list, tuple]] = None) -> np.ndarray:\n    \"\"\"Build a color wheel.\n\n    Args:\n        bins(list or tuple, optional): Specify the number of bins for each\n            color range, corresponding to six ranges: red -> yellow,\n            yellow -> green, green -> cyan, cyan -> blue, blue -> magenta,\n            magenta -> red. 
[15, 6, 4, 11, 13, 6] is used by default\n            (see Middlebury).\n\n    Returns:\n        ndarray: Color wheel of shape (total_bins, 3).\n    \"\"\"\n    if bins is None:\n        bins = [15, 6, 4, 11, 13, 6]\n    assert len(bins) == 6\n\n    RY, YG, GC, CB, BM, MR = tuple(bins)\n\n    ry = [1, np.arange(RY) / RY, 0]\n    yg = [1 - np.arange(YG) / YG, 1, 0]\n    gc = [0, 1, np.arange(GC) / GC]\n    cb = [0, 1 - np.arange(CB) / CB, 1]\n    bm = [np.arange(BM) / BM, 0, 1]\n    mr = [1, 0, 1 - np.arange(MR) / MR]\n\n    num_bins = RY + YG + GC + CB + BM + MR\n\n    color_wheel = np.zeros((3, num_bins), dtype=np.float32)\n\n    col = 0\n    for i, color in enumerate([ry, yg, gc, cb, bm, mr]):\n        for j in range(3):\n            color_wheel[j, col:col + bins[i]] = color[j]\n        col += bins[i]\n\n    return color_wheel.T\n"
  },
  {
    "path": "requirements/build.txt",
    "content": "pytest-runner\n"
  },
  {
    "path": "requirements/docs.txt",
    "content": "docutils==0.16.0\nmarkdown>=3.4.0\nmyst-parser\nopencv-python\n-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme\nsphinx==4.0.2\nsphinx-copybutton\nsphinx_markdown_tables>=0.0.16\ntorch\nurllib3<2.0.0\n"
  },
  {
    "path": "requirements/optional.txt",
    "content": "ninja\npsutil\n"
  },
  {
    "path": "requirements/runtime.txt",
    "content": "addict\nmmengine>=0.3.0\nnumpy\npackaging\nPillow\npyyaml\nregex;sys_platform=='win32'\nyapf\n"
  },
  {
    "path": "requirements/test.txt",
    "content": "coverage\nlmdb\nonnx\nonnxoptimizer\nonnxruntime\npytest\nPyTurboJPEG\nscipy\ntifffile\n"
  },
  {
    "path": "requirements.txt",
    "content": "-r requirements/build.txt\n-r requirements/optional.txt\n-r requirements/runtime.txt\n-r requirements/test.txt\n"
  },
  {
    "path": "setup.cfg",
    "content": "[bdist_wheel]\nuniversal=1\n\n[aliases]\ntest=pytest\n\n[yapf]\nbased_on_style = pep8\nblank_line_before_nested_class_or_def = true\nsplit_before_expression_after_opening_paren = true\n\n[isort]\nline_length = 79\nmulti_line_output = 0\nextra_standard_library = pkg_resources,setuptools,logging,os,warnings,abc\nknown_first_party = mmcv\nknown_third_party = addict,cv2,matplotlib,numpy,onnx,packaging,pytest,pytorch_sphinx_theme,scipy,sphinx,torch,torchvision,yaml,yapf\nno_lines_before = STDLIB,LOCALFOLDER\ndefault_section = THIRDPARTY\n\n# ignore-words-list needs to be lowercase format. For example, if we want to\n# ignore word \"BA\", then we need to append \"ba\" to ignore-words-list rather\n# than \"BA\"\n[codespell]\nquiet-level = 3\nignore-words-list = inout,hist,ba,ro,inh\n"
  },
  {
    "path": "setup.py",
    "content": "import glob\nimport os\nimport platform\nimport re\nfrom pkg_resources import DistributionNotFound, get_distribution, parse_version\nfrom setuptools import find_packages, setup\n\nEXT_TYPE = ''\ntry:\n    import torch\n    if torch.__version__ == 'parrots':\n        from parrots.utils.build_extension import BuildExtension\n        EXT_TYPE = 'parrots'\n    elif (hasattr(torch, 'is_mlu_available') and torch.is_mlu_available()) or \\\n            os.getenv('FORCE_MLU', '0') == '1':\n        from torch_mlu.utils.cpp_extension import BuildExtension\n        EXT_TYPE = 'pytorch'\n    elif (hasattr(torch, 'is_musa_available') and torch.is_musa_available()) \\\n            or os.getenv('FORCE_MUSA', '0') == '1':\n        from torch_musa.utils.musa_extension import BuildExtension\n        EXT_TYPE = 'pytorch'\n    else:\n        from torch.utils.cpp_extension import BuildExtension\n        EXT_TYPE = 'pytorch'\n    cmd_class = {'build_ext': BuildExtension}\nexcept ModuleNotFoundError:\n    cmd_class = {}\n    print('Skip building ext ops due to the absence of torch.')\n\n\ndef choose_requirement(primary, secondary):\n    \"\"\"If some version of primary requirement installed, return primary, else\n    return secondary.\"\"\"\n    try:\n        name = re.split(r'[!<>=]', primary)[0]\n        get_distribution(name)\n    except DistributionNotFound:\n        return secondary\n\n    return str(primary)\n\n\ndef get_version():\n    version_file = 'mmcv/version.py'\n    with open(version_file, encoding='utf-8') as f:\n        exec(compile(f.read(), version_file, 'exec'))\n    return locals()['__version__']\n\n\ndef parse_requirements(fname='requirements/runtime.txt', with_version=True):\n    \"\"\"Parse the package dependencies listed in a requirements file but strips\n    specific versioning information.\n\n    Args:\n        fname (str): path to requirements file\n        with_version (bool, default=False): if True include version specs\n\n    Returns:\n        
List[str]: list of requirements items\n\n    CommandLine:\n        python -c \"import setup; print(setup.parse_requirements())\"\n    \"\"\"\n    import sys\n    from os.path import exists\n    require_fpath = fname\n\n    def parse_line(line):\n        \"\"\"Parse information from a line in a requirements text file.\"\"\"\n        if line.startswith('-r '):\n            # Allow specifying requirements in other files\n            target = line.split(' ')[1]\n            for info in parse_require_file(target):\n                yield info\n        else:\n            info = {'line': line}\n            if line.startswith('-e '):\n                info['package'] = line.split('#egg=')[1]\n            else:\n                # Remove versioning from the package\n                pat = '(' + '|'.join(['>=', '==', '>']) + ')'\n                parts = re.split(pat, line, maxsplit=1)\n                parts = [p.strip() for p in parts]\n\n                info['package'] = parts[0]\n                if len(parts) > 1:\n                    op, rest = parts[1:]\n                    if ';' in rest:\n                        # Handle platform specific dependencies\n                        # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies\n                        version, platform_deps = map(str.strip,\n                                                     rest.split(';'))\n                        info['platform_deps'] = platform_deps\n                    else:\n                        version = rest  # NOQA\n                    info['version'] = (op, version)\n            yield info\n\n    def parse_require_file(fpath):\n        with open(fpath) as f:\n            for line in f.readlines():\n                line = line.strip()\n                if line and not line.startswith('#'):\n                    yield from parse_line(line)\n\n    def gen_packages_items():\n        if exists(require_fpath):\n            for info in 
parse_require_file(require_fpath):\n                parts = [info['package']]\n                if with_version and 'version' in info:\n                    parts.extend(info['version'])\n                if not sys.version.startswith('3.4'):\n                    # apparently package_deps are broken in 3.4\n                    platform_deps = info.get('platform_deps')\n                    if platform_deps is not None:\n                        parts.append(';' + platform_deps)\n                item = ''.join(parts)\n                yield item\n\n    packages = list(gen_packages_items())\n    return packages\n\n\ninstall_requires = parse_requirements()\n\ntry:\n    # OpenCV installed via conda.\n    import cv2  # NOQA: F401\n    major, minor, *rest = cv2.__version__.split('.')\n    if int(major) < 3:\n        raise RuntimeError(\n            f'OpenCV >=3 is required but {cv2.__version__} is installed')\nexcept ImportError:\n    # If first not installed install second package\n    CHOOSE_INSTALL_REQUIRES = [('opencv-python-headless>=3',\n                                'opencv-python>=3')]\n    for main, secondary in CHOOSE_INSTALL_REQUIRES:\n        install_requires.append(choose_requirement(main, secondary))\n\n\ndef get_extensions():\n    extensions = []\n\n    if os.getenv('MMCV_WITH_OPS', '1') == '0':\n        return extensions\n\n    if EXT_TYPE == 'parrots':\n        ext_name = 'mmcv._ext'\n        from parrots.utils.build_extension import Extension\n\n        # new parrots op impl do not use MMCV_USE_PARROTS\n        # define_macros = [('MMCV_USE_PARROTS', None)]\n        define_macros = []\n        include_dirs = []\n        op_files = glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') +\\\n            glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') +\\\n            glob.glob('./mmcv/ops/csrc/parrots/*.cpp')\n        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda'))\n        
op_files.remove('./mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu')\n        op_files.remove('./mmcv/ops/csrc/pytorch/cpu/bbox_overlaps_cpu.cpp')\n        op_files.remove('./mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu')\n        cuda_args = os.getenv('MMCV_CUDA_ARGS')\n        extra_compile_args = {\n            'nvcc': [cuda_args, '-std=c++14'] if cuda_args else ['-std=c++14'],\n            'cxx': ['-std=c++14'],\n        }\n        if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':\n            define_macros += [('MMCV_WITH_CUDA', None)]\n            extra_compile_args['nvcc'] += [\n                '-D__CUDA_NO_HALF_OPERATORS__',\n                '-D__CUDA_NO_HALF_CONVERSIONS__',\n                '-D__CUDA_NO_HALF2_OPERATORS__',\n            ]\n        ext_ops = Extension(\n            name=ext_name,\n            sources=op_files,\n            include_dirs=include_dirs,\n            define_macros=define_macros,\n            extra_compile_args=extra_compile_args,\n            cuda=True,\n            pytorch=True)\n        extensions.append(ext_ops)\n    elif EXT_TYPE == 'pytorch':\n        ext_name = 'mmcv._ext'\n        from torch.utils.cpp_extension import CppExtension, CUDAExtension\n\n        # prevent ninja from using too many resources\n        try:\n            import psutil\n            num_cpu = len(psutil.Process().cpu_affinity())\n            cpu_use = max(4, num_cpu - 1)\n        except (ModuleNotFoundError, AttributeError):\n            cpu_use = 4\n\n        os.environ.setdefault('MAX_JOBS', str(cpu_use))\n        define_macros = []\n\n        # Before PyTorch1.8.0, when compiling CUDA code, `cxx` is a\n        # required key passed to PyTorch. 
Even if there is no flag passed\n        # to cxx, users also need to pass an empty list to PyTorch.\n        # Since PyTorch1.8.0, it has a default value so users do not need\n        # to pass an empty list anymore.\n        # More details at https://github.com/pytorch/pytorch/pull/45956\n        extra_compile_args = {'cxx': []}\n\n        if platform.system() != 'Windows':\n            if parse_version(torch.__version__) <= parse_version('1.12.1'):\n                extra_compile_args['cxx'] = ['-std=c++14']\n            else:\n                extra_compile_args['cxx'] = ['-std=c++17']\n        else:\n            if parse_version(torch.__version__) <= parse_version('1.12.1'):\n                extra_compile_args['cxx'] = ['/std:c++14']\n            else:\n                extra_compile_args['cxx'] = ['/std:c++17']\n\n        include_dirs = []\n        library_dirs = []\n        libraries = []\n\n        extra_objects = []\n        extra_link_args = []\n        is_rocm_pytorch = False\n        try:\n            from torch.utils.cpp_extension import ROCM_HOME\n            is_rocm_pytorch = True if ((torch.version.hip is not None) and\n                                       (ROCM_HOME is not None)) else False\n        except ImportError:\n            pass\n\n        if os.getenv('MMCV_WITH_DIOPI', '0') == '1':\n            import mmengine  # NOQA: F401\n            from mmengine.utils.version_utils import digit_version\n            assert digit_version(mmengine.__version__) >= digit_version(\n                '0.7.4'), f'mmengine >= 0.7.4 is required \\\n                but {mmengine.__version__} is installed'\n\n            print(f'Compiling {ext_name} with CPU and DIPU')\n            define_macros += [('MMCV_WITH_DIOPI', None)]\n            define_macros += [('DIOPI_ATTR_WEAK', None)]\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')\n            extension = CppExtension\n           
 include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n            dipu_root = os.getenv('DIPU_ROOT')\n            diopi_path = os.getenv('DIOPI_PATH')\n            dipu_path = os.getenv('DIPU_PATH')\n            vendor_include_dirs = os.getenv('VENDOR_INCLUDE_DIRS')\n            nccl_include_dirs = os.getenv('NCCL_INCLUDE_DIRS')\n            pytorch_dir = os.getenv('PYTORCH_DIR')\n            include_dirs.append(dipu_root)\n            include_dirs.append(diopi_path + '/include')\n            include_dirs.append(dipu_path + '/dist/include')\n            include_dirs.append(vendor_include_dirs)\n            include_dirs.append(pytorch_dir + 'torch/include')\n            if nccl_include_dirs:\n                include_dirs.append(nccl_include_dirs)\n            library_dirs += [dipu_root]\n            libraries += ['torch_dipu']\n        elif is_rocm_pytorch or torch.cuda.is_available() or os.getenv(\n                'FORCE_CUDA', '0') == '1':\n            if is_rocm_pytorch:\n                define_macros += [('MMCV_WITH_HIP', None)]\n            define_macros += [('MMCV_WITH_CUDA', None)]\n            cuda_args = os.getenv('MMCV_CUDA_ARGS')\n            extra_compile_args['nvcc'] = [cuda_args] if cuda_args else []\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cpp')\n            extension = CUDAExtension\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/pytorch'))\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda'))\n        elif (hasattr(torch, 'is_mlu_available') and\n                torch.is_mlu_available()) or \\\n                os.getenv('FORCE_MLU', '0') == '1':\n            from torch_mlu.utils.cpp_extension 
import MLUExtension\n\n            def get_mluops_version(file_path):\n                with open(file_path) as f:\n                    for line in f:\n                        if re.search('MLUOP_MAJOR', line):\n                            major = line.strip().split(' ')[2]\n                        if re.search('MLUOP_MINOR', line):\n                            minor = line.strip().split(' ')[2]\n                        if re.search('MLUOP_PATCHLEVEL', line):\n                            patchlevel = line.strip().split(' ')[2]\n                mluops_version = f'v{major}.{minor}.{patchlevel}'\n                return mluops_version\n\n            mmcv_mluops_version = get_mluops_version(\n                './mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h')\n            mlu_ops_path = os.getenv('MMCV_MLU_OPS_PATH')\n            if mlu_ops_path:\n                exists_mluops_version = get_mluops_version(\n                    mlu_ops_path + '/bangc-ops/mlu_op.h')\n                if exists_mluops_version != mmcv_mluops_version:\n                    print('the version of mlu-ops provided is %s,'\n                          ' while %s is needed.' 
%\n                          (exists_mluops_version, mmcv_mluops_version))\n                    exit()\n                try:\n                    if os.path.exists('mlu-ops'):\n                        if os.path.islink('mlu-ops'):\n                            os.remove('mlu-ops')\n                            os.symlink(mlu_ops_path, 'mlu-ops')\n                        elif os.path.abspath('mlu-ops') != mlu_ops_path:\n                            os.symlink(mlu_ops_path, 'mlu-ops')\n                    else:\n                        os.symlink(mlu_ops_path, 'mlu-ops')\n                except Exception:\n                    raise FileExistsError(\n                        'mlu-ops already exists, please move it out,'\n                        'or rename or remove it.')\n            else:\n                if not os.path.exists('mlu-ops'):\n                    import requests\n                    mluops_url = 'https://github.com/Cambricon/mlu-ops/' + \\\n                        'archive/refs/tags/' + mmcv_mluops_version + '.zip'\n                    req = requests.get(mluops_url)\n                    with open('./mlu-ops.zip', 'wb') as f:\n                        try:\n                            f.write(req.content)\n                        except Exception:\n                            raise ImportError('failed to download mlu-ops')\n\n                    from zipfile import BadZipFile, ZipFile\n                    with ZipFile('./mlu-ops.zip', 'r') as archive:\n                        try:\n                            archive.extractall()\n                            dir_name = archive.namelist()[0].split('/')[0]\n                            os.rename(dir_name, 'mlu-ops')\n                        except BadZipFile:\n                            print('invalid mlu-ops.zip file')\n                else:\n                    exists_mluops_version = get_mluops_version(\n                        './mlu-ops/bangc-ops/mlu_op.h')\n                    if exists_mluops_version != 
mmcv_mluops_version:\n                        print('the version of provided mlu-ops is %s,'\n                              ' while %s is needed.' %\n                              (exists_mluops_version, mmcv_mluops_version))\n                        exit()\n\n            define_macros += [('MMCV_WITH_MLU', None)]\n            mlu_args = os.getenv('MMCV_MLU_ARGS', '-DNDEBUG ')\n            mluops_includes = []\n            mluops_includes.append('-I' +\n                                   os.path.abspath('./mlu-ops/bangc-ops'))\n            mluops_includes.append(\n                '-I' + os.path.abspath('./mlu-ops/bangc-ops/kernels'))\n            extra_compile_args['cncc'] = [mlu_args] + \\\n                mluops_includes if mlu_args else mluops_includes\n            extra_compile_args['cxx'] += ['-fno-gnu-unique']\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/common/mlu/*.mlu') + \\\n                glob.glob(\n                    './mlu-ops/bangc-ops/core/**/*.cpp', recursive=True) + \\\n                glob.glob(\n                    './mlu-ops/bangc-ops/kernels/**/*.cpp', recursive=True) + \\\n                glob.glob(\n                    './mlu-ops/bangc-ops/kernels/**/*.mlu', recursive=True)\n            extra_link_args = [\n                '-Wl,--whole-archive',\n                './mlu-ops/bangc-ops/kernels/kernel_wrapper/lib/libextops.a',\n                '-Wl,--no-whole-archive'\n            ]\n            extension = MLUExtension\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mlu'))\n            include_dirs.append(os.path.abspath('./mlu-ops/bangc-ops'))\n        elif (hasattr(torch.backends, 'mps')\n              and 
torch.backends.mps.is_available()) or os.getenv(\n                  'FORCE_MPS', '0') == '1':\n            # objc compiler support\n            from distutils.unixccompiler import UnixCCompiler\n            if '.mm' not in UnixCCompiler.src_extensions:\n                UnixCCompiler.src_extensions.append('.mm')\n                UnixCCompiler.language_map['.mm'] = 'objc'\n\n            define_macros += [('MMCV_WITH_MPS', None)]\n            extra_compile_args = {}\n            extra_compile_args['cxx'] = ['-Wall', '-std=c++17']\n            extra_compile_args['cxx'] += [\n                '-framework', 'Metal', '-framework', 'Foundation'\n            ]\n            extra_compile_args['cxx'] += ['-ObjC++']\n            # src\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')\n            # TODO: support mps ops on torch>=2.1.0\n            if parse_version(torch.__version__) < parse_version('2.1.0'):\n                op_files += glob.glob('./mmcv/ops/csrc/common/mps/*.mm') + \\\n                    glob.glob('./mmcv/ops/csrc/pytorch/mps/*.mm')\n            extension = CppExtension\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mps'))\n        elif (os.getenv('FORCE_NPU', '0') == '1'):\n            print(f'Compiling {ext_name} only with CPU and NPU')\n            try:\n                import importlib\n\n                from torch_npu.utils.cpp_extension import NpuExtension\n                extra_compile_args['cxx'] += [\n                    '-D__FILENAME__=\\\"$$(notdir $$(abspath $$<))\\\"'\n                ]\n                extra_compile_args['cxx'] += [\n                    '-I' + importlib.util.find_spec(\n                        'torch_npu').submodule_search_locations[0] +\n                    '/include/third_party/acl/inc'\n                ]\n                
extra_compile_args['cxx'] += [\n                    '-I' + importlib.util.find_spec(\n                        'torch_npu').submodule_search_locations[0] +\n                    '/include/third_party/hccl/inc'\n                ]\n                define_macros += [('MMCV_WITH_NPU', None)]\n                extension = NpuExtension\n                if parse_version(torch.__version__) < parse_version('2.1.0'):\n                    define_macros += [('MMCV_WITH_XLA', None)]\n                if parse_version(torch.__version__) >= parse_version('2.1.0'):\n                    define_macros += [('MMCV_WITH_KPRIVATE', None)]\n            except Exception:\n                raise ImportError('can not find any torch_npu')\n            # src\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/common/npu/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/npu/*.cpp')\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/npu'))\n        elif hasattr(torch, 'musa') or os.getenv('FORCE_MUSA', '0') == '1':\n            from torch_musa.testing import get_musa_arch\n            from torch_musa.utils.musa_extension import MUSAExtension\n            define_macros += [('MMCV_WITH_MUSA', None),\n                              ('MUSA_ARCH', str(get_musa_arch()))]\n            os.environ['MUSA_ARCH'] = str(get_musa_arch())\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/musa/*.mu') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/musa/*.cpp')\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/pytorch'))\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n            
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/musa'))\n            extension = MUSAExtension\n        else:\n            print(f'Compiling {ext_name} only with CPU')\n            op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \\\n                glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp')\n            extension = CppExtension\n            include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))\n\n        # Since the PR (https://github.com/open-mmlab/mmcv/pull/1463) uses\n        # c++14 features, the argument ['std=c++14'] must be added here.\n        # However, in the windows environment, some standard libraries\n        # will depend on c++17 or higher. In fact, for the windows\n        # environment, the compiler will choose the appropriate compiler\n        # to compile those cpp files, so there is no need to add the\n        # argument\n        if 'nvcc' in extra_compile_args and platform.system() != 'Windows':\n            if parse_version(torch.__version__) <= parse_version('1.12.1'):\n                extra_compile_args['nvcc'] += ['-std=c++14']\n            else:\n                extra_compile_args['nvcc'] += ['-std=c++17']\n\n        ext_ops = extension(\n            name=ext_name,\n            sources=op_files,\n            include_dirs=include_dirs,\n            define_macros=define_macros,\n            extra_objects=extra_objects,\n            extra_compile_args=extra_compile_args,\n            library_dirs=library_dirs,\n            libraries=libraries,\n            extra_link_args=extra_link_args)\n        extensions.append(ext_ops)\n    return extensions\n\n\nsetup(\n    name='mmcv' if os.getenv('MMCV_WITH_OPS', '1') == '1' else 'mmcv-lite',\n    version=get_version(),\n    description='OpenMMLab Computer Vision Foundation',\n    keywords='computer vision',\n    packages=find_packages(),\n    include_package_data=True,\n    classifiers=[\n        'Development Status :: 4 - Beta',\n        'License :: OSI Approved :: 
Apache Software License',\n        'Operating System :: OS Independent',\n        'Programming Language :: Python :: 3',\n        'Programming Language :: Python :: 3.7',\n        'Programming Language :: Python :: 3.8',\n        'Programming Language :: Python :: 3.9',\n        'Programming Language :: Python :: 3.10',\n        'Topic :: Utilities',\n    ],\n    url='https://github.com/open-mmlab/mmcv',\n    author='MMCV Contributors',\n    author_email='openmmlab@gmail.com',\n    install_requires=install_requires,\n    extras_require={\n        'all': parse_requirements('requirements.txt'),\n        'tests': parse_requirements('requirements/test.txt'),\n        'build': parse_requirements('requirements/build.txt'),\n        'optional': parse_requirements('requirements/optional.txt'),\n    },\n    python_requires='>=3.7',\n    ext_modules=get_extensions(),\n    cmdclass=cmd_class,\n    zip_safe=False)\n"
  },
  {
    "path": "tests/test_arraymisc.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\nimport numpy as np\nimport pytest\n\nimport mmcv\n\n\ndef test_quantize():\n    arr = np.random.randn(10, 10)\n    levels = 20\n\n    qarr = mmcv.quantize(arr, -1, 1, levels)\n    assert qarr.shape == arr.shape\n    assert qarr.dtype == np.dtype('int64')\n    for i in range(arr.shape[0]):\n        for j in range(arr.shape[1]):\n            ref = min(levels - 1,\n                      int(np.floor(10 * (1 + max(min(arr[i, j], 1), -1)))))\n            assert qarr[i, j] == ref\n\n    qarr = mmcv.quantize(arr, -1, 1, 20, dtype=np.uint8)\n    assert qarr.shape == arr.shape\n    assert qarr.dtype == np.dtype('uint8')\n\n    with pytest.raises(ValueError):\n        mmcv.quantize(arr, -1, 1, levels=0)\n    with pytest.raises(ValueError):\n        mmcv.quantize(arr, -1, 1, levels=10.0)\n    with pytest.raises(ValueError):\n        mmcv.quantize(arr, 2, 1, levels)\n\n\ndef test_dequantize():\n    levels = 20\n    qarr = np.random.randint(levels, size=(10, 10))\n\n    arr = mmcv.dequantize(qarr, -1, 1, levels)\n    assert arr.shape == qarr.shape\n    assert arr.dtype == np.dtype('float64')\n    for i in range(qarr.shape[0]):\n        for j in range(qarr.shape[1]):\n            assert arr[i, j] == (qarr[i, j] + 0.5) / 10 - 1\n\n    arr = mmcv.dequantize(qarr, -1, 1, levels, dtype=np.float32)\n    assert arr.shape == qarr.shape\n    assert arr.dtype == np.dtype('float32')\n\n    with pytest.raises(ValueError):\n        mmcv.dequantize(arr, -1, 1, levels=0)\n    with pytest.raises(ValueError):\n        mmcv.dequantize(arr, -1, 1, levels=10.0)\n    with pytest.raises(ValueError):\n        mmcv.dequantize(arr, 2, 1, levels)\n\n\ndef test_joint():\n    arr = np.random.randn(100, 100)\n    levels = 1000\n    qarr = mmcv.quantize(arr, -1, 1, levels)\n    recover = mmcv.dequantize(qarr, -1, 1, levels)\n    assert np.abs(recover[arr < -1] + 0.999).max() < 1e-6\n    assert np.abs(recover[arr > 1] - 0.999).max() < 1e-6\n  
  assert np.abs((recover - arr)[(arr >= -1) & (arr <= 1)]).max() <= 1e-3\n\n    arr = np.clip(np.random.randn(100) / 1000, -0.01, 0.01)\n    levels = 99\n    qarr = mmcv.quantize(arr, -1, 1, levels)\n    recover = mmcv.dequantize(qarr, -1, 1, levels)\n    assert np.all(recover == 0)\n"
  },
  {
    "path": "tests/test_cnn/test_build_layers.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport inspect\nfrom importlib import import_module\n\nimport numpy as np\nimport pytest\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\nfrom mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm\nfrom torch.nn import ReflectionPad2d, Upsample\n\nfrom mmcv.cnn.bricks import (ContextBlock, ConvModule, ConvTranspose2d,\n                             GeneralizedAttention, NonLocal2d,\n                             build_activation_layer, build_conv_layer,\n                             build_norm_layer, build_padding_layer,\n                             build_plugin_layer, build_upsample_layer, is_norm)\nfrom mmcv.cnn.bricks.activation import Clamp\nfrom mmcv.cnn.bricks.norm import infer_abbr as infer_norm_abbr\nfrom mmcv.cnn.bricks.plugin import infer_abbr as infer_plugin_abbr\nfrom mmcv.cnn.bricks.upsample import PixelShufflePack\n\n\ndef test_build_conv_layer():\n    with pytest.raises(TypeError):\n        # cfg must be a dict\n        cfg = 'Conv2d'\n        build_conv_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # `type` must be in cfg\n        cfg = dict(kernel_size=3)\n        build_conv_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # unsupported conv type\n        cfg = dict(type='FancyConv')\n        build_conv_layer(cfg)\n\n    kwargs = dict(\n        in_channels=4, out_channels=8, kernel_size=3, groups=2, dilation=2)\n    cfg = None\n    layer = build_conv_layer(cfg, **kwargs)\n    assert isinstance(layer, nn.Conv2d)\n    assert layer.in_channels == kwargs['in_channels']\n    assert layer.out_channels == kwargs['out_channels']\n    assert layer.kernel_size == (kwargs['kernel_size'], kwargs['kernel_size'])\n    assert layer.groups == kwargs['groups']\n    assert layer.dilation == (kwargs['dilation'], kwargs['dilation'])\n\n    cfg = dict(type='Conv')\n    layer = build_conv_layer(cfg, **kwargs)\n    assert isinstance(layer, nn.Conv2d)\n    
assert layer.in_channels == kwargs['in_channels']\n    assert layer.out_channels == kwargs['out_channels']\n    assert layer.kernel_size == (kwargs['kernel_size'], kwargs['kernel_size'])\n    assert layer.groups == kwargs['groups']\n    assert layer.dilation == (kwargs['dilation'], kwargs['dilation'])\n\n    cfg = dict(type='deconv')\n    layer = build_conv_layer(cfg, **kwargs)\n    assert isinstance(layer, nn.ConvTranspose2d)\n    assert layer.in_channels == kwargs['in_channels']\n    assert layer.out_channels == kwargs['out_channels']\n    assert layer.kernel_size == (kwargs['kernel_size'], kwargs['kernel_size'])\n    assert layer.groups == kwargs['groups']\n    assert layer.dilation == (kwargs['dilation'], kwargs['dilation'])\n\n    # sparse convs cannot support the case when groups>1\n    kwargs.pop('groups')\n\n    for type_name, module in MODELS.module_dict.items():\n        for type_name_ in (type_name, module):\n            cfg = dict(type=type_name_)\n            # SparseInverseConv2d and SparseInverseConv3d do not have the\n            # argument 'dilation'\n            if type_name == 'SparseInverseConv2d' or type_name == \\\n                    'SparseInverseConv3d':\n                kwargs.pop('dilation')\n            if 'conv' in type_name.lower():\n                layer = build_conv_layer(cfg, **kwargs)\n                assert isinstance(layer, module)\n                assert layer.in_channels == kwargs['in_channels']\n                assert layer.out_channels == kwargs['out_channels']\n                kwargs['dilation'] = 2  # recover the key\n\n\ndef test_infer_norm_abbr():\n    with pytest.raises(TypeError):\n        # class_type must be a class\n        infer_norm_abbr(0)\n\n    class MyNorm:\n\n        _abbr_ = 'mn'\n\n    assert infer_norm_abbr(MyNorm) == 'mn'\n\n    class FancyBatchNorm:\n        pass\n\n    assert infer_norm_abbr(FancyBatchNorm) == 'bn'\n\n    class FancyInstanceNorm:\n        pass\n\n    assert 
infer_norm_abbr(FancyInstanceNorm) == 'in'\n\n    class FancyLayerNorm:\n        pass\n\n    assert infer_norm_abbr(FancyLayerNorm) == 'ln'\n\n    class FancyGroupNorm:\n        pass\n\n    assert infer_norm_abbr(FancyGroupNorm) == 'gn'\n\n    class FancyNorm:\n        pass\n\n    assert infer_norm_abbr(FancyNorm) == 'norm_layer'\n\n\ndef test_build_norm_layer():\n    with pytest.raises(TypeError):\n        # cfg must be a dict\n        cfg = 'BN'\n        build_norm_layer(cfg, 3)\n\n    with pytest.raises(KeyError):\n        # `type` must be in cfg\n        cfg = dict()\n        build_norm_layer(cfg, 3)\n\n    with pytest.raises(KeyError):\n        # unsupported norm type\n        cfg = dict(type='FancyNorm')\n        build_norm_layer(cfg, 3)\n\n    with pytest.raises(AssertionError):\n        # postfix must be int or str\n        cfg = dict(type='BN')\n        build_norm_layer(cfg, 3, postfix=[1, 2])\n\n    with pytest.raises(AssertionError):\n        # `num_groups` must be in cfg when using 'GN'\n        cfg = dict(type='GN')\n        build_norm_layer(cfg, 3)\n\n    # test each type of norm layer in norm_cfg\n    abbr_mapping = {\n        'BN': 'bn',\n        'BN1d': 'bn',\n        'BN2d': 'bn',\n        'BN3d': 'bn',\n        'SyncBN': 'bn',\n        'GN': 'gn',\n        'LN': 'ln',\n        'IN': 'in',\n        'IN1d': 'in',\n        'IN2d': 'in',\n        'IN3d': 'in',\n    }\n    for type_name, module in MODELS.module_dict.items():\n        if type_name not in abbr_mapping:\n            continue\n        if type_name == 'MMSyncBN':  # skip MMSyncBN\n            continue\n        for postfix in ['_test', 1]:\n            for type_name_ in (type_name, module):\n                cfg = dict(type=type_name_)\n                if type_name == 'GN':\n                    cfg['num_groups'] = 3\n                name, layer = build_norm_layer(cfg, 3, postfix=postfix)\n                assert name == abbr_mapping[type_name] + str(postfix)\n                assert 
isinstance(layer, module)\n                if type_name == 'GN':\n                    assert layer.num_channels == 3\n                    assert layer.num_groups == cfg['num_groups']\n                elif type_name != 'LN':\n                    assert layer.num_features == 3\n\n\ndef test_build_activation_layer():\n    act_names = [\n        'ReLU', 'LeakyReLU', 'PReLU', 'RReLU', 'ReLU6', 'ELU', 'Sigmoid',\n        'Tanh'\n    ]\n\n    for module_name in ['activation', 'hsigmoid', 'hswish', 'swish']:\n        act_module = import_module(f'mmcv.cnn.bricks.{module_name}')\n        for key, value in act_module.__dict__.items():\n            if inspect.isclass(value) and issubclass(value, nn.Module):\n                act_names.append(key)\n\n    with pytest.raises(TypeError):\n        # cfg must be a dict\n        cfg = 'ReLU'\n        build_activation_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # `type` must be in cfg\n        cfg = dict()\n        build_activation_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # unsupported activation type\n        cfg = dict(type='FancyReLU')\n        build_activation_layer(cfg)\n\n    # test each type of activation layer in activation_cfg\n    for type_name, module in MODELS.module_dict.items():\n        if type_name in act_names:\n            cfg['type'] = type_name\n            layer = build_activation_layer(cfg)\n            assert isinstance(layer, module)\n\n    # sanity check for Clamp\n    for type_name in ('Clamp', Clamp):\n        act = build_activation_layer(dict(type='Clamp'))\n        x = torch.randn(10) * 1000\n        y = act(x)\n        assert np.logical_and((y >= -1).numpy(), (y <= 1).numpy()).all()\n\n    act = build_activation_layer(dict(type='Clip', min=0))\n    y = act(x)\n    assert np.logical_and((y >= 0).numpy(), (y <= 1).numpy()).all()\n    act = build_activation_layer(dict(type='Clamp', max=0))\n    y = act(x)\n    assert np.logical_and((y >= -1).numpy(), (y <= 
0).numpy()).all()\n\n\ndef test_build_padding_layer():\n    pad_names = ['zero', 'reflect', 'replicate']\n    for module_name in ['padding']:\n        pad_module = import_module(f'mmcv.cnn.bricks.{module_name}')\n        for key, value in pad_module.__dict__.items():\n            if inspect.isclass(value) and issubclass(value, nn.Module):\n                pad_names.append(key)\n\n    with pytest.raises(TypeError):\n        # cfg must be a dict\n        cfg = 'reflect'\n        build_padding_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # `type` must be in cfg\n        cfg = dict()\n        build_padding_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # unsupported activation type\n        cfg = dict(type='FancyPad')\n        build_padding_layer(cfg)\n\n    for type_name, module in MODELS.module_dict.items():\n        if type_name in pad_names:\n            cfg['type'] = type_name\n            layer = build_padding_layer(cfg, 2)\n            assert isinstance(layer, module)\n    for type_name in (ReflectionPad2d, 'reflect'):\n        input_x = torch.randn(1, 2, 5, 5)\n        cfg = dict(type=type_name)\n        padding_layer = build_padding_layer(cfg, 2)\n        res = padding_layer(input_x)\n        assert res.shape == (1, 2, 9, 9)\n\n\ndef test_upsample_layer():\n    with pytest.raises(TypeError):\n        # cfg must be a dict\n        cfg = 'bilinear'\n        build_upsample_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # `type` must be in cfg\n        cfg = dict()\n        build_upsample_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # unsupported activation type\n        cfg = dict(type='FancyUpsample')\n        build_upsample_layer(cfg)\n\n    for type_name in ['nearest', 'bilinear']:\n        cfg['type'] = type_name\n        layer = build_upsample_layer(cfg)\n        assert isinstance(layer, nn.Upsample)\n        assert layer.mode == type_name\n\n    cfg = dict()\n    cfg['type'] = Upsample\n    layer_from_cls = 
build_upsample_layer(cfg)\n    assert isinstance(layer_from_cls, nn.Upsample)\n    assert layer_from_cls.mode == 'nearest'\n\n    cfg = dict(\n        type='deconv', in_channels=3, out_channels=3, kernel_size=3, stride=2)\n    layer = build_upsample_layer(cfg)\n    assert isinstance(layer, nn.ConvTranspose2d)\n\n    for type_name in ('deconv', ConvTranspose2d):\n        cfg = dict(type=ConvTranspose2d)\n        kwargs = dict(in_channels=3, out_channels=3, kernel_size=3, stride=2)\n        layer = build_upsample_layer(cfg, **kwargs)\n        assert isinstance(layer, nn.ConvTranspose2d)\n        assert layer.in_channels == kwargs['in_channels']\n        assert layer.out_channels == kwargs['out_channels']\n        assert layer.kernel_size == (kwargs['kernel_size'],\n                                     kwargs['kernel_size'])\n        assert layer.stride == (kwargs['stride'], kwargs['stride'])\n\n        layer = build_upsample_layer(cfg, 3, 3, 3, 2)\n        assert isinstance(layer, nn.ConvTranspose2d)\n        assert layer.in_channels == kwargs['in_channels']\n        assert layer.out_channels == kwargs['out_channels']\n        assert layer.kernel_size == (kwargs['kernel_size'],\n                                     kwargs['kernel_size'])\n        assert layer.stride == (kwargs['stride'], kwargs['stride'])\n\n    for type_name in ('pixel_shuffle', PixelShufflePack):\n        cfg = dict(\n            type=type_name,\n            in_channels=3,\n            out_channels=3,\n            scale_factor=2,\n            upsample_kernel=3)\n        layer = build_upsample_layer(cfg)\n\n        assert isinstance(layer, PixelShufflePack)\n        assert layer.scale_factor == 2\n        assert layer.upsample_kernel == 3\n\n\ndef test_pixel_shuffle_pack():\n    x_in = torch.rand(2, 3, 10, 10)\n    pixel_shuffle = PixelShufflePack(3, 3, scale_factor=2, upsample_kernel=3)\n    assert pixel_shuffle.upsample_conv.kernel_size == (3, 3)\n    x_out = pixel_shuffle(x_in)\n    assert 
x_out.shape == (2, 3, 20, 20)\n\n\ndef test_is_norm():\n    norm_set1 = [\n        nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.InstanceNorm1d,\n        nn.InstanceNorm2d, nn.InstanceNorm3d, nn.LayerNorm\n    ]\n    norm_set2 = [nn.GroupNorm]\n    for norm_type in norm_set1:\n        layer = norm_type(3)\n        assert is_norm(layer)\n        assert not is_norm(layer, exclude=(norm_type, ))\n    for norm_type in norm_set2:\n        layer = norm_type(3, 6)\n        assert is_norm(layer)\n        assert not is_norm(layer, exclude=(norm_type, ))\n\n    class MyNorm(nn.BatchNorm2d):\n        pass\n\n    layer = MyNorm(3)\n    assert is_norm(layer)\n    assert not is_norm(layer, exclude=_BatchNorm)\n    assert not is_norm(layer, exclude=(_BatchNorm, ))\n\n    layer = nn.Conv2d(3, 8, 1)\n    assert not is_norm(layer)\n\n    with pytest.raises(TypeError):\n        layer = nn.BatchNorm1d(3)\n        is_norm(layer, exclude='BN')\n\n    with pytest.raises(TypeError):\n        layer = nn.BatchNorm1d(3)\n        is_norm(layer, exclude=('BN', ))\n\n\ndef test_infer_plugin_abbr():\n    with pytest.raises(TypeError):\n        # class_type must be a class\n        infer_plugin_abbr(0)\n\n    class MyPlugin:\n\n        _abbr_ = 'mp'\n\n    assert infer_plugin_abbr(MyPlugin) == 'mp'\n\n    class FancyPlugin:\n        pass\n\n    assert infer_plugin_abbr(FancyPlugin) == 'fancy_plugin'\n\n\ndef test_build_plugin_layer():\n    with pytest.raises(TypeError):\n        # cfg must be a dict\n        cfg = 'Plugin'\n        build_plugin_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # `type` must be in cfg\n        cfg = dict()\n        build_plugin_layer(cfg)\n\n    with pytest.raises(KeyError):\n        # unsupported plugin type\n        cfg = dict(type='FancyPlugin')\n        build_plugin_layer(cfg)\n\n    with pytest.raises(AssertionError):\n        # postfix must be int or str\n        cfg = dict(type='ConvModule')\n        build_plugin_layer(cfg, postfix=[1, 2])\n\n   
 # test ContextBlock\n    for type_name in ('ContextBlock', ContextBlock):\n        for postfix in ['', '_test', 1]:\n            cfg = dict(type=type_name)\n            name, layer = build_plugin_layer(\n                cfg, postfix=postfix, in_channels=16, ratio=1. / 4)\n            assert name == 'context_block' + str(postfix)\n            assert isinstance(layer, MODELS.module_dict['ContextBlock'])\n\n    # test GeneralizedAttention\n    for type_name in ('GeneralizedAttention', GeneralizedAttention):\n        for postfix in ['', '_test', 1]:\n            cfg = dict(type=type_name)\n            name, layer = build_plugin_layer(\n                cfg, postfix=postfix, in_channels=16)\n            assert name == 'gen_attention_block' + str(postfix)\n            assert isinstance(layer,\n                              MODELS.module_dict['GeneralizedAttention'])\n\n    # test NonLocal2d\n    for type_name in ('NonLocal2d', NonLocal2d):\n        for postfix in ['', '_test', 1]:\n            cfg = dict(type='NonLocal2d')\n            name, layer = build_plugin_layer(\n                cfg, postfix=postfix, in_channels=16)\n            assert name == 'nonlocal_block' + str(postfix)\n            assert isinstance(layer, MODELS.module_dict['NonLocal2d'])\n\n    # test ConvModule\n    for postfix in ['', '_test', 1]:\n        for type_name in ('ConvModule', ConvModule):\n            cfg = dict(type=type_name)\n            name, layer = build_plugin_layer(\n                cfg,\n                postfix=postfix,\n                in_channels=16,\n                out_channels=4,\n                kernel_size=3)\n            assert name == 'conv_block' + str(postfix)\n            assert isinstance(layer, MODELS.module_dict['ConvModule'])\n"
  },
  {
    "path": "tests/test_cnn/test_context_block.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.cnn.bricks import ContextBlock\n\n\ndef test_context_block():\n    with pytest.raises(AssertionError):\n        # pooling_type should be in ['att', 'avg']\n        ContextBlock(16, 1. / 4, pooling_type='unsupport_type')\n\n    with pytest.raises(AssertionError):\n        # fusion_types should be of type list or tuple\n        ContextBlock(16, 1. / 4, fusion_types='unsupport_type')\n\n    with pytest.raises(AssertionError):\n        # fusion_types should be in ['channel_add', 'channel_mul']\n        ContextBlock(16, 1. / 4, fusion_types=('unsupport_type', ))\n\n    # test pooling_type='att'\n    imgs = torch.randn(2, 16, 20, 20)\n    context_block = ContextBlock(16, 1. / 4, pooling_type='att')\n    out = context_block(imgs)\n    assert context_block.conv_mask.in_channels == 16\n    assert context_block.conv_mask.out_channels == 1\n    assert out.shape == imgs.shape\n\n    # test pooling_type='avg'\n    imgs = torch.randn(2, 16, 20, 20)\n    context_block = ContextBlock(16, 1. / 4, pooling_type='avg')\n    out = context_block(imgs)\n    assert hasattr(context_block, 'avg_pool')\n    assert out.shape == imgs.shape\n\n    # test fusion_types=('channel_add',)\n    imgs = torch.randn(2, 16, 20, 20)\n    context_block = ContextBlock(16, 1. / 4, fusion_types=('channel_add', ))\n    out = context_block(imgs)\n    assert context_block.channel_add_conv is not None\n    assert context_block.channel_mul_conv is None\n    assert out.shape == imgs.shape\n\n    # test fusion_types=('channel_mul',)\n    imgs = torch.randn(2, 16, 20, 20)\n    context_block = ContextBlock(16, 1. 
/ 4, fusion_types=('channel_mul', ))\n    out = context_block(imgs)\n    assert context_block.channel_add_conv is None\n    assert context_block.channel_mul_conv is not None\n    assert out.shape == imgs.shape\n\n    # test fusion_types=('channel_add', 'channel_mul')\n    imgs = torch.randn(2, 16, 20, 20)\n    context_block = ContextBlock(\n        16, 1. / 4, fusion_types=('channel_add', 'channel_mul'))\n    out = context_block(imgs)\n    assert context_block.channel_add_conv is not None\n    assert context_block.channel_mul_conv is not None\n    assert out.shape == imgs.shape\n"
  },
  {
    "path": "tests/test_cnn/test_conv2d_adaptive_padding.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom mmcv.cnn.bricks import Conv2dAdaptivePadding\n\n\ndef test_conv2d_samepadding():\n    # test Conv2dAdaptivePadding with stride=1\n    inputs = torch.rand((1, 3, 28, 28))\n    conv = Conv2dAdaptivePadding(3, 3, kernel_size=3, stride=1)\n    output = conv(inputs)\n    assert output.shape == inputs.shape\n\n    inputs = torch.rand((1, 3, 13, 13))\n    conv = Conv2dAdaptivePadding(3, 3, kernel_size=3, stride=1)\n    output = conv(inputs)\n    assert output.shape == inputs.shape\n\n    # test Conv2dAdaptivePadding with stride=2\n    inputs = torch.rand((1, 3, 28, 28))\n    conv = Conv2dAdaptivePadding(3, 3, kernel_size=3, stride=2)\n    output = conv(inputs)\n    assert output.shape == torch.Size([1, 3, 14, 14])\n\n    inputs = torch.rand((1, 3, 13, 13))\n    conv = Conv2dAdaptivePadding(3, 3, kernel_size=3, stride=2)\n    output = conv(inputs)\n    assert output.shape == torch.Size([1, 3, 7, 7])\n"
  },
  {
    "path": "tests/test_cnn/test_conv_module.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\nfrom unittest.mock import patch\n\nimport pytest\nimport torch\nimport torch.nn as nn\nfrom mmengine.registry import MODELS\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nfrom mmcv.cnn.bricks import ConvModule, HSigmoid, HSwish\n\n\n@MODELS.register_module()\nclass ExampleConv(nn.Module):\n\n    def __init__(self,\n                 in_channels,\n                 out_channels,\n                 kernel_size,\n                 stride=1,\n                 padding=0,\n                 dilation=1,\n                 groups=1,\n                 bias=True,\n                 norm_cfg=None):\n        super().__init__()\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.kernel_size = kernel_size\n        self.stride = stride\n        self.padding = padding\n        self.dilation = dilation\n        self.groups = groups\n        self.bias = bias\n        self.norm_cfg = norm_cfg\n        self.output_padding = (0, 0, 0)\n        self.transposed = False\n\n        self.conv0 = nn.Conv2d(in_channels, out_channels, kernel_size)\n        self.init_weights()\n\n    def forward(self, x):\n        x = self.conv0(x)\n        return x\n\n    def init_weights(self):\n        nn.init.constant_(self.conv0.weight, 0)\n\n\ndef test_conv_module():\n    with pytest.raises(AssertionError):\n        # conv_cfg must be a dict or None\n        conv_cfg = 'conv'\n        ConvModule(3, 8, 2, conv_cfg=conv_cfg)\n\n    with pytest.raises(AssertionError):\n        # norm_cfg must be a dict or None\n        norm_cfg = 'norm'\n        ConvModule(3, 8, 2, norm_cfg=norm_cfg)\n\n    with pytest.raises(KeyError):\n        # softmax is not supported\n        act_cfg = dict(type='softmax')\n        ConvModule(3, 8, 2, act_cfg=act_cfg)\n\n    # conv + norm + act\n    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n    assert 
conv.with_activation\n    assert hasattr(conv, 'activate')\n    assert conv.with_norm\n    assert hasattr(conv, 'norm')\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # conv + norm with efficient mode\n    efficient_conv = ConvModule(\n        3, 8, 2, norm_cfg=dict(type='BN'), efficient_conv_bn_eval=True).eval()\n    plain_conv = ConvModule(\n        3, 8, 2, norm_cfg=dict(type='BN'),\n        efficient_conv_bn_eval=False).eval()\n    for efficient_param, plain_param in zip(\n            efficient_conv.state_dict().values(),\n            plain_conv.state_dict().values()):\n        plain_param.copy_(efficient_param)\n\n    efficient_mode_output = efficient_conv(x)\n    plain_mode_output = plain_conv(x)\n    assert torch.allclose(efficient_mode_output, plain_mode_output, atol=1e-5)\n\n    # `conv` attribute can be dynamically modified in efficient mode\n    efficient_conv = ConvModule(\n        3, 8, 2, norm_cfg=dict(type='BN'), efficient_conv_bn_eval=True).eval()\n    new_conv = nn.Conv2d(3, 8, 2).eval()\n    efficient_conv.conv = new_conv\n    efficient_mode_output = efficient_conv(x)\n    plain_mode_output = efficient_conv.activate(\n        efficient_conv.norm(new_conv(x)))\n    assert torch.allclose(efficient_mode_output, plain_mode_output, atol=1e-5)\n\n    # conv + act\n    conv = ConvModule(3, 8, 2)\n    assert conv.with_activation\n    assert hasattr(conv, 'activate')\n    assert not conv.with_norm\n    assert conv.norm is None\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # conv\n    conv = ConvModule(3, 8, 2, act_cfg=None)\n    assert not conv.with_norm\n    assert conv.norm is None\n    assert not conv.with_activation\n    assert not hasattr(conv, 'activate')\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # conv with its own `init_weights` method\n    conv_module = 
ConvModule(\n        3, 8, 2, conv_cfg=dict(type='ExampleConv'), act_cfg=None)\n    assert torch.equal(conv_module.conv.conv0.weight, torch.zeros(8, 3, 2, 2))\n\n    # with_spectral_norm=True\n    conv = ConvModule(3, 8, 3, padding=1, with_spectral_norm=True)\n    assert hasattr(conv.conv, 'weight_orig')\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # padding_mode='reflect'\n    conv = ConvModule(3, 8, 3, padding=1, padding_mode='reflect')\n    assert isinstance(conv.padding_layer, nn.ReflectionPad2d)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # non-existing padding mode\n    with pytest.raises(KeyError):\n        conv = ConvModule(3, 8, 3, padding=1, padding_mode='non_exists')\n\n    # leaky relu\n    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))\n    assert isinstance(conv.activate, nn.LeakyReLU)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # tanh\n    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='Tanh'))\n    assert isinstance(conv.activate, nn.Tanh)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # Sigmoid\n    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='Sigmoid'))\n    assert isinstance(conv.activate, nn.Sigmoid)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # PReLU\n    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='PReLU'))\n    assert isinstance(conv.activate, nn.PReLU)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # HSwish\n    conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='HSwish'))\n    if (TORCH_VERSION == 'parrots'\n            or digit_version(TORCH_VERSION) < digit_version('1.7')):\n        assert isinstance(conv.activate, HSwish)\n    else:\n        assert isinstance(conv.activate, nn.Hardswish)\n\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # HSigmoid\n    conv = ConvModule(3, 8, 3, 
padding=1, act_cfg=dict(type='HSigmoid'))\n    assert isinstance(conv.activate, HSigmoid)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n\ndef test_bias():\n    # bias: auto, without norm\n    conv = ConvModule(3, 8, 2)\n    assert conv.conv.bias is not None\n\n    # bias: auto, with norm\n    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n    assert conv.conv.bias is None\n\n    # bias: False, without norm\n    conv = ConvModule(3, 8, 2, bias=False)\n    assert conv.conv.bias is None\n\n    # bias: True, with batch norm\n    with pytest.warns(UserWarning) as record:\n        ConvModule(3, 8, 2, bias=True, norm_cfg=dict(type='BN'))\n    assert len(record) == 1\n    assert record[0].message.args[\n        0] == 'Unnecessary conv bias before batch/instance norm'\n\n    # bias: True, with instance norm\n    with pytest.warns(UserWarning) as record:\n        ConvModule(3, 8, 2, bias=True, norm_cfg=dict(type='IN'))\n    assert len(record) == 1\n    assert record[0].message.args[\n        0] == 'Unnecessary conv bias before batch/instance norm'\n\n    # bias: True, with other norm\n    with pytest.warns(UserWarning) as record:\n        norm_cfg = dict(type='GN', num_groups=1)\n        ConvModule(3, 8, 2, bias=True, norm_cfg=norm_cfg)\n        warnings.warn('No warnings')\n    assert len(record) == 1\n    assert record[0].message.args[0] == 'No warnings'\n\n\ndef conv_forward(self, x):\n    return x + '_conv'\n\n\ndef bn_forward(self, x):\n    return x + '_bn'\n\n\ndef relu_forward(self, x):\n    return x + '_relu'\n\n\n@patch('torch.nn.ReLU.forward', relu_forward)\n@patch('torch.nn.BatchNorm2d.forward', bn_forward)\n@patch('torch.nn.Conv2d.forward', conv_forward)\ndef test_order():\n\n    with pytest.raises(AssertionError):\n        # order must be a tuple\n        order = ['conv', 'norm', 'act']\n        ConvModule(3, 8, 2, order=order)\n\n    with pytest.raises(AssertionError):\n        # length of order must be 3\n        order = ('conv', 
'norm')\n        ConvModule(3, 8, 2, order=order)\n\n    with pytest.raises(AssertionError):\n        # order must be an order of 'conv', 'norm', 'act'\n        order = ('conv', 'norm', 'norm')\n        ConvModule(3, 8, 2, order=order)\n\n    with pytest.raises(AssertionError):\n        # order must be an order of 'conv', 'norm', 'act'\n        order = ('conv', 'norm', 'something')\n        ConvModule(3, 8, 2, order=order)\n\n    # ('conv', 'norm', 'act')\n    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n    out = conv('input')\n    assert out == 'input_conv_bn_relu'\n\n    # ('norm', 'conv', 'act')\n    conv = ConvModule(\n        3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))\n    out = conv('input')\n    assert out == 'input_bn_conv_relu'\n\n    # ('conv', 'norm', 'act'), activate=False\n    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n    out = conv('input', activate=False)\n    assert out == 'input_conv_bn'\n\n    # ('conv', 'norm', 'act'), activate=False\n    conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n    out = conv('input', norm=False)\n    assert out == 'input_conv_relu'\n"
  },
  {
    "path": "tests/test_cnn/test_depthwise_seperable_conv_module.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.cnn.bricks import DepthwiseSeparableConvModule\n\n\ndef test_depthwise_separable_conv():\n    with pytest.raises(AssertionError):\n        # conv_cfg must be a dict or None\n        DepthwiseSeparableConvModule(4, 8, 2, groups=2)\n\n    # test default config\n    conv = DepthwiseSeparableConvModule(3, 8, 2)\n    assert conv.depthwise_conv.conv.groups == 3\n    assert conv.pointwise_conv.conv.kernel_size == (1, 1)\n    assert not conv.depthwise_conv.with_norm\n    assert not conv.pointwise_conv.with_norm\n    assert conv.depthwise_conv.activate.__class__.__name__ == 'ReLU'\n    assert conv.pointwise_conv.activate.__class__.__name__ == 'ReLU'\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # test dw_norm_cfg\n    conv = DepthwiseSeparableConvModule(3, 8, 2, dw_norm_cfg=dict(type='BN'))\n    assert conv.depthwise_conv.norm_name == 'bn'\n    assert not conv.pointwise_conv.with_norm\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # test pw_norm_cfg\n    conv = DepthwiseSeparableConvModule(3, 8, 2, pw_norm_cfg=dict(type='BN'))\n    assert not conv.depthwise_conv.with_norm\n    assert conv.pointwise_conv.norm_name == 'bn'\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # test norm_cfg\n    conv = DepthwiseSeparableConvModule(3, 8, 2, norm_cfg=dict(type='BN'))\n    assert conv.depthwise_conv.norm_name == 'bn'\n    assert conv.pointwise_conv.norm_name == 'bn'\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert output.shape == (1, 8, 255, 255)\n\n    # add test for ['norm', 'conv', 'act']\n    conv = DepthwiseSeparableConvModule(3, 8, 2, order=('norm', 'conv', 'act'))\n    x = torch.rand(1, 3, 256, 256)\n    output = conv(x)\n    assert 
output.shape == (1, 8, 255, 255)\n\n    conv = DepthwiseSeparableConvModule(\n        3, 8, 3, padding=1, with_spectral_norm=True)\n    assert hasattr(conv.depthwise_conv.conv, 'weight_orig')\n    assert hasattr(conv.pointwise_conv.conv, 'weight_orig')\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    conv = DepthwiseSeparableConvModule(\n        3, 8, 3, padding=1, padding_mode='reflect')\n    assert isinstance(conv.depthwise_conv.padding_layer, nn.ReflectionPad2d)\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # test dw_act_cfg\n    conv = DepthwiseSeparableConvModule(\n        3, 8, 3, padding=1, dw_act_cfg=dict(type='LeakyReLU'))\n    assert conv.depthwise_conv.activate.__class__.__name__ == 'LeakyReLU'\n    assert conv.pointwise_conv.activate.__class__.__name__ == 'ReLU'\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # test pw_act_cfg\n    conv = DepthwiseSeparableConvModule(\n        3, 8, 3, padding=1, pw_act_cfg=dict(type='LeakyReLU'))\n    assert conv.depthwise_conv.activate.__class__.__name__ == 'ReLU'\n    assert conv.pointwise_conv.activate.__class__.__name__ == 'LeakyReLU'\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n\n    # test act_cfg\n    conv = DepthwiseSeparableConvModule(\n        3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))\n    assert conv.depthwise_conv.activate.__class__.__name__ == 'LeakyReLU'\n    assert conv.pointwise_conv.activate.__class__.__name__ == 'LeakyReLU'\n    output = conv(x)\n    assert output.shape == (1, 8, 256, 256)\n"
  },
  {
    "path": "tests/test_cnn/test_flops_counter.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.cnn import get_model_complexity_info\nfrom mmcv.cnn.utils.flops_counter import flops_to_string, params_to_string\n\ntry:\n    from StringIO import StringIO\nexcept ImportError:\n    from io import StringIO\n\n# yapf: disable\ngt_results = [\n    {'model': nn.Conv1d(3, 8, 3), 'input': (3, 16), 'flops': 1120.0, 'params': 80.0},  # noqa: E501\n    {'model': nn.Conv2d(3, 8, 3), 'input': (3, 16, 16), 'flops': 43904.0, 'params': 224.0},  # noqa: E501\n    {'model': nn.Conv3d(3, 8, 3), 'input': (3, 3, 16, 16), 'flops': 128576.0, 'params': 656.0},  # noqa: E501\n    {'model': nn.ReLU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.PReLU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 1},  # noqa: E501\n    {'model': nn.ELU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.LeakyReLU(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.ReLU6(), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.MaxPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501\n    {'model': nn.MaxPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.MaxPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501\n    {'model': nn.AvgPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501\n    {'model': nn.AvgPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.AvgPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501\n    {'model': nn.AdaptiveMaxPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501\n    {'model': nn.AdaptiveMaxPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.AdaptiveMaxPool3d(2), 'input': 
(3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501\n    {'model': nn.AdaptiveAvgPool1d(2), 'input': (3, 16), 'flops': 48.0, 'params': 0},  # noqa: E501\n    {'model': nn.AdaptiveAvgPool2d(2), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.AdaptiveAvgPool3d(2), 'input': (3, 3, 16, 16), 'flops': 2304.0, 'params': 0},  # noqa: E501\n    {'model': nn.BatchNorm1d(3), 'input': (3, 16), 'flops': 96.0, 'params': 6.0},  # noqa: E501\n    {'model': nn.BatchNorm2d(3), 'input': (3, 16, 16), 'flops': 1536.0, 'params': 6.0},  # noqa: E501\n    {'model': nn.BatchNorm3d(3), 'input': (3, 3, 16, 16), 'flops': 4608.0, 'params': 6.0},  # noqa: E501\n    {'model': nn.GroupNorm(2, 6), 'input': (6, 16, 16), 'flops': 3072.0, 'params': 12.0},  # noqa: E501\n    {'model': nn.InstanceNorm1d(3, affine=True), 'input': (3, 16), 'flops': 96.0, 'params': 6.0},  # noqa: E501\n    {'model': nn.InstanceNorm2d(3, affine=True), 'input': (3, 16, 16), 'flops': 1536.0, 'params': 6.0},  # noqa: E501\n    {'model': nn.InstanceNorm3d(3, affine=True), 'input': (3, 3, 16, 16), 'flops': 4608.0, 'params': 6.0},  # noqa: E501\n    {'model': nn.LayerNorm((3, 16, 16)), 'input': (3, 16, 16), 'flops': 1536.0, 'params': 1536.0},  # noqa: E501\n    {'model': nn.LayerNorm((3, 16, 16), elementwise_affine=False), 'input': (3, 16, 16), 'flops': 768.0, 'params': 0},  # noqa: E501\n    {'model': nn.Linear(1024, 2), 'input': (1024, ), 'flops': 2048.0, 'params': 2050.0},  # noqa: E501\n    {'model': nn.ConvTranspose2d(3, 8, 3), 'input': (3, 16, 16), 'flops': 57888, 'params': 224.0},  # noqa: E501\n    {'model': nn.Upsample((32, 32)), 'input': (3, 16, 16), 'flops': 3072.0, 'params': 0}  # noqa: E501\n]\n# yapf: enable\n\n\nclass ExampleModel(nn.Module):\n\n    def __init__(self):\n        super().__init__()\n        self.conv2d = nn.Conv2d(3, 8, 3)\n\n    def forward(self, imgs):\n        x = torch.randn((1, *imgs))\n        return self.conv2d(x)\n\n\ndef input_constructor(x):\n   
 return dict(imgs=x)\n\n\ndef test_flops_counter():\n    with pytest.raises(AssertionError):\n        # input_res should be a tuple\n        model = nn.Conv2d(3, 8, 3)\n        input_res = [1, 3, 16, 16]\n        get_model_complexity_info(model, input_res)\n\n    with pytest.raises(AssertionError):\n        # len(input_res) >= 2\n        model = nn.Conv2d(3, 8, 3)\n        input_res = tuple()\n        get_model_complexity_info(model, input_res)\n\n    # test common layers\n    for item in gt_results:\n        model = item['model']\n        input = item['input']\n        flops, params = get_model_complexity_info(\n            model, input, as_strings=False, print_per_layer_stat=False)\n        assert flops == item['flops'] and params == item['params']\n\n    # test input constructor\n    model = ExampleModel()\n    x = (3, 16, 16)\n    flops, params = get_model_complexity_info(\n        model,\n        x,\n        as_strings=False,\n        print_per_layer_stat=False,\n        input_constructor=input_constructor)\n    assert flops == 43904.0 and params == 224.0\n\n    # test output string\n    model = nn.Conv3d(3, 8, 3)\n    x = (3, 3, 512, 512)\n    flops, params = get_model_complexity_info(\n        model, x, print_per_layer_stat=False)\n    assert flops == '0.17 GFLOPs' and params == str(656)\n\n    # test print per layer status\n    model = nn.Conv1d(3, 8, 3)\n    x = (3, 16)\n    out = StringIO()\n    get_model_complexity_info(model, x, ost=out)\n    assert out.getvalue() == \\\n        'Conv1d(0.0 M, 100.000% Params, 0.0 GFLOPs, 100.000% FLOPs, 3, 8, kernel_size=(3,), stride=(1,))\\n'  # noqa: E501\n\n    # test when model is not a common instance\n    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Flatten(), nn.Linear(1568, 2))\n    x = (3, 16, 16)\n    flops, params = get_model_complexity_info(\n        model, x, as_strings=False, print_per_layer_stat=True)\n    assert flops == 47040.0 and params == 3362\n\n\ndef test_flops_to_string():\n    flops = 6.54321 * 
10.**9\n    assert flops_to_string(flops) == '6.54 GFLOPs'\n    assert flops_to_string(flops, 'MFLOPs') == '6543.21 MFLOPs'\n    assert flops_to_string(flops, 'KFLOPs') == '6543210.0 KFLOPs'\n    assert flops_to_string(flops, 'FLOPs') == '6543210000.0 FLOPs'\n    assert flops_to_string(flops, precision=4) == '6.5432 GFLOPs'\n\n    flops = 6.54321 * 10.**9\n    assert flops_to_string(flops, None) == '6.54 GFLOPs'\n    flops = 3.21 * 10.**7\n    assert flops_to_string(flops, None) == '32.1 MFLOPs'\n    flops = 5.4 * 10.**3\n    assert flops_to_string(flops, None) == '5.4 KFLOPs'\n    flops = 987\n    assert flops_to_string(flops, None) == '987 FLOPs'\n\n\ndef test_params_to_string():\n    num_params = 3.21 * 10.**7\n    assert params_to_string(num_params) == '32.1 M'\n    num_params = 4.56 * 10.**5\n    assert params_to_string(num_params) == '456.0 k'\n    num_params = 7.89 * 10.**2\n    assert params_to_string(num_params) == '789.0'\n\n    num_params = 6.54321 * 10.**7\n    assert params_to_string(num_params, 'M') == '65.43 M'\n    assert params_to_string(num_params, 'K') == '65432.1 K'\n    assert params_to_string(num_params, '') == '65432100.0'\n    assert params_to_string(num_params, precision=4) == '65.4321 M'\n"
  },
  {
    "path": "tests/test_cnn/test_fuse_conv_bn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.cnn import ConvModule, fuse_conv_bn\n\n\ndef test_fuse_conv_bn():\n    inputs = torch.rand((1, 3, 5, 5))\n    modules = nn.ModuleList()\n    modules.append(nn.BatchNorm2d(3))\n    modules.append(ConvModule(3, 5, 3, norm_cfg=dict(type='BN')))\n    modules.append(ConvModule(5, 5, 3, norm_cfg=dict(type='BN')))\n    modules = nn.Sequential(*modules)\n    fused_modules = fuse_conv_bn(modules)\n    assert torch.equal(modules(inputs), fused_modules(inputs))\n"
  },
  {
    "path": "tests/test_cnn/test_generalized_attention.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom mmcv.cnn.bricks import GeneralizedAttention\n\n\ndef test_context_block():\n\n    # test attention_type='1000'\n    imgs = torch.randn(2, 16, 20, 20)\n    gen_attention_block = GeneralizedAttention(16, attention_type='1000')\n    assert gen_attention_block.query_conv.in_channels == 16\n    assert gen_attention_block.key_conv.in_channels == 16\n    assert gen_attention_block.key_conv.in_channels == 16\n    out = gen_attention_block(imgs)\n    assert out.shape == imgs.shape\n\n    # test attention_type='0100'\n    imgs = torch.randn(2, 16, 20, 20)\n    gen_attention_block = GeneralizedAttention(16, attention_type='0100')\n    assert gen_attention_block.query_conv.in_channels == 16\n    assert gen_attention_block.appr_geom_fc_x.in_features == 8\n    assert gen_attention_block.appr_geom_fc_y.in_features == 8\n    out = gen_attention_block(imgs)\n    assert out.shape == imgs.shape\n\n    # test attention_type='0010'\n    imgs = torch.randn(2, 16, 20, 20)\n    gen_attention_block = GeneralizedAttention(16, attention_type='0010')\n    assert gen_attention_block.key_conv.in_channels == 16\n    assert hasattr(gen_attention_block, 'appr_bias')\n    out = gen_attention_block(imgs)\n    assert out.shape == imgs.shape\n\n    # test attention_type='0001'\n    imgs = torch.randn(2, 16, 20, 20)\n    gen_attention_block = GeneralizedAttention(16, attention_type='0001')\n    assert gen_attention_block.appr_geom_fc_x.in_features == 8\n    assert gen_attention_block.appr_geom_fc_y.in_features == 8\n    assert hasattr(gen_attention_block, 'geom_bias')\n    out = gen_attention_block(imgs)\n    assert out.shape == imgs.shape\n\n    # test spatial_range >= 0\n    imgs = torch.randn(2, 256, 20, 20)\n    gen_attention_block = GeneralizedAttention(256, spatial_range=10)\n    assert hasattr(gen_attention_block, 'local_constraint_map')\n    out = gen_attention_block(imgs)\n    assert out.shape == 
imgs.shape\n\n    # test q_stride > 1\n    imgs = torch.randn(2, 16, 20, 20)\n    gen_attention_block = GeneralizedAttention(16, q_stride=2)\n    assert gen_attention_block.q_downsample is not None\n    out = gen_attention_block(imgs)\n    assert out.shape == imgs.shape\n\n    # test kv_stride > 1\n    imgs = torch.randn(2, 16, 20, 20)\n    gen_attention_block = GeneralizedAttention(16, kv_stride=2)\n    assert gen_attention_block.kv_downsample is not None\n    out = gen_attention_block(imgs)\n    assert out.shape == imgs.shape\n\n    # test fp16 with attention_type='1111'\n    if torch.cuda.is_available():\n        imgs = torch.randn(2, 16, 20, 20).cuda().to(torch.half)\n        gen_attention_block = GeneralizedAttention(\n            16,\n            spatial_range=-1,\n            num_heads=8,\n            attention_type='1111',\n            kv_stride=2)\n        gen_attention_block.cuda().type(torch.half)\n        out = gen_attention_block(imgs)\n        assert out.shape == imgs.shape\n"
  },
  {
    "path": "tests/test_cnn/test_hsigmoid.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.cnn.bricks import HSigmoid\n\n\ndef test_hsigmoid():\n    # test assertion divisor can not be zero\n    with pytest.raises(AssertionError):\n        HSigmoid(divisor=0)\n\n    # test with default parameters\n    act = HSigmoid()\n    input_shape = torch.Size([1, 3, 64, 64])\n    input = torch.randn(input_shape)\n    output = act(input)\n    expected_output = torch.min(\n        torch.max((input + 3) / 6, torch.zeros(input_shape)),\n        torch.ones(input_shape))\n    # test output shape\n    assert output.shape == expected_output.shape\n    # test output value\n    assert torch.equal(output, expected_output)\n\n    # test with designated parameters\n    act = HSigmoid(1, 2, 0, 1)\n    input_shape = torch.Size([1, 3, 64, 64])\n    input = torch.randn(input_shape)\n    output = act(input)\n    expected_output = torch.min(\n        torch.max((input + 1) / 2, torch.zeros(input_shape)),\n        torch.ones(input_shape))\n    # test output shape\n    assert output.shape == expected_output.shape\n    # test output value\n    assert torch.equal(output, expected_output)\n"
  },
  {
    "path": "tests/test_cnn/test_hswish.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nfrom torch.nn.functional import relu6\n\nfrom mmcv.cnn.bricks import HSwish\n\n\ndef test_hswish():\n    # test inplace\n    act = HSwish(inplace=True)\n    assert act.act.inplace\n    act = HSwish()\n    assert not act.act.inplace\n\n    input = torch.randn(1, 3, 64, 64)\n    expected_output = input * relu6(input + 3) / 6\n    output = act(input)\n    # test output shape\n    assert output.shape == expected_output.shape\n    # test output value\n    assert torch.equal(output, expected_output)\n"
  },
  {
    "path": "tests/test_cnn/test_non_local.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.cnn import NonLocal1d, NonLocal2d, NonLocal3d\nfrom mmcv.cnn.bricks.non_local import _NonLocalNd\n\n\ndef test_nonlocal():\n    with pytest.raises(ValueError):\n        # mode should be in ['embedded_gaussian', 'dot_product']\n        _NonLocalNd(3, mode='unsupport_mode')\n\n    # _NonLocalNd with zero initialization\n    _NonLocalNd(3)\n    _NonLocalNd(3, norm_cfg=dict(type='BN'))\n\n    # _NonLocalNd without zero initialization\n    _NonLocalNd(3, zeros_init=False)\n    _NonLocalNd(3, norm_cfg=dict(type='BN'), zeros_init=False)\n\n\ndef test_nonlocal3d():\n    # NonLocal3d with 'embedded_gaussian' mode\n    imgs = torch.randn(2, 3, 10, 20, 20)\n    nonlocal_3d = NonLocal3d(3)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            # NonLocal is only implemented on gpu in parrots\n            imgs = imgs.cuda()\n            nonlocal_3d.cuda()\n    out = nonlocal_3d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal3d with 'dot_product' mode\n    nonlocal_3d = NonLocal3d(3, mode='dot_product')\n    assert nonlocal_3d.mode == 'dot_product'\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_3d.cuda()\n    out = nonlocal_3d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal3d with 'concatenation' mode\n    nonlocal_3d = NonLocal3d(3, mode='concatenation')\n    assert nonlocal_3d.mode == 'concatenation'\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_3d.cuda()\n    out = nonlocal_3d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal3d with 'gaussian' mode\n    nonlocal_3d = NonLocal3d(3, mode='gaussian')\n    assert not hasattr(nonlocal_3d, 'phi')\n    assert nonlocal_3d.mode == 'gaussian'\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            
nonlocal_3d.cuda()\n    out = nonlocal_3d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal3d with 'gaussian' mode and sub_sample\n    nonlocal_3d = NonLocal3d(3, mode='gaussian', sub_sample=True)\n    assert isinstance(nonlocal_3d.g, nn.Sequential) and len(nonlocal_3d.g) == 2\n    assert isinstance(nonlocal_3d.g[1], nn.MaxPool3d)\n    assert nonlocal_3d.g[1].kernel_size == (1, 2, 2)\n    assert isinstance(nonlocal_3d.phi, nn.MaxPool3d)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_3d.cuda()\n    out = nonlocal_3d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal3d with 'dot_product' mode and sub_sample\n    nonlocal_3d = NonLocal3d(3, mode='dot_product', sub_sample=True)\n    for m in [nonlocal_3d.g, nonlocal_3d.phi]:\n        assert isinstance(m, nn.Sequential) and len(m) == 2\n        assert isinstance(m[1], nn.MaxPool3d)\n        assert m[1].kernel_size == (1, 2, 2)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_3d.cuda()\n    out = nonlocal_3d(imgs)\n    assert out.shape == imgs.shape\n\n\ndef test_nonlocal2d():\n    # NonLocal2d with 'embedded_gaussian' mode\n    imgs = torch.randn(2, 3, 20, 20)\n    nonlocal_2d = NonLocal2d(3)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_2d.cuda()\n    out = nonlocal_2d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal2d with 'dot_product' mode\n    imgs = torch.randn(2, 3, 20, 20)\n    nonlocal_2d = NonLocal2d(3, mode='dot_product')\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_2d.cuda()\n    out = nonlocal_2d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal2d with 'concatenation' mode\n    imgs = torch.randn(2, 3, 20, 20)\n    nonlocal_2d = NonLocal2d(3, mode='concatenation')\n    if torch.__version__ == 
'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_2d.cuda()\n    out = nonlocal_2d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal2d with 'gaussian' mode\n    imgs = torch.randn(2, 3, 20, 20)\n    nonlocal_2d = NonLocal2d(3, mode='gaussian')\n    assert not hasattr(nonlocal_2d, 'phi')\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_2d.cuda()\n    out = nonlocal_2d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal2d with 'gaussian' mode and sub_sample\n    nonlocal_2d = NonLocal2d(3, mode='gaussian', sub_sample=True)\n    assert isinstance(nonlocal_2d.g, nn.Sequential) and len(nonlocal_2d.g) == 2\n    assert isinstance(nonlocal_2d.g[1], nn.MaxPool2d)\n    assert nonlocal_2d.g[1].kernel_size == (2, 2)\n    assert isinstance(nonlocal_2d.phi, nn.MaxPool2d)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_2d.cuda()\n    out = nonlocal_2d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal2d with 'dot_product' mode and sub_sample\n    nonlocal_2d = NonLocal2d(3, mode='dot_product', sub_sample=True)\n    for m in [nonlocal_2d.g, nonlocal_2d.phi]:\n        assert isinstance(m, nn.Sequential) and len(m) == 2\n        assert isinstance(m[1], nn.MaxPool2d)\n        assert m[1].kernel_size == (2, 2)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_2d.cuda()\n    out = nonlocal_2d(imgs)\n    assert out.shape == imgs.shape\n\n\ndef test_nonlocal1d():\n    # NonLocal1d with 'embedded_gaussian' mode\n    imgs = torch.randn(2, 3, 20)\n    nonlocal_1d = NonLocal1d(3)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_1d.cuda()\n    out = nonlocal_1d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal1d with 'dot_product' 
mode\n    imgs = torch.randn(2, 3, 20)\n    nonlocal_1d = NonLocal1d(3, mode='dot_product')\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_1d.cuda()\n    out = nonlocal_1d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal1d with 'concatenation' mode\n    imgs = torch.randn(2, 3, 20)\n    nonlocal_1d = NonLocal1d(3, mode='concatenation')\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_1d.cuda()\n    out = nonlocal_1d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal1d with 'gaussian' mode\n    imgs = torch.randn(2, 3, 20)\n    nonlocal_1d = NonLocal1d(3, mode='gaussian')\n    assert not hasattr(nonlocal_1d, 'phi')\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            imgs = imgs.cuda()\n            nonlocal_1d.cuda()\n    out = nonlocal_1d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal1d with 'gaussian' mode and sub_sample\n    nonlocal_1d = NonLocal1d(3, mode='gaussian', sub_sample=True)\n    assert isinstance(nonlocal_1d.g, nn.Sequential) and len(nonlocal_1d.g) == 2\n    assert isinstance(nonlocal_1d.g[1], nn.MaxPool1d)\n    assert nonlocal_1d.g[1].kernel_size == 2\n    assert isinstance(nonlocal_1d.phi, nn.MaxPool1d)\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_1d.cuda()\n    out = nonlocal_1d(imgs)\n    assert out.shape == imgs.shape\n\n    # NonLocal1d with 'dot_product' mode and sub_sample\n    nonlocal_1d = NonLocal1d(3, mode='dot_product', sub_sample=True)\n    for m in [nonlocal_1d.g, nonlocal_1d.phi]:\n        assert isinstance(m, nn.Sequential) and len(m) == 2\n        assert isinstance(m[1], nn.MaxPool1d)\n        assert m[1].kernel_size == 2\n    if torch.__version__ == 'parrots':\n        if torch.cuda.is_available():\n            nonlocal_1d.cuda()\n    out = 
nonlocal_1d(imgs)\n    assert out.shape == imgs.shape\n"
  },
  {
    "path": "tests/test_cnn/test_rfsearch/test_operator.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom copy import deepcopy\n\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.cnn.rfsearch.operator import Conv2dRFSearchOp\n\nglobal_config = dict(\n    step=0,\n    max_step=12,\n    search_interval=1,\n    exp_rate=0.5,\n    init_alphas=0.01,\n    mmin=1,\n    mmax=24,\n    num_branches=2,\n    skip_layer=['stem', 'layer1'])\n\n\n# test with 3x3 conv\ndef test_rfsearch_operator_3x3():\n    conv = nn.Conv2d(\n        in_channels=3, out_channels=3, kernel_size=3, stride=1, padding=1)\n    operator = Conv2dRFSearchOp(conv, global_config)\n    x = torch.randn(1, 3, 32, 32)\n\n    # set no_grad to perform in-place operator\n    with torch.no_grad():\n        # After expand: (1, 1) (2, 2)\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (2, 2)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After estimate: (2, 2) with branch_weights of [0.5 0.5]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (2, 2)\n        assert operator.op_layer.dilation == (2, 2)\n        assert operator.op_layer.padding == (2, 2)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After expand: (1, 1) (3, 3)\n        operator.expand_rates()\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (3, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        operator.branch_weights[0] = 0.1\n        
operator.branch_weights[1] = 0.4\n        # After estimate: (3, 3) with branch_weights of [0.2 0.8]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (3, 3)\n        assert operator.op_layer.dilation == (3, 3)\n        assert operator.op_layer.padding == (3, 3)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n\n# test with 5x5 conv\ndef test_rfsearch_operator_5x5():\n    conv = nn.Conv2d(\n        in_channels=3, out_channels=3, kernel_size=5, stride=1, padding=2)\n    operator = Conv2dRFSearchOp(conv, global_config)\n    x = torch.randn(1, 3, 32, 32)\n\n    with torch.no_grad():\n        # After expand: (1, 1) (2, 2)\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (2, 2)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After estimate: (2, 2) with branch_weights of [0.5 0.5]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (2, 2)\n        assert operator.op_layer.dilation == (2, 2)\n        assert operator.op_layer.padding == (4, 4)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After expand: (1, 1) (3, 3)\n        operator.expand_rates()\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (3, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        operator.branch_weights[0] = 0.1\n        operator.branch_weights[1] = 0.4\n        
# After estimate: (3, 3) with branch_weights of [0.2 0.8]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (3, 3)\n        assert operator.op_layer.dilation == (3, 3)\n        assert operator.op_layer.padding == (6, 6)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n\n# test with 5x5 conv num_branches=3\ndef test_rfsearch_operator_5x5_branch3():\n    conv = nn.Conv2d(\n        in_channels=3, out_channels=3, kernel_size=5, stride=1, padding=2)\n    config = deepcopy(global_config)\n    config['num_branches'] = 3\n    operator = Conv2dRFSearchOp(conv, config)\n    x = torch.randn(1, 3, 32, 32)\n\n    with torch.no_grad():\n        # After expand: (1, 1) (2, 2)\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (2, 2)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After estimate: (2, 2) with branch_weights of [0.5 0.5]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (2, 2)\n        assert operator.op_layer.dilation == (2, 2)\n        assert operator.op_layer.padding == (4, 4)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After expand: (1, 1) (2, 2) (3, 3)\n        operator.expand_rates()\n        assert len(operator.dilation_rates) == 3\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (2, 2)\n        assert operator.dilation_rates[2] == (3, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 
32, 32)\n\n        operator.branch_weights[0] = 0.1\n        operator.branch_weights[1] = 0.3\n        operator.branch_weights[2] = 0.6\n        # After estimate: (3, 3) with branch_weights of [0.1 0.3 0.6]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (3, 3)\n        assert operator.op_layer.dilation == (3, 3)\n        assert operator.op_layer.padding == (6, 6)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n\n# test with 1x5 conv\ndef test_rfsearch_operator_1x5():\n    conv = nn.Conv2d(\n        in_channels=3,\n        out_channels=3,\n        kernel_size=(1, 5),\n        stride=1,\n        padding=(0, 2))\n    operator = Conv2dRFSearchOp(conv, global_config)\n    x = torch.randn(1, 3, 32, 32)\n\n    # After expand: (1, 1) (1, 2)\n    assert len(operator.dilation_rates) == 2\n    assert operator.dilation_rates[0] == (1, 1)\n    assert operator.dilation_rates[1] == (1, 2)\n    assert torch.all(\n        operator.branch_weights.data == global_config['init_alphas']).item()\n    # test forward\n    assert operator(x).shape == (1, 3, 32, 32)\n\n    with torch.no_grad():\n        # After estimate: (1, 2) with branch_weights of [0.5 0.5]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (1, 2)\n        assert operator.op_layer.dilation == (1, 2)\n        assert operator.op_layer.padding == (0, 4)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After expand: (1, 1) (1, 3)\n        operator.expand_rates()\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (1, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == 
(1, 3, 32, 32)\n\n        operator.branch_weights[0] = 0.2\n        operator.branch_weights[1] = 0.8\n        # After estimate: (1, 3) with branch_weights of [0.2 0.8]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (1, 3)\n        assert operator.op_layer.dilation == (1, 3)\n        assert operator.op_layer.padding == (0, 6)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n\n# test with 5x5 conv initial_dilation=(2, 2)\ndef test_rfsearch_operator_5x5_d2x2():\n    conv = nn.Conv2d(\n        in_channels=3,\n        out_channels=3,\n        kernel_size=5,\n        stride=1,\n        padding=4,\n        dilation=(2, 2))\n    operator = Conv2dRFSearchOp(conv, global_config)\n    x = torch.randn(1, 3, 32, 32)\n\n    with torch.no_grad():\n        # After expand: (1, 1) (3, 3)\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (3, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After estimate: (2, 2) with branch_weights of [0.5 0.5]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (2, 2)\n        assert operator.op_layer.dilation == (2, 2)\n        assert operator.op_layer.padding == (4, 4)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After expand: (1, 1) (3, 3)\n        operator.expand_rates()\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (3, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # 
test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        operator.branch_weights[0] = 0.8\n        operator.branch_weights[1] = 0.2\n        # After estimate: (1, 1) with branch_weights of [0.8 0.2]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.op_layer.dilation == (1, 1)\n        assert operator.op_layer.padding == (2, 2)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n\n# test with 5x5 conv initial_dilation=(1, 2)\ndef test_rfsearch_operator_5x5_d1x2():\n    conv = nn.Conv2d(\n        in_channels=3,\n        out_channels=3,\n        kernel_size=5,\n        stride=1,\n        padding=(2, 4),\n        dilation=(1, 2))\n    operator = Conv2dRFSearchOp(conv, global_config)\n    x = torch.randn(1, 3, 32, 32)\n\n    with torch.no_grad():\n        # After expand: (1, 1) (2, 3)\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (2, 3)\n        assert torch.all(operator.branch_weights.data ==\n                         global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After estimate: (2, 2) with branch_weights of [0.5 0.5]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (2, 2)\n        assert operator.op_layer.dilation == (2, 2)\n        assert operator.op_layer.padding == (4, 4)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        # After expand: (1, 1) (3, 3)\n        operator.expand_rates()\n        assert len(operator.dilation_rates) == 2\n        assert operator.dilation_rates[0] == (1, 1)\n        assert operator.dilation_rates[1] == (3, 3)\n        assert torch.all(operator.branch_weights.data ==\n                    
     global_config['init_alphas']).item()\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n\n        operator.branch_weights[0] = 0.1\n        operator.branch_weights[1] = 0.8\n        # After estimate: (3, 3) with branch_weights of [0.1 0.8]\n        operator.estimate_rates()\n        assert len(operator.dilation_rates) == 1\n        assert operator.dilation_rates[0] == (3, 3)\n        assert operator.op_layer.dilation == (3, 3)\n        assert operator.op_layer.padding == (6, 6)\n        # test forward\n        assert operator(x).shape == (1, 3, 32, 32)\n"
  },
  {
    "path": "tests/test_cnn/test_rfsearch/test_search.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\nimport torch.nn as nn\n\nfrom mmcv.cnn.rfsearch import Conv2dRFSearchOp, RFSearchHook\n\n\ndef test_rfsearchhook():\n\n    def conv(in_channels, out_channels, kernel_size, stride, padding,\n             dilation):\n        return nn.Conv2d(\n            in_channels=in_channels,\n            out_channels=out_channels,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation)\n\n    class Model(nn.Module):\n\n        def __init__(self):\n            super().__init__()\n            self.stem = conv(1, 2, 3, 1, 1, 1)\n            self.conv0 = conv(2, 2, 3, 1, 1, 1)\n            self.layer0 = nn.Sequential(\n                conv(2, 2, 3, 1, 1, 1), conv(2, 2, 3, 1, 1, 1))\n            self.conv1 = conv(2, 2, 1, 1, 0, 1)\n            self.conv2 = conv(2, 2, 3, 1, 1, 1)\n            self.conv3 = conv(2, 2, (1, 3), 1, (0, 1), 1)\n\n        def forward(self, x):\n            x1 = self.stem(x)\n            x2 = self.layer0(x1)\n            x3 = self.conv0(x2)\n            x4 = self.conv1(x3)\n            x5 = self.conv2(x4)\n            x6 = self.conv3(x5)\n            return x6\n\n        def train_step(self, x, optimizer, **kwargs):\n            return dict(loss=self(x).mean(), num_samples=x.shape[0])\n\n    rfsearch_cfg = dict(\n        mode='search',\n        rfstructure_file=None,\n        config=dict(\n            search=dict(\n                step=0,\n                max_step=12,\n                search_interval=1,\n                exp_rate=0.5,\n                init_alphas=0.01,\n                mmin=1,\n                mmax=24,\n                num_branches=2,\n                skip_layer=['stem', 'conv0', 'layer0.1'])),\n    )\n\n    # hook for search\n    rfsearchhook_search = RFSearchHook(\n        'search', rfsearch_cfg['config'], by_epoch=True, verbose=True)\n    rfsearchhook_search.config['structure'] = {\n        
'module.layer0.0': [1, 1],\n        'module.conv2': [2, 2],\n        'module.conv3': [1, 1]\n    }\n    # hook for fixed_single_branch\n    rfsearchhook_fixed_single_branch = RFSearchHook(\n        'fixed_single_branch',\n        rfsearch_cfg['config'],\n        by_epoch=True,\n        verbose=True)\n    rfsearchhook_fixed_single_branch.config['structure'] = {\n        'module.layer0.0': [1, 1],\n        'module.conv2': [2, 2],\n        'module.conv3': [1, 1]\n    }\n    # hook for fixed_multi_branch\n    rfsearchhook_fixed_multi_branch = RFSearchHook(\n        'fixed_multi_branch',\n        rfsearch_cfg['config'],\n        by_epoch=True,\n        verbose=True)\n    rfsearchhook_fixed_multi_branch.config['structure'] = {\n        'module.layer0.0': [1, 1],\n        'module.conv2': [2, 2],\n        'module.conv3': [1, 1]\n    }\n\n    def test_skip_layer():\n        assert not isinstance(model.stem, Conv2dRFSearchOp)\n        assert not isinstance(model.conv0, Conv2dRFSearchOp)\n        assert isinstance(model.layer0[0], Conv2dRFSearchOp)\n        assert not isinstance(model.layer0[1], Conv2dRFSearchOp)\n\n    # 1. test init_model() with mode of search\n    model = Model()\n    rfsearchhook_search.init_model(model)\n\n    test_skip_layer()\n    assert not isinstance(model.conv1, Conv2dRFSearchOp)\n    assert isinstance(model.conv2, Conv2dRFSearchOp)\n    assert isinstance(model.conv3, Conv2dRFSearchOp)\n    assert model.conv2.dilation_rates == [(1, 1), (3, 3)]\n    assert model.conv3.dilation_rates == [(1, 1), (1, 2)]\n\n    # 2. test init_model() with mode of fixed_single_branch\n    model = Model()\n    rfsearchhook_fixed_single_branch.init_model(model)\n\n    assert not isinstance(model.conv1, Conv2dRFSearchOp)\n    assert not isinstance(model.conv2, Conv2dRFSearchOp)\n    assert not isinstance(model.conv3, Conv2dRFSearchOp)\n    assert model.conv1.dilation == (1, 1)\n    assert model.conv2.dilation == (2, 2)\n    assert model.conv3.dilation == (1, 1)\n\n    # 3. 
test init_model() with mode of fixed_multi_branch\n    model = Model()\n    rfsearchhook_fixed_multi_branch.init_model(model)\n\n    test_skip_layer()\n    assert not isinstance(model.conv1, Conv2dRFSearchOp)\n    assert isinstance(model.conv2, Conv2dRFSearchOp)\n    assert isinstance(model.conv3, Conv2dRFSearchOp)\n    assert model.conv2.dilation_rates == [(1, 1), (3, 3)]\n    assert model.conv3.dilation_rates == [(1, 1), (1, 2)]\n"
  },
  {
    "path": "tests/test_cnn/test_scale.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.cnn.bricks import LayerScale, Scale\n\n\ndef test_scale():\n    # test default scale\n    scale = Scale()\n    assert scale.scale.data == 1.\n    assert scale.scale.dtype == torch.float\n    x = torch.rand(1, 3, 64, 64)\n    output = scale(x)\n    assert output.shape == (1, 3, 64, 64)\n\n    # test given scale\n    scale = Scale(10.)\n    assert scale.scale.data == 10.\n    assert scale.scale.dtype == torch.float\n    x = torch.rand(1, 3, 64, 64)\n    output = scale(x)\n    assert output.shape == (1, 3, 64, 64)\n\n\ndef test_layer_scale():\n    with pytest.raises(AssertionError):\n        cfg = dict(\n            dim=10,\n            data_format='BNC',\n        )\n        LayerScale(**cfg)\n\n    # test init\n    cfg = dict(dim=10)\n    ls = LayerScale(**cfg)\n    assert torch.equal(ls.weight, torch.ones(10, requires_grad=True) * 1e-5)\n\n    # test forward\n    # test channels_last\n    cfg = dict(dim=256, inplace=False, data_format='channels_last')\n    ls_channels_last = LayerScale(**cfg)\n    x = torch.randn((4, 49, 256))\n    out = ls_channels_last(x)\n    assert tuple(out.size()) == (4, 49, 256)\n    assert torch.equal(x * 1e-5, out)\n\n    # test channels_last 2d\n    cfg = dict(dim=256, inplace=False, data_format='channels_last')\n    ls_channels_last = LayerScale(**cfg)\n    x = torch.randn((4, 7, 49, 256))\n    out = ls_channels_last(x)\n    assert tuple(out.size()) == (4, 7, 49, 256)\n    assert torch.equal(x * 1e-5, out)\n\n    # test channels_first\n    cfg = dict(dim=256, inplace=False, data_format='channels_first')\n    ls_channels_first = LayerScale(**cfg)\n    x = torch.randn((4, 256, 7, 7))\n    out = ls_channels_first(x)\n    assert tuple(out.size()) == (4, 256, 7, 7)\n    assert torch.equal(x * 1e-5, out)\n\n    # test channels_first 3D\n    cfg = dict(dim=256, inplace=False, data_format='channels_first')\n    ls_channels_first = 
LayerScale(**cfg)\n    x = torch.randn((4, 256, 7, 7, 7))\n    out = ls_channels_first(x)\n    assert tuple(out.size()) == (4, 256, 7, 7, 7)\n    assert torch.equal(x * 1e-5, out)\n\n    # test inplace True\n    cfg = dict(dim=256, inplace=True, data_format='channels_first')\n    ls_channels_first = LayerScale(**cfg)\n    x = torch.randn((4, 256, 7, 7))\n    out = ls_channels_first(x)\n    assert tuple(out.size()) == (4, 256, 7, 7)\n    assert x is out\n"
  },
  {
    "path": "tests/test_cnn/test_silu.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\nfrom mmcv.cnn.bricks import build_activation_layer\n\n\ndef test_silu():\n    act = build_activation_layer(dict(type='SiLU'))\n    input = torch.randn(1, 3, 64, 64)\n    expected_output = input * torch.sigmoid(input)\n    output = act(input)\n    # test output shape\n    assert output.shape == expected_output.shape\n    # test output value\n    assert torch.allclose(output, expected_output)\n\n    # test inplace\n    act = build_activation_layer(dict(type='SiLU', inplace=True))\n    assert act.inplace\n    input = torch.randn(1, 3, 64, 64)\n    expected_output = input * torch.sigmoid(input)\n    output = act(input)\n    # test output shape\n    assert output.shape == expected_output.shape\n    # test output value\n    assert torch.allclose(output, expected_output)\n    assert torch.allclose(input, expected_output)\n    assert input is output\n"
  },
  {
    "path": "tests/test_cnn/test_swish.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn.functional as F\n\nfrom mmcv.cnn.bricks import Swish\n\n\ndef test_swish():\n    act = Swish()\n    input = torch.randn(1, 3, 64, 64)\n    expected_output = input * F.sigmoid(input)\n    output = act(input)\n    # test output shape\n    assert output.shape == expected_output.shape\n    # test output value\n    assert torch.equal(output, expected_output)\n"
  },
  {
    "path": "tests/test_cnn/test_transformer.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport pytest\nimport torch\nfrom mmengine.model import ModuleList\n\nfrom mmcv.cnn.bricks.drop import DropPath\nfrom mmcv.cnn.bricks.transformer import (FFN, AdaptivePadding,\n                                         BaseTransformerLayer,\n                                         MultiheadAttention, PatchEmbed,\n                                         PatchMerging,\n                                         TransformerLayerSequence)\n\n\ndef test_adaptive_padding():\n\n    for padding in ('same', 'corner'):\n        kernel_size = 16\n        stride = 16\n        dilation = 1\n        input = torch.rand(1, 1, 15, 17)\n        adap_pad = AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=padding)\n        out = adap_pad(input)\n        # padding to divisible by 16\n        assert (out.shape[2], out.shape[3]) == (16, 32)\n        input = torch.rand(1, 1, 16, 17)\n        out = adap_pad(input)\n        # padding to divisible by 16\n        assert (out.shape[2], out.shape[3]) == (16, 32)\n\n        kernel_size = (2, 2)\n        stride = (2, 2)\n        dilation = (1, 1)\n\n        adap_pad = AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=padding)\n        input = torch.rand(1, 1, 11, 13)\n        out = adap_pad(input)\n        # padding to divisible by 2\n        assert (out.shape[2], out.shape[3]) == (12, 14)\n\n        kernel_size = (2, 2)\n        stride = (10, 10)\n        dilation = (1, 1)\n\n        adap_pad = AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=padding)\n        input = torch.rand(1, 1, 10, 13)\n        out = adap_pad(input)\n        #  no padding\n        assert (out.shape[2], out.shape[3]) == (10, 13)\n\n        
kernel_size = (11, 11)\n        adap_pad = AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=padding)\n        input = torch.rand(1, 1, 11, 13)\n        out = adap_pad(input)\n        #  all padding\n        assert (out.shape[2], out.shape[3]) == (21, 21)\n\n        # test padding as kernel is (7,9)\n        input = torch.rand(1, 1, 11, 13)\n        stride = (3, 4)\n        kernel_size = (4, 5)\n        dilation = (2, 2)\n        # actually (7, 9)\n        adap_pad = AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=padding)\n        dilation_out = adap_pad(input)\n        assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)\n        kernel_size = (7, 9)\n        dilation = (1, 1)\n        adap_pad = AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=padding)\n        kernel79_out = adap_pad(input)\n        assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)\n        assert kernel79_out.shape == dilation_out.shape\n\n    # assert only support \"same\" \"corner\"\n    with pytest.raises(AssertionError):\n        AdaptivePadding(\n            kernel_size=kernel_size,\n            stride=stride,\n            dilation=dilation,\n            padding=1)\n\n\ndef test_patch_embed():\n    B = 2\n    H = 3\n    W = 4\n    C = 3\n    embed_dims = 10\n    kernel_size = 3\n    stride = 1\n    dummy_input = torch.rand(B, C, H, W)\n    patch_merge_1 = PatchEmbed(\n        in_channels=C,\n        embed_dims=embed_dims,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=0,\n        dilation=1,\n        norm_cfg=None)\n\n    x1, shape = patch_merge_1(dummy_input)\n    # test out shape\n    assert x1.shape == (2, 2, 10)\n    # test outsize is correct\n    assert shape == (1, 2)\n  
  # test L = out_h * out_w\n    assert shape[0] * shape[1] == x1.shape[1]\n\n    B = 2\n    H = 10\n    W = 10\n    C = 3\n    embed_dims = 10\n    kernel_size = 5\n    stride = 2\n    dummy_input = torch.rand(B, C, H, W)\n    # test dilation\n    patch_merge_2 = PatchEmbed(\n        in_channels=C,\n        embed_dims=embed_dims,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=0,\n        dilation=2,\n        norm_cfg=None,\n    )\n\n    x2, shape = patch_merge_2(dummy_input)\n    # test out shape\n    assert x2.shape == (2, 1, 10)\n    # test outsize is correct\n    assert shape == (1, 1)\n    # test L = out_h * out_w\n    assert shape[0] * shape[1] == x2.shape[1]\n\n    stride = 2\n    input_size = (10, 10)\n\n    dummy_input = torch.rand(B, C, H, W)\n    # test stride and norm\n    patch_merge_3 = PatchEmbed(\n        in_channels=C,\n        embed_dims=embed_dims,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=0,\n        dilation=2,\n        norm_cfg=dict(type='LN'),\n        input_size=input_size)\n\n    x3, shape = patch_merge_3(dummy_input)\n    # test out shape\n    assert x3.shape == (2, 1, 10)\n    # test outsize is correct\n    assert shape == (1, 1)\n    # test L = out_h * out_w\n    assert shape[0] * shape[1] == x3.shape[1]\n\n    # test the init_out_size with nn.Unfold\n    assert patch_merge_3.init_out_size[1] == (input_size[0] - 2 * 4 -\n                                              1) // 2 + 1\n    assert patch_merge_3.init_out_size[0] == (input_size[0] - 2 * 4 -\n                                              1) // 2 + 1\n    H = 11\n    W = 12\n    input_size = (H, W)\n    dummy_input = torch.rand(B, C, H, W)\n    # test stride and norm\n    patch_merge_3 = PatchEmbed(\n        in_channels=C,\n        embed_dims=embed_dims,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=0,\n        dilation=2,\n        norm_cfg=dict(type='LN'),\n        input_size=input_size)\n\n    
_, shape = patch_merge_3(dummy_input)\n    # when input_size equal to real input\n    # the out_size should be equal to `init_out_size`\n    assert shape == patch_merge_3.init_out_size\n\n    input_size = (H, W)\n    dummy_input = torch.rand(B, C, H, W)\n    # test stride and norm\n    patch_merge_3 = PatchEmbed(\n        in_channels=C,\n        embed_dims=embed_dims,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=0,\n        dilation=2,\n        norm_cfg=dict(type='LN'),\n        input_size=input_size)\n\n    _, shape = patch_merge_3(dummy_input)\n    # when input_size equal to real input\n    # the out_size should be equal to `init_out_size`\n    assert shape == patch_merge_3.init_out_size\n\n    # test adap padding\n    for padding in ('same', 'corner'):\n        in_c = 2\n        embed_dims = 3\n        B = 2\n\n        # test stride is 1\n        input_size = (5, 5)\n        kernel_size = (5, 5)\n        stride = (1, 1)\n        dilation = 1\n        bias = False\n\n        x = torch.rand(B, in_c, *input_size)\n        patch_embed = PatchEmbed(\n            in_channels=in_c,\n            embed_dims=embed_dims,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_embed(x)\n        assert x_out.size() == (B, 25, 3)\n        assert out_size == (5, 5)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n        # test kernel_size == stride\n        input_size = (5, 5)\n        kernel_size = (5, 5)\n        stride = (5, 5)\n        dilation = 1\n        bias = False\n\n        x = torch.rand(B, in_c, *input_size)\n        patch_embed = PatchEmbed(\n            in_channels=in_c,\n            embed_dims=embed_dims,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = 
patch_embed(x)\n        assert x_out.size() == (B, 1, 3)\n        assert out_size == (1, 1)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n        # test kernel_size == stride\n        input_size = (6, 5)\n        kernel_size = (5, 5)\n        stride = (5, 5)\n        dilation = 1\n        bias = False\n\n        x = torch.rand(B, in_c, *input_size)\n        patch_embed = PatchEmbed(\n            in_channels=in_c,\n            embed_dims=embed_dims,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_embed(x)\n        assert x_out.size() == (B, 2, 3)\n        assert out_size == (2, 1)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n        # test different kernel_size with different stride\n        input_size = (6, 5)\n        kernel_size = (6, 2)\n        stride = (6, 2)\n        dilation = 1\n        bias = False\n\n        x = torch.rand(B, in_c, *input_size)\n        patch_embed = PatchEmbed(\n            in_channels=in_c,\n            embed_dims=embed_dims,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_embed(x)\n        assert x_out.size() == (B, 3, 3)\n        assert out_size == (1, 3)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n\ndef test_patch_merging():\n\n    # Test the model with int padding\n    in_c = 3\n    out_c = 4\n    kernel_size = 3\n    stride = 3\n    padding = 1\n    dilation = 1\n    bias = False\n    # test the case `pad_to_stride` is False\n    patch_merge = PatchMerging(\n        in_channels=in_c,\n        out_channels=out_c,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        bias=bias)\n    B, L, C = 1, 100, 3\n    input_size = (10, 10)\n    x = 
torch.rand(B, L, C)\n    x_out, out_size = patch_merge(x, input_size)\n    assert x_out.size() == (1, 16, 4)\n    assert out_size == (4, 4)\n    # assert out size is consistent with real output\n    assert x_out.size(1) == out_size[0] * out_size[1]\n    in_c = 4\n    out_c = 5\n    kernel_size = 6\n    stride = 3\n    padding = 2\n    dilation = 2\n    bias = False\n    patch_merge = PatchMerging(\n        in_channels=in_c,\n        out_channels=out_c,\n        kernel_size=kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        bias=bias)\n    B, L, C = 1, 100, 4\n    input_size = (10, 10)\n    x = torch.rand(B, L, C)\n    x_out, out_size = patch_merge(x, input_size)\n    assert x_out.size() == (1, 4, 5)\n    assert out_size == (2, 2)\n    # assert out size is consistent with real output\n    assert x_out.size(1) == out_size[0] * out_size[1]\n\n    # Test with adaptive padding\n    for padding in ('same', 'corner'):\n        in_c = 2\n        out_c = 3\n        B = 2\n\n        # test stride is 1\n        input_size = (5, 5)\n        kernel_size = (5, 5)\n        stride = (1, 1)\n        dilation = 1\n        bias = False\n        L = input_size[0] * input_size[1]\n\n        x = torch.rand(B, L, in_c)\n        patch_merge = PatchMerging(\n            in_channels=in_c,\n            out_channels=out_c,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_merge(x, input_size)\n        assert x_out.size() == (B, 25, 3)\n        assert out_size == (5, 5)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n        # test kernel_size == stride\n        input_size = (5, 5)\n        kernel_size = (5, 5)\n        stride = (5, 5)\n        dilation = 1\n        bias = False\n        L = input_size[0] * input_size[1]\n\n        x = torch.rand(B, L, in_c)\n        patch_merge = PatchMerging(\n  
          in_channels=in_c,\n            out_channels=out_c,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_merge(x, input_size)\n        assert x_out.size() == (B, 1, 3)\n        assert out_size == (1, 1)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n        # test kernel_size == stride\n        input_size = (6, 5)\n        kernel_size = (5, 5)\n        stride = (5, 5)\n        dilation = 1\n        bias = False\n        L = input_size[0] * input_size[1]\n\n        x = torch.rand(B, L, in_c)\n        patch_merge = PatchMerging(\n            in_channels=in_c,\n            out_channels=out_c,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_merge(x, input_size)\n        assert x_out.size() == (B, 2, 3)\n        assert out_size == (2, 1)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n        # test different kernel_size with different stride\n        input_size = (6, 5)\n        kernel_size = (6, 2)\n        stride = (6, 2)\n        dilation = 1\n        bias = False\n        L = input_size[0] * input_size[1]\n\n        x = torch.rand(B, L, in_c)\n        patch_merge = PatchMerging(\n            in_channels=in_c,\n            out_channels=out_c,\n            kernel_size=kernel_size,\n            stride=stride,\n            padding=padding,\n            dilation=dilation,\n            bias=bias)\n\n        x_out, out_size = patch_merge(x, input_size)\n        assert x_out.size() == (B, 3, 3)\n        assert out_size == (1, 3)\n        assert x_out.size(1) == out_size[0] * out_size[1]\n\n\ndef test_multiheadattention():\n    MultiheadAttention(\n        embed_dims=5,\n        num_heads=5,\n        attn_drop=0,\n        proj_drop=0,\n        
dropout_layer=dict(type='Dropout', drop_prob=0.),\n        batch_first=True)\n    batch_dim = 2\n    embed_dim = 5\n    num_query = 100\n    attn_batch_first = MultiheadAttention(\n        embed_dims=5,\n        num_heads=5,\n        attn_drop=0,\n        proj_drop=0,\n        dropout_layer=dict(type='DropPath', drop_prob=0.),\n        batch_first=True)\n\n    attn_query_first = MultiheadAttention(\n        embed_dims=5,\n        num_heads=5,\n        attn_drop=0,\n        proj_drop=0,\n        dropout_layer=dict(type='DropPath', drop_prob=0.),\n        batch_first=False)\n\n    param_dict = dict(attn_query_first.named_parameters())\n    for n, v in attn_batch_first.named_parameters():\n        param_dict[n].data = v.data\n\n    input_batch_first = torch.rand(batch_dim, num_query, embed_dim)\n    input_query_first = input_batch_first.transpose(0, 1)\n\n    assert torch.allclose(\n        attn_query_first(input_query_first).sum(),\n        attn_batch_first(input_batch_first).sum())\n\n    key_batch_first = torch.rand(batch_dim, num_query, embed_dim)\n    key_query_first = key_batch_first.transpose(0, 1)\n\n    assert torch.allclose(\n        attn_query_first(input_query_first, key_query_first).sum(),\n        attn_batch_first(input_batch_first, key_batch_first).sum())\n\n    identity = torch.ones_like(input_query_first)\n\n    # check deprecated arguments can be used normally\n\n    assert torch.allclose(\n        attn_query_first(\n            input_query_first, key_query_first, residual=identity).sum(),\n        attn_batch_first(input_batch_first, key_batch_first).sum() +\n        identity.sum() - input_batch_first.sum())\n\n    assert torch.allclose(\n        attn_query_first(\n            input_query_first, key_query_first, identity=identity).sum(),\n        attn_batch_first(input_batch_first, key_batch_first).sum() +\n        identity.sum() - input_batch_first.sum())\n\n    attn_query_first(\n        input_query_first, key_query_first, 
identity=identity).sum(),\n\n\ndef test_ffn():\n    with pytest.raises(AssertionError):\n        # num_fcs should be no less than 2\n        FFN(num_fcs=1)\n    ffn = FFN(dropout=0, add_identity=True)\n\n    input_tensor = torch.rand(2, 20, 256)\n    input_tensor_nbc = input_tensor.transpose(0, 1)\n    assert torch.allclose(ffn(input_tensor).sum(), ffn(input_tensor_nbc).sum())\n    residual = torch.rand_like(input_tensor)\n    torch.allclose(\n        ffn(input_tensor, residual=residual).sum(),\n        ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())\n\n    torch.allclose(\n        ffn(input_tensor, identity=residual).sum(),\n        ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())\n\n    # test with layer_scale\n    ffn = FFN(dropout=0, add_identity=True, layer_scale_init_value=0.1)\n\n    input_tensor = torch.rand(2, 20, 256)\n    input_tensor_nbc = input_tensor.transpose(0, 1)\n    assert torch.allclose(ffn(input_tensor).sum(), ffn(input_tensor_nbc).sum())\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='Cuda not available')\ndef test_basetransformerlayer_cuda():\n    # To test if the BaseTransformerLayer's behaviour remains\n    # consistent after being deepcopied\n    operation_order = ('self_attn', 'ffn')\n    baselayer = BaseTransformerLayer(\n        operation_order=operation_order,\n        batch_first=True,\n        attn_cfgs=dict(\n            type='MultiheadAttention',\n            embed_dims=256,\n            num_heads=8,\n        ),\n    )\n    baselayers = ModuleList([copy.deepcopy(baselayer) for _ in range(2)])\n    baselayers.to('cuda')\n    x = torch.rand(2, 10, 256).cuda()\n    for m in baselayers:\n        x = m(x)\n        assert x.shape == torch.Size([2, 10, 256])\n\n\n@pytest.mark.parametrize('embed_dims', [False, 256])\ndef test_basetransformerlayer(embed_dims):\n    attn_cfgs = dict(type='MultiheadAttention', embed_dims=256, num_heads=8),\n    if embed_dims:\n        ffn_cfgs = dict(\n            
type='FFN',\n            embed_dims=embed_dims,\n            feedforward_channels=1024,\n            num_fcs=2,\n            ffn_drop=0.,\n            act_cfg=dict(type='ReLU', inplace=True),\n        )\n    else:\n        ffn_cfgs = dict(\n            type='FFN',\n            feedforward_channels=1024,\n            num_fcs=2,\n            ffn_drop=0.,\n            act_cfg=dict(type='ReLU', inplace=True),\n        )\n\n    feedforward_channels = 2048\n    ffn_dropout = 0.1\n    operation_order = ('self_attn', 'norm', 'ffn', 'norm')\n\n    # test deprecated_args\n    baselayer = BaseTransformerLayer(\n        attn_cfgs=attn_cfgs,\n        ffn_cfgs=ffn_cfgs,\n        feedforward_channels=feedforward_channels,\n        ffn_dropout=ffn_dropout,\n        operation_order=operation_order)\n    assert baselayer.batch_first is False\n    assert baselayer.ffns[0].feedforward_channels == feedforward_channels\n\n    attn_cfgs = dict(type='MultiheadAttention', num_heads=8, embed_dims=256),\n    feedforward_channels = 2048\n    ffn_dropout = 0.1\n    operation_order = ('self_attn', 'norm', 'ffn', 'norm')\n    baselayer = BaseTransformerLayer(\n        attn_cfgs=attn_cfgs,\n        feedforward_channels=feedforward_channels,\n        ffn_dropout=ffn_dropout,\n        operation_order=operation_order,\n        batch_first=True)\n    assert baselayer.attentions[0].batch_first\n    in_tensor = torch.rand(2, 10, 256)\n    baselayer(in_tensor)\n\n\ndef test_transformerlayersequence():\n    squeue = TransformerLayerSequence(\n        num_layers=6,\n        transformerlayers=dict(\n            type='BaseTransformerLayer',\n            attn_cfgs=[\n                dict(\n                    type='MultiheadAttention',\n                    embed_dims=256,\n                    num_heads=8,\n                    dropout=0.1),\n                dict(type='MultiheadAttention', embed_dims=256, num_heads=4)\n            ],\n            feedforward_channels=1024,\n            ffn_dropout=0.1,\n       
     operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',\n                             'norm')))\n    assert len(squeue.layers) == 6\n    assert squeue.pre_norm is False\n    with pytest.raises(AssertionError):\n        # if transformerlayers is a list, len(transformerlayers)\n        # should be equal to num_layers\n        TransformerLayerSequence(\n            num_layers=6,\n            transformerlayers=[\n                dict(\n                    type='BaseTransformerLayer',\n                    attn_cfgs=[\n                        dict(\n                            type='MultiheadAttention',\n                            embed_dims=256,\n                            num_heads=8,\n                            dropout=0.1),\n                        dict(type='MultiheadAttention', embed_dims=256)\n                    ],\n                    feedforward_channels=1024,\n                    ffn_dropout=0.1,\n                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',\n                                     'ffn', 'norm'))\n            ])\n\n\ndef test_drop_path():\n    drop_path = DropPath(drop_prob=0)\n    test_in = torch.rand(2, 3, 4, 5)\n    assert test_in is drop_path(test_in)\n\n    drop_path = DropPath(drop_prob=0.1)\n    drop_path.training = False\n    test_in = torch.rand(2, 3, 4, 5)\n    assert test_in is drop_path(test_in)\n    drop_path.training = True\n    assert test_in is not drop_path(test_in)\n"
  },
  {
    "path": "tests/test_cnn/test_wrappers.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nfrom unittest.mock import patch\n\nimport pytest\nimport torch\nimport torch.nn as nn\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nfrom mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,\n                             Linear, MaxPool2d, MaxPool3d)\n\nif torch.__version__ != 'parrots':\n    torch_version = '1.1'\nelse:\n    torch_version = 'parrots'\n\n\n@patch('torch.__version__', torch_version)\n@pytest.mark.parametrize(\n    'in_w,in_h,in_channel,out_channel,kernel_size,stride,padding,dilation',\n    [(10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 3, 3, 5, 2, 1, 2)])\ndef test_conv2d(in_w, in_h, in_channel, out_channel, kernel_size, stride,\n                padding, dilation):\n    \"\"\"\n    CommandLine:\n        xdoctest -m tests/test_wrappers.py test_conv2d\n    \"\"\"\n    # train mode\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_channel, in_h, in_w)\n    torch.manual_seed(0)\n    wrapper = Conv2d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation)\n    wrapper_out = wrapper(x_empty)\n\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_channel, in_h, in_w).requires_grad_(True)\n    torch.manual_seed(0)\n    ref = nn.Conv2d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation)\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    wrapper_out.sum().backward()\n    assert wrapper.weight.grad is not None\n    assert wrapper.weight.grad.shape == wrapper.weight.shape\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n    # eval mode\n    x_empty = torch.randn(0, in_channel, in_h, in_w)\n    wrapper = Conv2d(\n        
in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation)\n    wrapper.eval()\n    wrapper(x_empty)\n\n\n@patch('torch.__version__', torch_version)\n@pytest.mark.parametrize(\n    'in_w,in_h,in_t,in_channel,out_channel,kernel_size,stride,padding,dilation',  # noqa: E501\n    [(10, 10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 20, 3, 3, 5, 2, 1, 2)])\ndef test_conv3d(in_w, in_h, in_t, in_channel, out_channel, kernel_size, stride,\n                padding, dilation):\n    \"\"\"\n    CommandLine:\n        xdoctest -m tests/test_wrappers.py test_conv3d\n    \"\"\"\n    # train mode\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_channel, in_t, in_h, in_w)\n    torch.manual_seed(0)\n    wrapper = Conv3d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation)\n    wrapper_out = wrapper(x_empty)\n\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_channel, in_t, in_h,\n                           in_w).requires_grad_(True)\n    torch.manual_seed(0)\n    ref = nn.Conv3d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation)\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    wrapper_out.sum().backward()\n    assert wrapper.weight.grad is not None\n    assert wrapper.weight.grad.shape == wrapper.weight.shape\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n    # eval mode\n    x_empty = torch.randn(0, in_channel, in_t, in_h, in_w)\n    wrapper = Conv3d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation)\n    wrapper.eval()\n    wrapper(x_empty)\n\n\n@patch('torch.__version__', 
torch_version)\n@pytest.mark.parametrize(\n    'in_w,in_h,in_channel,out_channel,kernel_size,stride,padding,dilation',\n    [(10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 3, 3, 5, 2, 1, 2)])\ndef test_conv_transposed_2d(in_w, in_h, in_channel, out_channel, kernel_size,\n                            stride, padding, dilation):\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_channel, in_h, in_w, requires_grad=True)\n    # out padding must be smaller than either stride or dilation\n    op = min(stride, dilation) - 1\n    if torch.__version__ == 'parrots':\n        op = 0\n    torch.manual_seed(0)\n    wrapper = ConvTranspose2d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        output_padding=op)\n    wrapper_out = wrapper(x_empty)\n\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_channel, in_h, in_w)\n    torch.manual_seed(0)\n    ref = nn.ConvTranspose2d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        output_padding=op)\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    wrapper_out.sum().backward()\n    assert wrapper.weight.grad is not None\n    assert wrapper.weight.grad.shape == wrapper.weight.shape\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n    # eval mode\n    x_empty = torch.randn(0, in_channel, in_h, in_w)\n    wrapper = ConvTranspose2d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        output_padding=op)\n    wrapper.eval()\n    wrapper(x_empty)\n\n\n@patch('torch.__version__', torch_version)\n@pytest.mark.parametrize(\n    'in_w,in_h,in_t,in_channel,out_channel,kernel_size,stride,padding,dilation',  # noqa: E501\n   
 [(10, 10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 20, 3, 3, 5, 2, 1, 2)])\ndef test_conv_transposed_3d(in_w, in_h, in_t, in_channel, out_channel,\n                            kernel_size, stride, padding, dilation):\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_channel, in_t, in_h, in_w, requires_grad=True)\n    # out padding must be smaller than either stride or dilation\n    op = min(stride, dilation) - 1\n    torch.manual_seed(0)\n    wrapper = ConvTranspose3d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        output_padding=op)\n    wrapper_out = wrapper(x_empty)\n\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_channel, in_t, in_h, in_w)\n    torch.manual_seed(0)\n    ref = nn.ConvTranspose3d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        output_padding=op)\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    wrapper_out.sum().backward()\n    assert wrapper.weight.grad is not None\n    assert wrapper.weight.grad.shape == wrapper.weight.shape\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n    # eval mode\n    x_empty = torch.randn(0, in_channel, in_t, in_h, in_w)\n    wrapper = ConvTranspose3d(\n        in_channel,\n        out_channel,\n        kernel_size,\n        stride=stride,\n        padding=padding,\n        dilation=dilation,\n        output_padding=op)\n    wrapper.eval()\n    wrapper(x_empty)\n\n\n@patch('torch.__version__', torch_version)\n@pytest.mark.parametrize(\n    'in_w,in_h,in_channel,out_channel,kernel_size,stride,padding,dilation',\n    [(10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 3, 3, 5, 2, 1, 2)])\ndef test_max_pool_2d(in_w, in_h, in_channel, out_channel, kernel_size, stride,\n                     
padding, dilation):\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_channel, in_h, in_w, requires_grad=True)\n    wrapper = MaxPool2d(\n        kernel_size, stride=stride, padding=padding, dilation=dilation)\n    wrapper_out = wrapper(x_empty)\n\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_channel, in_h, in_w)\n    ref = nn.MaxPool2d(\n        kernel_size, stride=stride, padding=padding, dilation=dilation)\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n\n@patch('torch.__version__', torch_version)\n@pytest.mark.parametrize(\n    'in_w,in_h,in_t,in_channel,out_channel,kernel_size,stride,padding,dilation',  # noqa: E501\n    [(10, 10, 10, 1, 1, 3, 1, 0, 1), (20, 20, 20, 3, 3, 5, 2, 1, 2)])\n@pytest.mark.skipif(\n    torch.__version__ == 'parrots' and not torch.cuda.is_available(),\n    reason='parrots requires CUDA support')\ndef test_max_pool_3d(in_w, in_h, in_t, in_channel, out_channel, kernel_size,\n                     stride, padding, dilation):\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_channel, in_t, in_h, in_w, requires_grad=True)\n    wrapper = MaxPool3d(\n        kernel_size, stride=stride, padding=padding, dilation=dilation)\n    if torch.__version__ == 'parrots':\n        x_empty = x_empty.cuda()\n    wrapper_out = wrapper(x_empty)\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_channel, in_t, in_h, in_w)\n    ref = nn.MaxPool3d(\n        kernel_size, stride=stride, padding=padding, dilation=dilation)\n    if torch.__version__ == 'parrots':\n        x_normal = x_normal.cuda()\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n\n@patch('torch.__version__', 
torch_version)\n@pytest.mark.parametrize('in_w,in_h,in_feature,out_feature', [(10, 10, 1, 1),\n                                                              (20, 20, 3, 3)])\ndef test_linear(in_w, in_h, in_feature, out_feature):\n    # wrapper op with 0-dim input\n    x_empty = torch.randn(0, in_feature, requires_grad=True)\n    torch.manual_seed(0)\n    wrapper = Linear(in_feature, out_feature)\n    wrapper_out = wrapper(x_empty)\n\n    # torch op with 3-dim input as shape reference\n    x_normal = torch.randn(3, in_feature)\n    torch.manual_seed(0)\n    ref = nn.Linear(in_feature, out_feature)\n    ref_out = ref(x_normal)\n\n    assert wrapper_out.shape[0] == 0\n    assert wrapper_out.shape[1:] == ref_out.shape[1:]\n\n    wrapper_out.sum().backward()\n    assert wrapper.weight.grad is not None\n    assert wrapper.weight.grad.shape == wrapper.weight.shape\n\n    assert torch.equal(wrapper(x_normal), ref_out)\n\n    # eval mode\n    x_empty = torch.randn(0, in_feature)\n    wrapper = Linear(in_feature, out_feature)\n    wrapper.eval()\n    wrapper(x_empty)\n\n\n@patch('mmcv.cnn.bricks.wrappers.TORCH_VERSION', (1, 10))\ndef test_nn_op_forward_called():\n\n    for m in ['Conv2d', 'ConvTranspose2d', 'MaxPool2d']:\n        with patch(f'torch.nn.{m}.forward') as nn_module_forward:\n            # randn input\n            x_empty = torch.randn(0, 3, 10, 10)\n            wrapper = eval(m)(3, 2, 1)\n            wrapper(x_empty)\n            nn_module_forward.assert_called_with(x_empty)\n\n            # non-randn input\n            x_normal = torch.randn(1, 3, 10, 10)\n            wrapper = eval(m)(3, 2, 1)\n            wrapper(x_normal)\n            nn_module_forward.assert_called_with(x_normal)\n\n    for m in ['Conv3d', 'ConvTranspose3d', 'MaxPool3d']:\n        with patch(f'torch.nn.{m}.forward') as nn_module_forward:\n            # randn input\n            x_empty = torch.randn(0, 3, 10, 10, 10)\n            wrapper = eval(m)(3, 2, 1)\n            wrapper(x_empty)\n     
       nn_module_forward.assert_called_with(x_empty)\n\n            # non-randn input\n            x_normal = torch.randn(1, 3, 10, 10, 10)\n            wrapper = eval(m)(3, 2, 1)\n            wrapper(x_normal)\n            nn_module_forward.assert_called_with(x_normal)\n\n    with patch('torch.nn.Linear.forward') as nn_module_forward:\n        # randn input\n        x_empty = torch.randn(0, 3)\n        wrapper = Linear(3, 3)\n        wrapper(x_empty)\n        nn_module_forward.assert_called_with(x_empty)\n\n        # non-randn input\n        x_normal = torch.randn(1, 3)\n        wrapper = Linear(3, 3)\n        wrapper(x_normal)\n        nn_module_forward.assert_called_with(x_normal)\n\n\n@pytest.mark.skipif(\n    digit_version(TORCH_VERSION) < digit_version('1.10'),\n    reason='MaxPool2d and MaxPool3d will fail fx for torch<=1.9')\ndef test_fx_compatibility():\n    from torch import fx\n\n    # ensure the fx trace can pass the network\n    for Net in (MaxPool2d, MaxPool3d):\n        net = Net(1)\n        gm_module = fx.symbolic_trace(net)  # noqa: F841\n    for Net in (Linear, ):\n        net = Net(1, 1)\n        gm_module = fx.symbolic_trace(net)  # noqa: F841\n    for Net in (Conv2d, ConvTranspose2d, Conv3d, ConvTranspose3d):\n        net = Net(1, 1, 1)\n        gm_module = fx.symbolic_trace(net)  # noqa: F841\n"
  },
  {
    "path": "tests/test_image/test_colorspace.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport cv2\nimport numpy as np\nimport pytest\nfrom numpy.testing import assert_array_almost_equal, assert_array_equal\n\nimport mmcv\nfrom mmcv.image.colorspace import (_convert_input_type_range,\n                                   _convert_output_type_range)\n\n\ndef test_bgr2gray():\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.bgr2gray(in_img)\n    computed_gray = (\n        in_img[:, :, 0] * 0.114 + in_img[:, :, 1] * 0.587 +\n        in_img[:, :, 2] * 0.299)\n    assert_array_almost_equal(out_img, computed_gray, decimal=4)\n    out_img_3d = mmcv.bgr2gray(in_img, True)\n    assert out_img_3d.shape == (10, 10, 1)\n    assert_array_almost_equal(out_img_3d[..., 0], out_img, decimal=4)\n\n\ndef test_rgb2gray():\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.rgb2gray(in_img)\n    computed_gray = (\n        in_img[:, :, 0] * 0.299 + in_img[:, :, 1] * 0.587 +\n        in_img[:, :, 2] * 0.114)\n    assert_array_almost_equal(out_img, computed_gray, decimal=4)\n    out_img_3d = mmcv.rgb2gray(in_img, True)\n    assert out_img_3d.shape == (10, 10, 1)\n    assert_array_almost_equal(out_img_3d[..., 0], out_img, decimal=4)\n\n\ndef test_gray2bgr():\n    in_img = np.random.rand(10, 10).astype(np.float32)\n    out_img = mmcv.gray2bgr(in_img)\n    assert out_img.shape == (10, 10, 3)\n    for i in range(3):\n        assert_array_almost_equal(out_img[..., i], in_img, decimal=4)\n\n\ndef test_gray2rgb():\n    in_img = np.random.rand(10, 10).astype(np.float32)\n    out_img = mmcv.gray2rgb(in_img)\n    assert out_img.shape == (10, 10, 3)\n    for i in range(3):\n        assert_array_almost_equal(out_img[..., i], in_img, decimal=4)\n\n\ndef test_bgr2rgb():\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.bgr2rgb(in_img)\n    assert out_img.shape == in_img.shape\n    assert_array_equal(out_img[..., 0], in_img[..., 2])\n    
assert_array_equal(out_img[..., 1], in_img[..., 1])\n    assert_array_equal(out_img[..., 2], in_img[..., 0])\n\n\ndef test_rgb2bgr():\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.rgb2bgr(in_img)\n    assert out_img.shape == in_img.shape\n    assert_array_equal(out_img[..., 0], in_img[..., 2])\n    assert_array_equal(out_img[..., 1], in_img[..., 1])\n    assert_array_equal(out_img[..., 2], in_img[..., 0])\n\n\ndef test_bgr2hsv():\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.bgr2hsv(in_img)\n    argmax = in_img.argmax(axis=2)\n    computed_hsv = np.empty_like(in_img)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            b, g, r = in_img[i, j]\n            v = max(r, g, b)\n            s = (v - min(r, g, b)) / v if v != 0 else 0\n            if argmax[i, j] == 0:\n                h = 240 + 60 * (r - g) / (v - min(r, g, b))\n            elif argmax[i, j] == 1:\n                h = 120 + 60 * (b - r) / (v - min(r, g, b))\n            else:\n                h = 60 * (g - b) / (v - min(r, g, b))\n            if h < 0:\n                h += 360\n            computed_hsv[i, j, :] = [h, s, v]\n    assert_array_almost_equal(out_img, computed_hsv, decimal=2)\n\n\ndef test_convert_input_type_range():\n    with pytest.raises(TypeError):\n        # The img type should be np.float32 or np.uint8\n        in_img = np.random.rand(10, 10, 3).astype(np.uint64)\n        _convert_input_type_range(in_img)\n    # np.float32\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = _convert_input_type_range(in_img)\n    assert out_img.dtype == np.float32\n    assert np.absolute(out_img).mean() < 1\n    # np.uint8\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = _convert_input_type_range(in_img)\n    assert out_img.dtype == np.float32\n    assert np.absolute(out_img).mean() < 1\n\n\ndef test_convert_output_type_range():\n    with 
pytest.raises(TypeError):\n        # The dst_type should be np.float32 or np.uint8\n        in_img = np.random.rand(10, 10, 3).astype(np.float32)\n        _convert_output_type_range(in_img, np.uint64)\n    # np.float32\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.float32)\n    out_img = _convert_output_type_range(in_img, np.float32)\n    assert out_img.dtype == np.float32\n    assert np.absolute(out_img).mean() < 1\n    # np.uint8\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.float32)\n    out_img = _convert_output_type_range(in_img, np.uint8)\n    assert out_img.dtype == np.uint8\n    assert np.absolute(out_img).mean() > 1\n\n\ndef assert_image_almost_equal(x, y, atol=1):\n    assert x.dtype == np.uint8\n    assert y.dtype == np.uint8\n    assert np.all(np.abs(x.astype(np.int32) - y.astype(np.int32)) <= atol)\n\n\ndef test_rgb2ycbcr():\n    with pytest.raises(TypeError):\n        # The img type should be np.float32 or np.uint8\n        in_img = np.random.rand(10, 10, 3).astype(np.uint64)\n        mmcv.rgb2ycbcr(in_img)\n\n    # float32\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.rgb2ycbcr(in_img)\n    computed_ycbcr = np.empty_like(in_img)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            r, g, b = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            cb = 128 - r * 37.797 - g * 74.203 + b * 112.0\n            cr = 128 + r * 112.0 - g * 93.786 - b * 18.214\n            computed_ycbcr[i, j, :] = [y, cb, cr]\n    computed_ycbcr /= 255.\n    assert_array_almost_equal(out_img, computed_ycbcr, decimal=2)\n    # y_only=True\n    out_img = mmcv.rgb2ycbcr(in_img, y_only=True)\n    computed_y = np.empty_like(out_img, dtype=out_img.dtype)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            r, g, b = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            computed_y[i, j] 
= y\n    computed_y /= 255.\n    assert_array_almost_equal(out_img, computed_y, decimal=2)\n\n    # uint8\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = mmcv.rgb2ycbcr(in_img)\n    computed_ycbcr = np.empty_like(in_img)\n    in_img = in_img / 255.\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            r, g, b = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            cb = 128 - r * 37.797 - g * 74.203 + b * 112.0\n            cr = 128 + r * 112.0 - g * 93.786 - b * 18.214\n            y, cb, cr = y.round(), cb.round(), cr.round()\n            computed_ycbcr[i, j, :] = [y, cb, cr]\n    assert_image_almost_equal(out_img, computed_ycbcr)\n    # y_only=True\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = mmcv.rgb2ycbcr(in_img, y_only=True)\n    computed_y = np.empty_like(out_img, dtype=out_img.dtype)\n    in_img = in_img / 255.\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            r, g, b = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            y = y.round()\n            computed_y[i, j] = y\n    assert_image_almost_equal(out_img, computed_y)\n\n\ndef test_bgr2ycbcr():\n    # float32\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.bgr2ycbcr(in_img)\n    computed_ycbcr = np.empty_like(in_img)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            b, g, r = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            cb = 128 - r * 37.797 - g * 74.203 + b * 112.0\n            cr = 128 + r * 112.0 - g * 93.786 - b * 18.214\n            computed_ycbcr[i, j, :] = [y, cb, cr]\n    computed_ycbcr /= 255.\n    assert_array_almost_equal(out_img, computed_ycbcr, decimal=2)\n    # y_only=True\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.bgr2ycbcr(in_img, 
y_only=True)\n    computed_y = np.empty_like(out_img, dtype=out_img.dtype)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            b, g, r = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            computed_y[i, j] = y\n    computed_y /= 255.\n    assert_array_almost_equal(out_img, computed_y, decimal=2)\n\n    # uint8\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = mmcv.bgr2ycbcr(in_img)\n    computed_ycbcr = np.empty_like(in_img)\n    in_img = in_img / 255.\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            b, g, r = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            cb = 128 - r * 37.797 - g * 74.203 + b * 112.0\n            cr = 128 + r * 112.0 - g * 93.786 - b * 18.214\n            y, cb, cr = y.round(), cb.round(), cr.round()\n            computed_ycbcr[i, j, :] = [y, cb, cr]\n    assert_image_almost_equal(out_img, computed_ycbcr)\n    # y_only = True\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = mmcv.bgr2ycbcr(in_img, y_only=True)\n    computed_y = np.empty_like(out_img, dtype=out_img.dtype)\n    in_img = in_img / 255.\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            b, g, r = in_img[i, j]\n            y = 16 + r * 65.481 + g * 128.553 + b * 24.966\n            y = y.round()\n            computed_y[i, j] = y\n    assert_image_almost_equal(out_img, computed_y)\n\n\ndef test_ycbcr2rgb():\n    with pytest.raises(TypeError):\n        # The img type should be np.float32 or np.uint8\n        in_img = np.random.rand(10, 10, 3).astype(np.uint64)\n        mmcv.ycbcr2rgb(in_img)\n\n    # float32\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.ycbcr2rgb(in_img)\n    computed_rgb = np.empty_like(in_img)\n    in_img *= 255.\n    for i in range(in_img.shape[0]):\n        for j in 
range(in_img.shape[1]):\n            y, cb, cr = in_img[i, j]\n            r = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255\n            g = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \\\n                cr * 0.00318811 * 255\n            b = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255\n            computed_rgb[i, j, :] = [r, g, b]\n    computed_rgb /= 255.\n    assert_array_almost_equal(out_img, computed_rgb, decimal=2)\n\n    # uint8\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = mmcv.ycbcr2rgb(in_img)\n    computed_rgb = np.empty_like(in_img)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            y, cb, cr = in_img[i, j]\n            r = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255\n            g = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \\\n                cr * 0.00318811 * 255\n            b = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255\n            r, g, b = r.round(), g.round(), b.round()\n            computed_rgb[i, j, :] = [r, g, b]\n    assert_image_almost_equal(out_img, computed_rgb)\n\n\ndef test_ycbcr2bgr():\n    # float32\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.ycbcr2bgr(in_img)\n    computed_bgr = np.empty_like(in_img)\n    in_img *= 255.\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            y, cb, cr = in_img[i, j]\n            r = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255\n            g = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \\\n                cr * 0.00318811 * 255\n            b = -276.836 + y * 0.00456621 * 255. 
+ cb * 0.00791071 * 255\n            computed_bgr[i, j, :] = [b, g, r]\n    computed_bgr /= 255.\n    assert_array_almost_equal(out_img, computed_bgr, decimal=2)\n\n    # uint8\n    in_img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n    out_img = mmcv.ycbcr2bgr(in_img)\n    computed_bgr = np.empty_like(in_img)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            y, cb, cr = in_img[i, j]\n            r = -222.921 + y * 0.00456621 * 255 + cr * 0.00625893 * 255\n            g = 135.576 + y * 0.00456621 * 255 - cb * 0.00153632 * 255 - \\\n                cr * 0.00318811 * 255\n            b = -276.836 + y * 0.00456621 * 255. + cb * 0.00791071 * 255\n            r, g, b = r.round(), g.round(), b.round()\n            computed_bgr[i, j, :] = [b, g, r]\n    assert_image_almost_equal(out_img, computed_bgr)\n\n\ndef test_bgr2hls():\n    in_img = np.random.rand(10, 10, 3).astype(np.float32)\n    out_img = mmcv.bgr2hls(in_img)\n    argmax = in_img.argmax(axis=2)\n    computed_hls = np.empty_like(in_img)\n    for i in range(in_img.shape[0]):\n        for j in range(in_img.shape[1]):\n            b, g, r = in_img[i, j]\n            maxc = max(r, g, b)\n            minc = min(r, g, b)\n            _l = (minc + maxc) / 2.0\n            if minc == maxc:\n                h = 0.0\n                s = 0.0\n            if _l <= 0.5:\n                s = (maxc - minc) / (maxc + minc)\n            else:\n                s = (maxc - minc) / (2.0 - maxc - minc)\n            if argmax[i, j] == 2:\n                h = 60 * (g - b) / (maxc - minc)\n            elif argmax[i, j] == 1:\n                h = 60 * (2.0 + (b - r) / (maxc - minc))\n            else:\n                h = 60 * (4.0 + (r - g) / (maxc - minc))\n            if h < 0:\n                h += 360\n            computed_hls[i, j, :] = [h, _l, s]\n    assert_array_almost_equal(out_img, computed_hls, decimal=2)\n\n\n@pytest.mark.parametrize('src,dst,ref', [('bgr', 'gray', 
cv2.COLOR_BGR2GRAY),\n                                         ('rgb', 'gray', cv2.COLOR_RGB2GRAY),\n                                         ('bgr', 'rgb', cv2.COLOR_BGR2RGB),\n                                         ('rgb', 'bgr', cv2.COLOR_RGB2BGR),\n                                         ('bgr', 'hsv', cv2.COLOR_BGR2HSV),\n                                         ('hsv', 'bgr', cv2.COLOR_HSV2BGR),\n                                         ('bgr', 'hls', cv2.COLOR_BGR2HLS),\n                                         ('hls', 'bgr', cv2.COLOR_HLS2BGR)])\ndef test_imconvert(src, dst, ref):\n    img = np.random.rand(10, 10, 3).astype(np.float32)\n    assert_array_equal(mmcv.imconvert(img, src, dst), cv2.cvtColor(img, ref))\n"
  },
  {
    "path": "tests/test_image/test_geometric.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os.path as osp\n\nimport cv2\nimport numpy as np\nimport pytest\nfrom numpy.testing import assert_array_equal\n\nimport mmcv\n\n\nclass TestGeometric:\n\n    @classmethod\n    def setup_class(cls):\n        cls.data_dir = osp.join(osp.dirname(__file__), '../data')\n        # the test img resolution is 400x300\n        cls.img_path = osp.join(cls.data_dir, 'color.jpg')\n        cls.img = cv2.imread(cls.img_path)\n\n    def test_imresize(self):\n        resized_img = mmcv.imresize(self.img, (1000, 600))\n        assert resized_img.shape == (600, 1000, 3)\n        resized_img, w_scale, h_scale = mmcv.imresize(self.img, (1000, 600),\n                                                      True)\n        assert (resized_img.shape == (600, 1000, 3) and w_scale == 2.5\n                and h_scale == 2.0)\n        resized_img_dst = np.empty((600, 1000, 3), dtype=self.img.dtype)\n        resized_img = mmcv.imresize(self.img, (1000, 600), out=resized_img_dst)\n        assert id(resized_img_dst) == id(resized_img)\n        assert_array_equal(resized_img_dst,\n                           mmcv.imresize(self.img, (1000, 600)))\n        for mode in ['nearest', 'bilinear', 'bicubic', 'area', 'lanczos']:\n            resized_img = mmcv.imresize(\n                self.img, (1000, 600), interpolation=mode)\n            assert resized_img.shape == (600, 1000, 3)\n\n        # test pillow resize\n        for mode in [\n                'nearest', 'bilinear', 'bicubic', 'box', 'lanczos', 'hamming'\n        ]:\n            resized_img = mmcv.imresize(\n                self.img, (1000, 600), interpolation=mode, backend='pillow')\n            assert resized_img.shape == (600, 1000, 3)\n\n        # resize backend must be 'cv2' or 'pillow'\n        with pytest.raises(ValueError):\n            mmcv.imresize(self.img, (1000, 600), backend='not support')\n\n    def test_imresize_to_multiple(self):\n        # test size and 
keep_ratio = False\n        resized_img = mmcv.imresize_to_multiple(\n            self.img, divisor=16, size=(511, 513), keep_ratio=False)\n        assert resized_img.shape == (528, 512, 3)\n        resized_img = mmcv.imresize_to_multiple(\n            self.img, divisor=(16, 32), size=(511, 513), keep_ratio=False)\n        assert resized_img.shape == (544, 512, 3)\n\n        # test size, keep_ratio = True, and return_scale\n        resized_img, w_scale, h_scale = mmcv.imresize_to_multiple(\n            self.img,\n            divisor=16,\n            size=(1000, 600),\n            keep_ratio=True,\n            return_scale=True)\n        assert resized_img.shape == (\n            608, 800, 3) and h_scale == 608 / 300 and w_scale == 800 / 400\n        resized_img, w_scale, h_scale = mmcv.imresize_to_multiple(\n            self.img,\n            divisor=(18, 16),\n            size=(1000, 600),\n            keep_ratio=True,\n            return_scale=True)\n        assert resized_img.shape == (\n            608, 810, 3) and h_scale == 608 / 300 and w_scale == 810 / 400\n\n        # test scale_factor and return_scale\n        resized_img, w_scale, h_scale = mmcv.imresize_to_multiple(\n            self.img, divisor=16, scale_factor=2, return_scale=True)\n        assert resized_img.shape == (\n            608, 800, 3) and h_scale == 608 / 300 and w_scale == 800 / 400\n        resized_img, w_scale, h_scale = mmcv.imresize_to_multiple(\n            self.img, divisor=16, scale_factor=(2, 3), return_scale=True)\n        assert resized_img.shape == (\n            912, 800, 3) and h_scale == 912 / 300 and w_scale == 800 / 400\n        resized_img, w_scale, h_scale = mmcv.imresize_to_multiple(\n            self.img, divisor=(18, 16), scale_factor=(2, 3), return_scale=True)\n        assert resized_img.shape == (\n            912, 810, 3) and h_scale == 912 / 300 and w_scale == 810 / 400\n\n        # one of size and scale_factor should be given\n        with 
pytest.raises(ValueError):\n            mmcv.imresize_to_multiple(\n                self.img, divisor=16, size=(1000, 600), scale_factor=2)\n        with pytest.raises(ValueError):\n            mmcv.imresize_to_multiple(\n                self.img, divisor=16, size=None, scale_factor=None)\n\n    def test_imresize_like(self):\n        a = np.zeros((100, 200, 3))\n        resized_img = mmcv.imresize_like(self.img, a)\n        assert resized_img.shape == (100, 200, 3)\n\n    def test_rescale_size(self):\n        new_size, scale_factor = mmcv.rescale_size((400, 300), 1.5, True)\n        assert new_size == (600, 450) and scale_factor == 1.5\n        new_size, scale_factor = mmcv.rescale_size((400, 300), 0.934, True)\n        assert new_size == (374, 280) and scale_factor == 0.934\n\n        new_size = mmcv.rescale_size((400, 300), 1.5)\n        assert new_size == (600, 450)\n        new_size = mmcv.rescale_size((400, 300), 0.934)\n        assert new_size == (374, 280)\n\n        new_size, scale_factor = mmcv.rescale_size((400, 300), (1000, 600),\n                                                   True)\n        assert new_size == (800, 600) and scale_factor == 2.0\n        new_size, scale_factor = mmcv.rescale_size((400, 300), (180, 200),\n                                                   True)\n        assert new_size == (200, 150) and scale_factor == 0.5\n\n        new_size = mmcv.rescale_size((400, 300), (1000, 600))\n        assert new_size == (800, 600)\n        new_size = mmcv.rescale_size((400, 300), (180, 200))\n        assert new_size == (200, 150)\n\n        with pytest.raises(ValueError):\n            mmcv.rescale_size((400, 300), -0.5)\n        with pytest.raises(TypeError):\n            mmcv.rescale_size()((400, 300), [100, 100])\n\n    def test_imrescale(self):\n        # rescale by a certain factor\n        resized_img = mmcv.imrescale(self.img, 1.5)\n        assert resized_img.shape == (450, 600, 3)\n        resized_img = mmcv.imrescale(self.img, 
0.934)\n        assert resized_img.shape == (280, 374, 3)\n\n        # rescale by a certain max_size\n        # resize (400, 300) to (max_1000, max_600)\n        resized_img = mmcv.imrescale(self.img, (1000, 600))\n        assert resized_img.shape == (600, 800, 3)\n        resized_img, scale = mmcv.imrescale(\n            self.img, (1000, 600), return_scale=True)\n        assert resized_img.shape == (600, 800, 3) and scale == 2.0\n        # resize (400, 300) to (max_200, max_180)\n        resized_img = mmcv.imrescale(self.img, (180, 200))\n        assert resized_img.shape == (150, 200, 3)\n        resized_img, scale = mmcv.imrescale(\n            self.img, (180, 200), return_scale=True)\n        assert resized_img.shape == (150, 200, 3) and scale == 0.5\n\n        # test exceptions\n        with pytest.raises(ValueError):\n            mmcv.imrescale(self.img, -0.5)\n        with pytest.raises(TypeError):\n            mmcv.imrescale(self.img, [100, 100])\n\n    def test_imflip(self):\n        # direction must be \"horizontal\" or \"vertical\" or \"diagonal\"\n        with pytest.raises(AssertionError):\n            mmcv.imflip(np.random.rand(80, 60, 3), direction='random')\n\n        # test horizontal flip (color image)\n        img = np.random.rand(80, 60, 3)\n        h, w, c = img.shape\n        flipped_img = mmcv.imflip(img)\n        assert flipped_img.shape == img.shape\n        for i in range(h):\n            for j in range(w):\n                for k in range(c):\n                    assert flipped_img[i, j, k] == img[i, w - 1 - j, k]\n\n        # test vertical flip (color image)\n        flipped_img = mmcv.imflip(img, direction='vertical')\n        assert flipped_img.shape == img.shape\n        for i in range(h):\n            for j in range(w):\n                for k in range(c):\n                    assert flipped_img[i, j, k] == img[h - 1 - i, j, k]\n\n        # test diagonal flip (color image)\n        flipped_img = mmcv.imflip(img, direction='diagonal')\n  
      assert flipped_img.shape == img.shape\n        for i in range(h):\n            for j in range(w):\n                for k in range(c):\n                    assert flipped_img[i, j, k] == img[h - 1 - i, w - 1 - j, k]\n\n        # test horizontal flip (grayscale image)\n        img = np.random.rand(80, 60)\n        h, w = img.shape\n        flipped_img = mmcv.imflip(img)\n        assert flipped_img.shape == img.shape\n        for i in range(h):\n            for j in range(w):\n                assert flipped_img[i, j] == img[i, w - 1 - j]\n\n        # test vertical flip (grayscale image)\n        flipped_img = mmcv.imflip(img, direction='vertical')\n        assert flipped_img.shape == img.shape\n        for i in range(h):\n            for j in range(w):\n                assert flipped_img[i, j] == img[h - 1 - i, j]\n\n        # test diagonal flip (grayscale image)\n        flipped_img = mmcv.imflip(img, direction='diagonal')\n        assert flipped_img.shape == img.shape\n        for i in range(h):\n            for j in range(w):\n                assert flipped_img[i, j] == img[h - 1 - i, w - 1 - j]\n\n    def test_imflip_(self):\n        # direction must be \"horizontal\" or \"vertical\" or \"diagonal\"\n        with pytest.raises(AssertionError):\n            mmcv.imflip_(np.random.rand(80, 60, 3), direction='random')\n\n        # test horizontal flip (color image)\n        img = np.random.rand(80, 60, 3)\n        h, w, c = img.shape\n        img_for_flip = img.copy()\n        flipped_img = mmcv.imflip_(img_for_flip)\n        assert flipped_img.shape == img.shape\n        assert flipped_img.shape == img_for_flip.shape\n        assert id(flipped_img) == id(img_for_flip)\n        for i in range(h):\n            for j in range(w):\n                for k in range(c):\n                    assert flipped_img[i, j, k] == img[i, w - 1 - j, k]\n                    assert flipped_img[i, j, k] == img_for_flip[i, j, k]\n\n        # test vertical flip (color image)\n        
img_for_flip = img.copy()\n        flipped_img = mmcv.imflip_(img_for_flip, direction='vertical')\n        assert flipped_img.shape == img.shape\n        assert flipped_img.shape == img_for_flip.shape\n        assert id(flipped_img) == id(img_for_flip)\n        for i in range(h):\n            for j in range(w):\n                for k in range(c):\n                    assert flipped_img[i, j, k] == img[h - 1 - i, j, k]\n                    assert flipped_img[i, j, k] == img_for_flip[i, j, k]\n\n        # test diagonal flip (color image)\n        img_for_flip = img.copy()\n        flipped_img = mmcv.imflip_(img_for_flip, direction='diagonal')\n        assert flipped_img.shape == img.shape\n        assert flipped_img.shape == img_for_flip.shape\n        assert id(flipped_img) == id(img_for_flip)\n        for i in range(h):\n            for j in range(w):\n                for k in range(c):\n                    assert flipped_img[i, j, k] == img[h - 1 - i, w - 1 - j, k]\n                    assert flipped_img[i, j, k] == img_for_flip[i, j, k]\n\n        # test horizontal flip (grayscale image)\n        img = np.random.rand(80, 60)\n        h, w = img.shape\n        img_for_flip = img.copy()\n        flipped_img = mmcv.imflip_(img_for_flip)\n        assert flipped_img.shape == img.shape\n        assert flipped_img.shape == img_for_flip.shape\n        assert id(flipped_img) == id(img_for_flip)\n        for i in range(h):\n            for j in range(w):\n                assert flipped_img[i, j] == img[i, w - 1 - j]\n                assert flipped_img[i, j] == img_for_flip[i, j]\n\n        # test vertical flip (grayscale image)\n        img_for_flip = img.copy()\n        flipped_img = mmcv.imflip_(img_for_flip, direction='vertical')\n        assert flipped_img.shape == img.shape\n        assert flipped_img.shape == img_for_flip.shape\n        assert id(flipped_img) == id(img_for_flip)\n        for i in range(h):\n            for j in range(w):\n                assert 
flipped_img[i, j] == img[h - 1 - i, j]\n                assert flipped_img[i, j] == img_for_flip[i, j]\n\n        # test diagonal flip (grayscale image)\n        img_for_flip = img.copy()\n        flipped_img = mmcv.imflip_(img_for_flip, direction='diagonal')\n        assert flipped_img.shape == img.shape\n        assert flipped_img.shape == img_for_flip.shape\n        assert id(flipped_img) == id(img_for_flip)\n        for i in range(h):\n            for j in range(w):\n                assert flipped_img[i, j] == img[h - 1 - i, w - 1 - j]\n                assert flipped_img[i, j] == img_for_flip[i, j]\n\n    def test_imcrop(self):\n        # yapf: disable\n        bboxes = np.array([[100, 100, 199, 199],  # center\n                           [0, 0, 150, 100],  # left-top corner\n                           [250, 200, 399, 299],  # right-bottom corner\n                           [0, 100, 399, 199],  # wide\n                           [150, 0, 299, 299]])  # tall\n        # yapf: enable\n\n        # crop one bbox\n        patch = mmcv.imcrop(self.img, bboxes[0, :])\n        patches = mmcv.imcrop(self.img, bboxes[[0], :])\n        assert patch.shape == (100, 100, 3)\n        patch_path = osp.join(self.data_dir, 'patches')\n        ref_patch = np.load(patch_path + '/0.npy')\n        assert_array_equal(patch, ref_patch)\n        assert isinstance(patches, list) and len(patches) == 1\n        assert_array_equal(patches[0], ref_patch)\n\n        # crop with no scaling and padding\n        patches = mmcv.imcrop(self.img, bboxes)\n        assert len(patches) == bboxes.shape[0]\n        for i in range(len(patches)):\n            ref_patch = np.load(patch_path + f'/{i}.npy')\n            assert_array_equal(patches[i], ref_patch)\n\n        # crop with scaling and no padding\n        patches = mmcv.imcrop(self.img, bboxes, 1.2)\n        for i in range(len(patches)):\n            ref_patch = np.load(patch_path + f'/scale_{i}.npy')\n            assert_array_equal(patches[i], 
ref_patch)\n\n        # crop with scaling and padding\n        patches = mmcv.imcrop(self.img, bboxes, 1.2, pad_fill=[255, 255, 0])\n        for i in range(len(patches)):\n            ref_patch = np.load(patch_path + f'/pad_{i}.npy')\n            assert_array_equal(patches[i], ref_patch)\n        patches = mmcv.imcrop(self.img, bboxes, 1.2, pad_fill=0)\n        for i in range(len(patches)):\n            ref_patch = np.load(patch_path + f'/pad0_{i}.npy')\n            assert_array_equal(patches[i], ref_patch)\n\n    def test_impad(self):\n        # grayscale image\n        img = np.random.rand(10, 10).astype(np.float32)\n        padded_img = mmcv.impad(img, padding=(0, 0, 2, 5), pad_val=0)\n        assert_array_equal(img, padded_img[:10, :10])\n        assert_array_equal(\n            np.zeros((5, 12), dtype='float32'), padded_img[10:, :])\n        assert_array_equal(\n            np.zeros((15, 2), dtype='float32'), padded_img[:, 10:])\n\n        # RGB image\n        img = np.random.rand(10, 10, 3).astype(np.float32)\n        padded_img = mmcv.impad(img, padding=(0, 0, 2, 5), pad_val=0)\n        assert_array_equal(img, padded_img[:10, :10, :])\n        assert_array_equal(\n            np.zeros((5, 12, 3), dtype='float32'), padded_img[10:, :, :])\n        assert_array_equal(\n            np.zeros((15, 2, 3), dtype='float32'), padded_img[:, 10:, :])\n\n        # RGB image with different values for three channels.\n        img = np.random.randint(256, size=(10, 10, 3)).astype('uint8')\n        padded_img = mmcv.impad(\n            img, padding=(0, 0, 2, 5), pad_val=(100, 110, 120))\n        assert_array_equal(img, padded_img[:10, :10, :])\n        assert_array_equal(\n            np.array([100, 110, 120], dtype='uint8') * np.ones(\n                (5, 12, 3), dtype='uint8'), padded_img[10:, :, :])\n        assert_array_equal(\n            np.array([100, 110, 120], dtype='uint8') * np.ones(\n                (15, 2, 3), dtype='uint8'), padded_img[:, 10:, :])\n\n        # 
Pad the grayscale image to shape (15, 12)\n        img = np.random.rand(10, 10).astype(np.float32)\n        padded_img = mmcv.impad(img, shape=(15, 12))\n        assert_array_equal(img, padded_img[:10, :10])\n        assert_array_equal(\n            np.zeros((5, 12), dtype='float32'), padded_img[10:, :])\n        assert_array_equal(\n            np.zeros((15, 2), dtype='float32'), padded_img[:, 10:])\n\n        # Pad the RGB image to shape (15, 12)\n        img = np.random.rand(10, 10, 3).astype(np.float32)\n        padded_img = mmcv.impad(img, shape=(15, 12))\n        assert_array_equal(img, padded_img[:10, :10, :])\n        assert_array_equal(\n            np.zeros((5, 12, 3), dtype='float32'), padded_img[10:, :, :])\n        assert_array_equal(\n            np.zeros((15, 2, 3), dtype='float32'), padded_img[:, 10:, :])\n\n        # Pad the RGB image to shape (15, 12) with different values for\n        # three channels.\n        img = np.random.randint(256, size=(10, 10, 3)).astype('uint8')\n        padded_img = mmcv.impad(img, shape=(15, 12), pad_val=(100, 110, 120))\n        assert_array_equal(img, padded_img[:10, :10, :])\n        assert_array_equal(\n            np.array([100, 110, 120], dtype='uint8') * np.ones(\n                (5, 12, 3), dtype='uint8'), padded_img[10:, :, :])\n        assert_array_equal(\n            np.array([100, 110, 120], dtype='uint8') * np.ones(\n                (15, 2, 3), dtype='uint8'), padded_img[:, 10:, :])\n\n        # RGB image with padding=[5, 2]\n        img = np.random.rand(10, 10, 3).astype(np.float32)\n        padded_img = mmcv.impad(img, padding=(5, 2), pad_val=0)\n\n        assert padded_img.shape == (14, 20, 3)\n        assert_array_equal(img, padded_img[2:12, 5:15, :])\n        assert_array_equal(\n            np.zeros((2, 5, 3), dtype='float32'), padded_img[:2, :5, :])\n        assert_array_equal(\n            np.zeros((2, 5, 3), dtype='float32'), padded_img[12:, :5, :])\n        assert_array_equal(\n            
np.zeros((2, 5, 3), dtype='float32'), padded_img[:2, 15:, :])\n        assert_array_equal(\n            np.zeros((2, 5, 3), dtype='float32'), padded_img[12:, 15:, :])\n\n        # RGB image with type(pad_val) = tuple\n        pad_val = (0, 1, 2)\n        img = np.random.rand(10, 10, 3).astype(np.float32)\n        padded_img = mmcv.impad(img, padding=(0, 0, 5, 2), pad_val=pad_val)\n\n        assert padded_img.shape == (12, 15, 3)\n        assert_array_equal(img, padded_img[:10, :10, :])\n        assert_array_equal(pad_val[0] * np.ones((2, 15, 1), dtype='float32'),\n                           padded_img[10:, :, 0:1])\n        assert_array_equal(pad_val[1] * np.ones((2, 15, 1), dtype='float32'),\n                           padded_img[10:, :, 1:2])\n        assert_array_equal(pad_val[2] * np.ones((2, 15, 1), dtype='float32'),\n                           padded_img[10:, :, 2:3])\n\n        assert_array_equal(pad_val[0] * np.ones((12, 5, 1), dtype='float32'),\n                           padded_img[:, 10:, 0:1])\n        assert_array_equal(pad_val[1] * np.ones((12, 5, 1), dtype='float32'),\n                           padded_img[:, 10:, 1:2])\n        assert_array_equal(pad_val[2] * np.ones((12, 5, 1), dtype='float32'),\n                           padded_img[:, 10:, 2:3])\n\n        # test different padding mode with channel number = 3\n        for mode in ['constant', 'edge', 'reflect', 'symmetric']:\n            img = np.random.rand(10, 10, 3).astype(np.float32)\n            padded_img = mmcv.impad(\n                img, padding=(0, 0, 5, 2), pad_val=pad_val, padding_mode=mode)\n            assert padded_img.shape == (12, 15, 3)\n\n        # test different padding mode with channel number = 1\n        for mode in ['constant', 'edge', 'reflect', 'symmetric']:\n            img = np.random.rand(10, 10).astype(np.float32)\n            padded_img = mmcv.impad(\n                img, padding=(0, 0, 5, 2), pad_val=0, padding_mode=mode)\n            assert padded_img.shape == 
(12, 15)\n\n        # Padding must be a int or a 2, or 4 element tuple.\n        with pytest.raises(ValueError):\n            mmcv.impad(img, padding=(1, 1, 1))\n\n        # pad_val must be a int or a tuple\n        with pytest.raises(TypeError):\n            mmcv.impad(img, padding=(1, 1, 1, 1), pad_val='wrong')\n\n        # When pad_val is a tuple,\n        # len(pad_val) should be equal to img.shape[-1]\n        img = np.random.rand(10, 10, 3).astype(np.float32)\n        with pytest.raises(AssertionError):\n            mmcv.impad(img, padding=3, pad_val=(100, 200))\n\n        with pytest.raises(AssertionError):\n            mmcv.impad(img, padding=2, pad_val=0, padding_mode='unknown')\n\n        with pytest.raises(AssertionError):\n            mmcv.impad(img, shape=(12, 15), padding=(0, 0, 5, 2))\n\n        # Pad shape smaller than image shape\n        padded_img = mmcv.impad(img, shape=(8, 8))\n        assert padded_img.shape == (10, 10, 3)\n\n    def test_impad_to_multiple(self):\n        img = np.random.rand(11, 14, 3).astype(np.float32)\n        padded_img = mmcv.impad_to_multiple(img, 4)\n        assert padded_img.shape == (12, 16, 3)\n        img = np.random.rand(20, 12).astype(np.float32)\n        padded_img = mmcv.impad_to_multiple(img, 5)\n        assert padded_img.shape == (20, 15)\n        img = np.random.rand(20, 12).astype(np.float32)\n        padded_img = mmcv.impad_to_multiple(img, 2)\n        assert padded_img.shape == (20, 12)\n\n    def test_cutout(self):\n        img = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)\n\n        # shape must be int or tuple\n        with pytest.raises(AssertionError):\n            mmcv.cutout(img, 2.5)\n        # pad_val must be int or float or tuple with the same length\n        # of img channels\n        with pytest.raises(AssertionError):\n            mmcv.cutout(img, 1, (1, 2, 3))\n        with pytest.raises(TypeError):\n            mmcv.cutout(img, 1, None)\n\n        # test cutout the whole 
img\n        assert_array_equal(mmcv.cutout(img, 6), np.zeros_like(img))\n        # test not cutout\n        assert_array_equal(mmcv.cutout(img, 0), img)\n        # test cutout when shape is int\n        np.random.seed(0)\n        img_cutout = np.array([[1, 2, 3], [4, 0, 6], [7, 8,\n                                                      9]]).astype(np.uint8)\n        assert_array_equal(mmcv.cutout(img, 1), img_cutout)\n        img_cutout = np.array([[1, 2, 3], [4, 10, 6], [7, 8,\n                                                       9]]).astype(np.uint8)\n        assert_array_equal(mmcv.cutout(img, 1, pad_val=10), img_cutout)\n        # test cutout when shape is tuple\n        np.random.seed(0)\n        img_cutout = np.array([[1, 2, 3], [0, 0, 6], [7, 8,\n                                                      9]]).astype(np.uint8)\n        assert_array_equal(mmcv.cutout(img, (1, 2)), img_cutout)\n        img_cutout = np.array([[1, 2, 3], [10, 10, 6], [7, 8,\n                                                        9]]).astype(np.uint8)\n        assert_array_equal(mmcv.cutout(img, (1, 2), pad_val=10), img_cutout)\n\n    def test_imrotate(self):\n        img = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)\n        assert_array_equal(mmcv.imrotate(img, 0), img)\n        img_r = np.array([[7, 4, 1], [8, 5, 2], [9, 6, 3]])\n        assert_array_equal(mmcv.imrotate(img, 90), img_r)\n        img_r = np.array([[3, 6, 9], [2, 5, 8], [1, 4, 7]])\n        assert_array_equal(mmcv.imrotate(img, -90), img_r)\n\n        img = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]).astype(np.uint8)\n        img_r = np.array([[0, 6, 2, 0], [0, 7, 3, 0]])\n        assert_array_equal(mmcv.imrotate(img, 90), img_r)\n        img_r = np.array([[1, 0, 0, 0], [2, 0, 0, 0]])\n        assert_array_equal(mmcv.imrotate(img, 90, center=(0, 0)), img_r)\n        img_r = np.array([[255, 6, 2, 255], [255, 7, 3, 255]])\n        assert_array_equal(mmcv.imrotate(img, 90, border_value=255), img_r)\n       
 img_r = np.array([[5, 1], [6, 2], [7, 3], [8, 4]])\n        assert_array_equal(mmcv.imrotate(img, 90, auto_bound=True), img_r)\n        img_r = np.array([[6, 6, 2, 2], [7, 7, 3, 3]])\n        assert_array_equal(\n            mmcv.imrotate(img, 90, border_mode='replicate'), img_r)\n\n        with pytest.raises(ValueError):\n            mmcv.imrotate(img, 90, center=(0, 0), auto_bound=True)\n\n    def test_imshear(self):\n        img = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)\n        assert_array_equal(mmcv.imshear(img, 0), img)\n        # magnitude=1, horizontal\n        img_sheared = np.array([[1, 2, 3], [0, 4, 5], [0, 0, 7]],\n                               dtype=np.uint8)\n        assert_array_equal(mmcv.imshear(img, 1), img_sheared)\n        # magnitude=-1, vertical\n        img_sheared = np.array([[1, 5, 9], [4, 8, 0], [7, 0, 0]],\n                               dtype=np.uint8)\n        assert_array_equal(mmcv.imshear(img, -1, 'vertical'), img_sheared)\n        # magnitude=1, vertical, borderValue=100\n        borderValue = 100\n        img_sheared = np.array(\n            [[1, borderValue, borderValue], [4, 2, borderValue], [7, 5, 3]],\n            dtype=np.uint8)\n        assert_array_equal(\n            mmcv.imshear(img, 1, 'vertical', borderValue), img_sheared)\n        # magnitude=1, vertical, borderValue=100, img shape (h,w,3)\n        img = np.stack([img, img, img], axis=-1)\n        img_sheared = np.stack([img_sheared, img_sheared, img_sheared],\n                               axis=-1)\n        assert_array_equal(\n            mmcv.imshear(img, 1, 'vertical', borderValue), img_sheared)\n        # test tuple format of borderValue\n        assert_array_equal(\n            mmcv.imshear(img, 1, 'vertical',\n                         (borderValue, borderValue, borderValue)), img_sheared)\n\n        # test invalid length of borderValue\n        with pytest.raises(AssertionError):\n            mmcv.imshear(img, 0.5, 'horizontal', 
(borderValue, ))\n\n        # test invalid type of borderValue\n        with pytest.raises(ValueError):\n            mmcv.imshear(img, 0.5, 'horizontal', [borderValue])\n\n        # test invalid value of direction\n        with pytest.raises(AssertionError):\n            mmcv.imshear(img, 0.5, 'diagonal')\n\n    def test_imtranslate(self):\n        img = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.uint8)\n        assert_array_equal(mmcv.imtranslate(img, 0), img)\n        # offset=1, horizontal\n        img_translated = np.array([[128, 1, 2], [128, 4, 5], [128, 7, 8]],\n                                  dtype=np.uint8)\n        assert_array_equal(\n            mmcv.imtranslate(img, 1, border_value=128), img_translated)\n        # offset=-1, vertical\n        img_translated = np.array([[4, 5, 6], [7, 8, 9], [0, 0, 0]],\n                                  dtype=np.uint8)\n        assert_array_equal(\n            mmcv.imtranslate(img, -1, 'vertical'), img_translated)\n        # offset=-2, horizontal\n        img = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n        img_translated = [[3, 4, 128, 128], [7, 8, 128, 128]]\n        img_translated = np.stack(\n            [img_translated, img_translated, img_translated], axis=-1)\n        assert_array_equal(\n            mmcv.imtranslate(img, -2, border_value=128), img_translated)\n        # offset=2, vertical\n        border_value = (110, 120, 130)\n        img_translated = np.stack([\n            np.ones((2, 4)) * border_value[0],\n            np.ones((2, 4)) * border_value[1],\n            np.ones((2, 4)) * border_value[2]\n        ],\n                                  axis=-1).astype(np.uint8)\n        assert_array_equal(\n            mmcv.imtranslate(img, 2, 'vertical', border_value), img_translated)\n        # test invalid number elements in border_value\n        with pytest.raises(AssertionError):\n            mmcv.imtranslate(img, 1, 
border_value=(1, ))\n        # test invalid type of border_value\n        with pytest.raises(ValueError):\n            mmcv.imtranslate(img, 1, border_value=[1, 2, 3])\n        # test invalid value of direction\n        with pytest.raises(AssertionError):\n            mmcv.imtranslate(img, 1, 'diagonal')\n"
  },
  {
    "path": "tests/test_image/test_image_misc.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nfrom numpy.testing import assert_array_equal\n\nimport mmcv\n\ntry:\n    import torch\nexcept ImportError:\n    torch = None\n\n\n@pytest.mark.skipif(torch is None, reason='requires torch library')\ndef test_tensor2imgs():\n\n    # test tensor obj\n    with pytest.raises(AssertionError):\n        tensor = np.random.rand(2, 3, 3)\n        mmcv.tensor2imgs(tensor)\n\n    # test tensor ndim\n    with pytest.raises(AssertionError):\n        tensor = torch.randn(2, 3, 3)\n        mmcv.tensor2imgs(tensor)\n\n    # test tensor dim-1\n    with pytest.raises(AssertionError):\n        tensor = torch.randn(2, 4, 3, 3)\n        mmcv.tensor2imgs(tensor)\n\n    # test mean length\n    with pytest.raises(AssertionError):\n        tensor = torch.randn(2, 3, 5, 5)\n        mmcv.tensor2imgs(tensor, mean=(1, ))\n        tensor = torch.randn(2, 1, 5, 5)\n        mmcv.tensor2imgs(tensor, mean=(0, 0, 0))\n\n    # test std length\n    with pytest.raises(AssertionError):\n        tensor = torch.randn(2, 3, 5, 5)\n        mmcv.tensor2imgs(tensor, std=(1, ))\n        tensor = torch.randn(2, 1, 5, 5)\n        mmcv.tensor2imgs(tensor, std=(1, 1, 1))\n\n    # test to_rgb\n    with pytest.raises(AssertionError):\n        tensor = torch.randn(2, 1, 5, 5)\n        mmcv.tensor2imgs(tensor, mean=(0, ), std=(1, ), to_rgb=True)\n\n    # test rgb=True\n    tensor = torch.randn(2, 3, 5, 5)\n    gts = [\n        t.cpu().numpy().transpose(1, 2, 0).astype(np.uint8)\n        for t in tensor.flip(1)\n    ]\n    outputs = mmcv.tensor2imgs(tensor, to_rgb=True)\n    for gt, output in zip(gts, outputs):\n        assert_array_equal(gt, output)\n\n    # test rgb=False\n    tensor = torch.randn(2, 3, 5, 5)\n    gts = [t.cpu().numpy().transpose(1, 2, 0).astype(np.uint8) for t in tensor]\n    outputs = mmcv.tensor2imgs(tensor, to_rgb=False)\n    for gt, output in zip(gts, outputs):\n        assert_array_equal(gt, 
output)\n\n    # test tensor channel 1 and rgb=False\n    tensor = torch.randn(2, 1, 5, 5)\n    gts = [t.squeeze(0).cpu().numpy().astype(np.uint8) for t in tensor]\n    outputs = mmcv.tensor2imgs(tensor, to_rgb=False)\n    for gt, output in zip(gts, outputs):\n        assert_array_equal(gt, output)\n"
  },
  {
    "path": "tests/test_image/test_io.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nimport os.path as osp\nimport sys\nimport tempfile\nfrom pathlib import Path\nfrom unittest.mock import MagicMock, patch\n\nimport cv2\nimport mmengine\nimport numpy as np\nimport pytest\nimport torch\nfrom mmengine.fileio.file_client import HTTPBackend, PetrelBackend\nfrom numpy.testing import assert_allclose, assert_array_equal\n\nimport mmcv\n\nif torch.__version__ == 'parrots':\n    pytest.skip('not necessary in parrots test', allow_module_level=True)\n\n\nclass TestIO:\n\n    @classmethod\n    def setup_class(cls):\n        cls.data_dir = osp.join(osp.dirname(__file__), '../data')\n        # the test img resolution is 400x300\n        cls.img_path = osp.join(cls.data_dir, 'color.jpg')\n        cls.img_path_obj = Path(cls.img_path)\n        cls.gray_img_path = osp.join(cls.data_dir, 'grayscale.jpg')\n        cls.gray_img_path_obj = Path(cls.gray_img_path)\n        cls.gray_img_dim3_path = osp.join(cls.data_dir, 'grayscale_dim3.jpg')\n        cls.gray_alpha_img_path = osp.join(cls.data_dir, 'gray_alpha.png')\n        cls.palette_img_path = osp.join(cls.data_dir, 'palette.gif')\n        cls.exif_img_path = osp.join(cls.data_dir, 'color_exif.jpg')\n        cls.img = cv2.imread(cls.img_path)\n        cls.tiff_path = osp.join(cls.data_dir, 'uint16-5channel.tif')\n        # petrel s3 path\n        cls.s3_path = 's3://path/of/your/file.jpg'\n        # http path\n        cls.http_path = 'http://path/of/your/file.jpg'\n        # add mock package\n        sys.modules['petrel_client'] = MagicMock()\n        sys.modules['petrel_client.client'] = MagicMock()\n\n    @classmethod\n    def teardown_class(cls):\n        # clean instances avoid to influence other unittest\n        mmengine.FileClient._instances = {}\n\n    def assert_img_equal(self, img, ref_img, ratio_thr=0.999):\n        assert img.shape == ref_img.shape\n        assert img.dtype == ref_img.dtype\n        area = ref_img.shape[0] * 
ref_img.shape[1]\n        diff = np.abs(img.astype('int32') - ref_img.astype('int32'))\n        assert np.sum(diff <= 1) / float(area) > ratio_thr\n\n    def test_imread(self):\n        # backend cv2\n        mmcv.use_backend('cv2')\n\n        # file_client_args and backend_args can not be both set\n        with pytest.raises(\n                ValueError,\n                match='\"file_client_args\" and \"backend_args\" cannot be set'):\n            mmcv.imread(\n                self.img_path,\n                file_client_args={'backend': 'disk'},\n                backend_args={'backend': 'disk'})\n\n        # HardDiskBackend\n        img_cv2_color_bgr = mmcv.imread(self.img_path)\n        assert img_cv2_color_bgr.shape == (300, 400, 3)\n        img_cv2_color_rgb = mmcv.imread(self.img_path, channel_order='rgb')\n        assert img_cv2_color_rgb.shape == (300, 400, 3)\n        assert_array_equal(img_cv2_color_rgb[:, :, ::-1], img_cv2_color_bgr)\n        img_cv2_grayscale1 = mmcv.imread(self.img_path, 'grayscale')\n        assert img_cv2_grayscale1.shape == (300, 400)\n        img_cv2_grayscale2 = mmcv.imread(self.gray_img_path)\n        assert img_cv2_grayscale2.shape == (300, 400, 3)\n        img_cv2_unchanged = mmcv.imread(self.gray_img_path, 'unchanged')\n        assert img_cv2_unchanged.shape == (300, 400)\n        img_cv2_unchanged = mmcv.imread(img_cv2_unchanged)\n        assert_array_equal(img_cv2_unchanged, mmcv.imread(img_cv2_unchanged))\n\n        img_cv2_color_bgr = mmcv.imread(self.img_path_obj)\n        assert img_cv2_color_bgr.shape == (300, 400, 3)\n        img_cv2_color_rgb = mmcv.imread(self.img_path_obj, channel_order='rgb')\n        assert img_cv2_color_rgb.shape == (300, 400, 3)\n        assert_array_equal(img_cv2_color_rgb[:, :, ::-1], img_cv2_color_bgr)\n        img_cv2_grayscale1 = mmcv.imread(self.img_path_obj, 'grayscale')\n        assert img_cv2_grayscale1.shape == (300, 400)\n        img_cv2_grayscale2 = 
mmcv.imread(self.gray_img_path_obj)\n        assert img_cv2_grayscale2.shape == (300, 400, 3)\n        img_cv2_unchanged = mmcv.imread(self.gray_img_path_obj, 'unchanged')\n        assert img_cv2_unchanged.shape == (300, 400)\n        with pytest.raises(TypeError):\n            mmcv.imread(1)\n\n        # PetrelBackend\n        img_cv2_color_bgr = mmcv.imread(self.img_path)\n        with patch.object(\n                PetrelBackend, 'get',\n                return_value=img_cv2_color_bgr) as mock_method:\n            img_cv2_color_bgr_petrel = mmcv.imread(self.s3_path, backend='cv2')\n            img_cv2_color_bgr_petrel_with_args = mmcv.imread(\n                self.s3_path,\n                backend='cv2',\n                file_client_args={'backend': 'petrel'})\n            mock_method.assert_called()\n            assert_array_equal(img_cv2_color_bgr_petrel,\n                               img_cv2_color_bgr_petrel_with_args)\n\n            mock_method.reset_mock()\n\n            img_cv2_color_bgr_petrel_with_args = mmcv.imread(\n                self.s3_path,\n                backend='cv2',\n                backend_args={'backend': 'petrel'})\n            mock_method.assert_called()\n            assert_array_equal(img_cv2_color_bgr_petrel,\n                               img_cv2_color_bgr_petrel_with_args)\n\n        # HTTPBackend\n        img_cv2_color_bgr = mmcv.imread(self.img_path)\n        with patch.object(\n                HTTPBackend, 'get',\n                return_value=img_cv2_color_bgr) as mock_method:\n            img_cv2_color_bgr_http = mmcv.imread(self.http_path, backend='cv2')\n            img_cv2_color_bgr_http_with_args = mmcv.imread(\n                self.http_path,\n                backend='cv2',\n                file_client_args={'backend': 'http'})\n            mock_method.assert_called()\n            assert_array_equal(img_cv2_color_bgr_http,\n                               img_cv2_color_bgr_http_with_args)\n\n            
mock_method.reset_mock()\n\n            img_cv2_color_bgr_http_with_args = mmcv.imread(\n                self.http_path,\n                backend='cv2',\n                backend_args={'backend': 'http'})\n            mock_method.assert_called()\n            assert_array_equal(img_cv2_color_bgr_http,\n                               img_cv2_color_bgr_http_with_args)\n\n        with pytest.raises(FileNotFoundError):\n            mmcv.imread('/not/exists/' + self.img_path)\n\n        # test arg backend pillow\n        img_pil_gray_alpha = mmcv.imread(\n            self.gray_alpha_img_path, 'grayscale', backend='pillow')\n        assert img_pil_gray_alpha.shape == (400, 500)\n        mean = img_pil_gray_alpha[300:, 400:].mean()\n        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)\n        img_pil_gray_alpha = mmcv.imread(\n            self.gray_alpha_img_path, backend='pillow')\n        mean = img_pil_gray_alpha[300:, 400:].mean(axis=(0, 1))\n        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)\n        assert img_pil_gray_alpha.shape == (400, 500, 3)\n        img_pil_gray_alpha = mmcv.imread(\n            self.gray_alpha_img_path, 'unchanged', backend='pillow')\n        assert img_pil_gray_alpha.shape == (400, 500, 2)\n        img_pil_palette = mmcv.imread(\n            self.palette_img_path, 'grayscale', backend='pillow')\n        assert img_pil_palette.shape == (300, 400)\n        img_pil_palette = mmcv.imread(self.palette_img_path, backend='pillow')\n        assert img_pil_palette.shape == (300, 400, 3)\n        img_pil_palette = mmcv.imread(\n            self.palette_img_path, 'unchanged', backend='pillow')\n        assert img_pil_palette.shape == (300, 400)\n\n        # backend pillow\n        mmcv.use_backend('pillow')\n        img_pil_grayscale1 = mmcv.imread(self.img_path, 'grayscale')\n        assert img_pil_grayscale1.shape == (300, 400)\n        img_pil_gray_alpha = mmcv.imread(self.gray_alpha_img_path, 'grayscale')\n        assert 
img_pil_gray_alpha.shape == (400, 500)\n        mean = img_pil_gray_alpha[300:, 400:].mean()\n        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)\n        img_pil_gray_alpha = mmcv.imread(self.gray_alpha_img_path)\n        mean = img_pil_gray_alpha[300:, 400:].mean(axis=(0, 1))\n        assert_allclose(img_pil_gray_alpha[300:, 400:] - mean, 0)\n        assert img_pil_gray_alpha.shape == (400, 500, 3)\n        img_pil_gray_alpha = mmcv.imread(self.gray_alpha_img_path, 'unchanged')\n        assert img_pil_gray_alpha.shape == (400, 500, 2)\n        img_pil_palette = mmcv.imread(self.palette_img_path, 'grayscale')\n        assert img_pil_palette.shape == (300, 400)\n        img_pil_palette = mmcv.imread(self.palette_img_path)\n        assert img_pil_palette.shape == (300, 400, 3)\n        img_pil_palette = mmcv.imread(self.palette_img_path, 'unchanged')\n        assert img_pil_palette.shape == (300, 400)\n        img_pil_grayscale2 = mmcv.imread(self.gray_img_path)\n        assert img_pil_grayscale2.shape == (300, 400, 3)\n        img_pil_unchanged = mmcv.imread(self.gray_img_path, 'unchanged')\n        assert img_pil_unchanged.shape == (300, 400)\n        img_pil_unchanged = mmcv.imread(img_pil_unchanged)\n        assert_array_equal(img_pil_unchanged, mmcv.imread(img_pil_unchanged))\n\n        img_pil_color_bgr = mmcv.imread(self.img_path_obj)\n        assert img_pil_color_bgr.shape == (300, 400, 3)\n        img_pil_color_rgb = mmcv.imread(self.img_path_obj, channel_order='rgb')\n        assert img_pil_color_rgb.shape == (300, 400, 3)\n        assert (img_pil_color_rgb == img_cv2_color_rgb).sum() / float(\n            img_cv2_color_rgb.size) > 0.5\n        assert_array_equal(img_pil_color_rgb[:, :, ::-1], img_pil_color_bgr)\n        img_pil_grayscale1 = mmcv.imread(self.img_path_obj, 'grayscale')\n        assert img_pil_grayscale1.shape == (300, 400)\n        img_pil_grayscale2 = mmcv.imread(self.gray_img_path_obj)\n        assert 
img_pil_grayscale2.shape == (300, 400, 3)\n        img_pil_unchanged = mmcv.imread(self.gray_img_path_obj, 'unchanged')\n        assert img_pil_unchanged.shape == (300, 400)\n        with pytest.raises(TypeError):\n            mmcv.imread(1)\n\n        # backend turbojpeg\n        mmcv.use_backend('turbojpeg')\n\n        img_turbojpeg_color_bgr = mmcv.imread(self.img_path)\n        assert img_turbojpeg_color_bgr.shape == (300, 400, 3)\n        assert_array_equal(img_turbojpeg_color_bgr, img_cv2_color_bgr)\n\n        img_turbojpeg_color_rgb = mmcv.imread(\n            self.img_path, channel_order='rgb')\n        assert img_turbojpeg_color_rgb.shape == (300, 400, 3)\n        assert_array_equal(img_turbojpeg_color_rgb, img_cv2_color_rgb)\n\n        with pytest.raises(ValueError):\n            mmcv.imread(self.img_path, channel_order='unsupport_order')\n\n        img_turbojpeg_grayscale1 = mmcv.imread(self.img_path, flag='grayscale')\n        assert img_turbojpeg_grayscale1.shape == (300, 400)\n        assert_array_equal(img_turbojpeg_grayscale1, img_cv2_grayscale1)\n\n        img_turbojpeg_grayscale2 = mmcv.imread(self.gray_img_path)\n        assert img_turbojpeg_grayscale2.shape == (300, 400, 3)\n        assert_array_equal(img_turbojpeg_grayscale2, img_cv2_grayscale2)\n\n        img_turbojpeg_grayscale2 = mmcv.imread(img_turbojpeg_grayscale2)\n        assert_array_equal(img_turbojpeg_grayscale2,\n                           mmcv.imread(img_turbojpeg_grayscale2))\n\n        with pytest.raises(ValueError):\n            mmcv.imread(self.gray_img_path, 'unchanged')\n\n        with pytest.raises(TypeError):\n            mmcv.imread(1)\n\n        with pytest.raises(AssertionError):\n            mmcv.use_backend('unsupport_backend')\n\n        with pytest.raises(ValueError):\n            mmcv.imread(self.img_path, 'unsupported_backend')\n\n        # backend tifffile, multi channel tiff file(> 4 channels).\n        mmcv.use_backend('tifffile')\n        img_tifffile = 
mmcv.imread(self.tiff_path)\n        assert img_tifffile.shape == (200, 150, 5)\n\n        mmcv.use_backend('cv2')\n\n        # consistent exif behaviour\n        img_cv2_exif = mmcv.imread(self.exif_img_path)\n        img_pil_exif = mmcv.imread(self.exif_img_path, backend='pillow')\n        assert img_cv2_exif.shape == (400, 300, 3)\n        assert img_pil_exif.shape == (400, 300, 3)\n        img_cv2_exif_unchanged = mmcv.imread(\n            self.exif_img_path, flag='unchanged')\n        img_pil_exif_unchanged = mmcv.imread(\n            self.exif_img_path, backend='pillow', flag='unchanged')\n        assert img_cv2_exif_unchanged.shape == (300, 400, 3)\n        assert img_pil_exif_unchanged.shape == (300, 400, 3)\n        img_cv2_color_ignore_exif = mmcv.imread(\n            self.exif_img_path, flag='color_ignore_orientation')\n        img_pil_color_ignore_exif = mmcv.imread(\n            self.exif_img_path,\n            backend='pillow',\n            flag='color_ignore_orientation')\n        assert img_cv2_color_ignore_exif.shape == (300, 400, 3)\n        assert img_pil_color_ignore_exif.shape == (300, 400, 3)\n        img_cv2_grayscale_ignore_exif = mmcv.imread(\n            self.exif_img_path, flag='grayscale_ignore_orientation')\n        img_pil_grayscale_ignore_exif = mmcv.imread(\n            self.exif_img_path,\n            backend='pillow',\n            flag='grayscale_ignore_orientation')\n        assert img_cv2_grayscale_ignore_exif.shape == (300, 400)\n        assert img_pil_grayscale_ignore_exif.shape == (300, 400)\n\n    def test_imfrombytes(self):\n        # backend cv2, channel order: bgr\n        mmcv.use_backend('cv2')\n        with open(self.img_path, 'rb') as f:\n            img_bytes = f.read()\n        img_cv2 = mmcv.imfrombytes(img_bytes)\n        assert img_cv2.shape == (300, 400, 3)\n\n        # backend cv2, channel order: rgb\n        mmcv.use_backend('cv2')\n        with open(self.img_path, 'rb') as f:\n            img_bytes = 
f.read()\n        img_rgb_cv2 = mmcv.imfrombytes(img_bytes, channel_order='rgb')\n        assert img_rgb_cv2.shape == (300, 400, 3)\n        assert_array_equal(img_rgb_cv2, img_cv2[:, :, ::-1])\n\n        # backend cv2, grayscale, decode as 3 channels\n        with open(self.gray_img_path, 'rb') as f:\n            img_bytes = f.read()\n        gray_img_rgb_cv2 = mmcv.imfrombytes(img_bytes)\n        assert gray_img_rgb_cv2.shape == (300, 400, 3)\n\n        # backend cv2, grayscale\n        with open(self.gray_img_path, 'rb') as f:\n            img_bytes = f.read()\n        gray_img_cv2 = mmcv.imfrombytes(img_bytes, flag='grayscale')\n        assert gray_img_cv2.shape == (300, 400)\n\n        # backend cv2, grayscale dim3\n        with open(self.gray_img_dim3_path, 'rb') as f:\n            img_bytes = f.read()\n        gray_img_dim3_cv2 = mmcv.imfrombytes(img_bytes, flag='grayscale')\n        assert gray_img_dim3_cv2.shape == (300, 400)\n\n        # arg backend pillow, channel order: bgr\n        with open(self.img_path, 'rb') as f:\n            img_bytes = f.read()\n        img_pillow = mmcv.imfrombytes(img_bytes, backend='pillow')\n        assert img_pillow.shape == (300, 400, 3)\n        # Pillow and opencv decoding may not be the same\n        assert (img_cv2 == img_pillow).sum() / float(img_cv2.size) > 0.5\n\n        # backend pillow, channel order: bgr\n        mmcv.use_backend('pillow')\n        with open(self.img_path, 'rb') as f:\n            img_bytes = f.read()\n        img_pillow = mmcv.imfrombytes(img_bytes)\n        assert img_pillow.shape == (300, 400, 3)\n        # Pillow and opencv decoding may not be the same\n        assert (img_cv2 == img_pillow).sum() / float(img_cv2.size) > 0.5\n\n        # backend turbojpeg, channel order: bgr\n        mmcv.use_backend('turbojpeg')\n        with open(self.img_path, 'rb') as f:\n            img_bytes = f.read()\n        img_turbojpeg = mmcv.imfrombytes(img_bytes)\n        assert img_turbojpeg.shape == (300, 400, 
3)\n        assert_array_equal(img_cv2, img_turbojpeg)\n\n        # backend turbojpeg, channel order: rgb\n        with open(self.img_path, 'rb') as f:\n            img_bytes = f.read()\n        img_rgb_turbojpeg = mmcv.imfrombytes(img_bytes, channel_order='rgb')\n        assert img_rgb_turbojpeg.shape == (300, 400, 3)\n        assert_array_equal(img_rgb_turbojpeg, img_cv2[:, :, ::-1])\n\n        # backend turbojpeg, grayscale, decode as 3 channels\n        with open(self.gray_img_path, 'rb') as f:\n            img_bytes = f.read()\n        gray_img_turbojpeg = mmcv.imfrombytes(img_bytes)\n        assert gray_img_turbojpeg.shape == (300, 400, 3)\n        assert_array_equal(gray_img_rgb_cv2, gray_img_turbojpeg)\n\n        # backend turbojpeg, grayscale\n        with open(self.gray_img_path, 'rb') as f:\n            img_bytes = f.read()\n        gray_img_turbojpeg = mmcv.imfrombytes(img_bytes, flag='grayscale')\n        assert gray_img_turbojpeg.shape == (300, 400)\n        assert_array_equal(gray_img_cv2, gray_img_turbojpeg)\n\n        # backend turbojpeg, grayscale dim3\n        with open(self.gray_img_dim3_path, 'rb') as f:\n            img_bytes = f.read()\n        gray_img_dim3_turbojpeg = mmcv.imfrombytes(img_bytes, flag='grayscale')\n        assert gray_img_dim3_turbojpeg.shape == (300, 400)\n        assert_array_equal(gray_img_dim3_cv2, gray_img_dim3_turbojpeg)\n\n        mmcv.use_backend('cv2')\n\n        with pytest.raises(ValueError):\n            with open(self.img_path, 'rb') as f:\n                img_bytes = f.read()\n            mmcv.imfrombytes(img_bytes, backend='unsupported_backend')\n\n    def test_imwrite(self):\n        img = mmcv.imread(self.img_path)\n        out_file = osp.join(tempfile.gettempdir(), 'mmcv_test.jpg')\n\n        # file_client_args and backend_args can not be both set\n        with pytest.raises(\n                ValueError,\n                match='\"file_client_args\" and \"backend_args\" cannot be set'):\n            
mmcv.imwrite(\n                img,\n                out_file,\n                file_client_args={'backend': 'disk'},\n                backend_args={'backend': 'disk'})\n\n        mmcv.imwrite(img, out_file)\n        rewrite_img = mmcv.imread(out_file)\n        os.remove(out_file)\n        self.assert_img_equal(img, rewrite_img)\n\n        # test petrel client\n        with patch.object(\n                PetrelBackend, 'put', return_value=None) as mock_method:\n            ret = mmcv.imwrite(img, self.s3_path)\n            ret_with_args = mmcv.imwrite(\n                img, self.s3_path, file_client_args={'backend': 'petrel'})\n            assert ret\n            assert ret_with_args\n            mock_method.assert_called()\n\n            mock_method.reset_mock()\n\n            ret_with_args = mmcv.imwrite(\n                img, self.s3_path, backend_args={'backend': 'petrel'})\n            assert ret_with_args\n            mock_method.assert_called()\n\n        with pytest.raises(cv2.error):\n            mmcv.imwrite(img, 'error_file.jppg')\n\n    @patch('mmcv.image.io.TurboJPEG', None)\n    def test_no_turbojpeg(self):\n        with pytest.raises(ImportError):\n            mmcv.use_backend('turbojpeg')\n\n        mmcv.use_backend('cv2')\n\n    @patch('mmcv.image.io.Image', None)\n    def test_no_pillow(self):\n        with pytest.raises(ImportError):\n            mmcv.use_backend('pillow')\n\n        mmcv.use_backend('cv2')\n"
  },
  {
    "path": "tests/test_image/test_photometric.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os.path as osp\n\nimport cv2\nimport numpy as np\nimport pytest\nfrom numpy.testing import assert_array_equal\n\nimport mmcv\n\n\nclass TestPhotometric:\n\n    @classmethod\n    def setup_class(cls):\n        # the test img resolution is 400x300\n        cls.img_path = osp.join(osp.dirname(__file__), '../data/color.jpg')\n        cls.img = cv2.imread(cls.img_path)\n        cls.mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)\n        cls.std = np.array([58.395, 57.12, 57.375], dtype=np.float32)\n\n    def test_imnormalize(self):\n        rgb_img = self.img[:, :, ::-1]\n        baseline = (rgb_img - self.mean) / self.std\n        img = mmcv.imnormalize(self.img, self.mean, self.std)\n        assert np.allclose(img, baseline)\n        assert id(img) != id(self.img)\n        img = mmcv.imnormalize(rgb_img, self.mean, self.std, to_rgb=False)\n        assert np.allclose(img, baseline)\n        assert id(img) != id(rgb_img)\n\n    def test_imnormalize_(self):\n        img_for_normalize = np.float32(self.img)\n        rgb_img_for_normalize = np.float32(self.img[:, :, ::-1])\n        baseline = (rgb_img_for_normalize - self.mean) / self.std\n        img = mmcv.imnormalize_(img_for_normalize, self.mean, self.std)\n        assert np.allclose(img_for_normalize, baseline)\n        assert id(img) == id(img_for_normalize)\n        img = mmcv.imnormalize_(\n            rgb_img_for_normalize, self.mean, self.std, to_rgb=False)\n        assert np.allclose(img, baseline)\n        assert id(img) == id(rgb_img_for_normalize)\n\n    def test_imdenormalize(self):\n        norm_img = (self.img[:, :, ::-1] - self.mean) / self.std\n        rgb_baseline = (norm_img * self.std + self.mean)\n        bgr_baseline = rgb_baseline[:, :, ::-1]\n        img = mmcv.imdenormalize(norm_img, self.mean, self.std)\n        assert np.allclose(img, bgr_baseline)\n        img = mmcv.imdenormalize(norm_img, self.mean, 
self.std, to_bgr=False)\n        assert np.allclose(img, rgb_baseline)\n\n    def test_iminvert(self):\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img_r = np.array([[255, 127, 0], [254, 128, 1], [253, 126, 2]],\n                         dtype=np.uint8)\n        assert_array_equal(mmcv.iminvert(img), img_r)\n\n    def test_solarize(self):\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img_r = np.array([[0, 127, 0], [1, 127, 1], [2, 126, 2]],\n                         dtype=np.uint8)\n        assert_array_equal(mmcv.solarize(img), img_r)\n        img_r = np.array([[0, 127, 0], [1, 128, 1], [2, 126, 2]],\n                         dtype=np.uint8)\n        assert_array_equal(mmcv.solarize(img, 100), img_r)\n\n    def test_posterize(self):\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img_r = np.array([[0, 128, 128], [0, 0, 128], [0, 128, 128]],\n                         dtype=np.uint8)\n        assert_array_equal(mmcv.posterize(img, 1), img_r)\n        img_r = np.array([[0, 128, 224], [0, 96, 224], [0, 128, 224]],\n                         dtype=np.uint8)\n        assert_array_equal(mmcv.posterize(img, 3), img_r)\n\n    def test_adjust_color(self, nb_rand_test=100):\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n        assert_array_equal(mmcv.adjust_color(img), img)\n        img_gray = mmcv.bgr2gray(img)\n        img_r = np.stack([img_gray, img_gray, img_gray], axis=-1)\n        assert_array_equal(mmcv.adjust_color(img, 0), img_r)\n        assert_array_equal(mmcv.adjust_color(img, 0, 1), img_r)\n        assert_array_equal(\n            mmcv.adjust_color(img, 0.5, 0.5),\n            np.round(np.clip((img * 0.5 + img_r * 0.5), 0,\n         
                    255)).astype(img.dtype))\n        assert_array_equal(\n            mmcv.adjust_color(img, 1, 1.5),\n            np.round(np.clip(img * 1 + img_r * 1.5, 0, 255)).astype(img.dtype))\n        assert_array_equal(\n            mmcv.adjust_color(img, 0.8, -0.6, gamma=2),\n            np.round(np.clip(img * 0.8 - 0.6 * img_r + 2, 0,\n                             255)).astype(img.dtype))\n        assert_array_equal(\n            mmcv.adjust_color(img, 0.8, -0.6, gamma=-0.6),\n            np.round(np.clip(img * 0.8 - 0.6 * img_r - 0.6, 0,\n                             255)).astype(img.dtype))\n\n        # test float type of image\n        img = img.astype(np.float32)\n        assert_array_equal(\n            np.round(mmcv.adjust_color(img, 0.8, -0.6, gamma=-0.6)),\n            np.round(np.clip(img * 0.8 - 0.6 * img_r - 0.6, 0, 255)))\n\n        # test equalize with randomly sampled image.\n        for _ in range(nb_rand_test):\n            img = np.clip(np.random.normal(0, 1, (256, 256, 3)) * 260, 0,\n                          255).astype(np.uint8)\n            factor = np.random.uniform()\n            cv2_img = mmcv.adjust_color(img, alpha=factor)\n            pil_img = mmcv.adjust_color(img, alpha=factor, backend='pillow')\n            np.testing.assert_allclose(cv2_img, pil_img, rtol=0, atol=2)\n\n        # the input type must be uint8 for pillow backend\n        with pytest.raises(AssertionError):\n            mmcv.adjust_color(img.astype(np.float32), backend='pillow')\n\n        # backend must be 'cv2' or 'pillow'\n        with pytest.raises(ValueError):\n            mmcv.adjust_color(img.astype(np.uint8), backend='not support')\n\n    def test_imequalize(self, nb_rand_test=100):\n\n        def _imequalize(img):\n            # equalize the image using PIL.ImageOps.equalize\n            from PIL import Image, ImageOps\n            img = Image.fromarray(img)\n            equalized_img = np.asarray(ImageOps.equalize(img))\n            return 
equalized_img\n\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n        equalized_img = mmcv.imequalize(img)\n        assert_array_equal(equalized_img, _imequalize(img))\n\n        # test equalize with case step=0\n        img = np.array([[0, 0, 0], [120, 120, 120], [255, 255, 255]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n        assert_array_equal(mmcv.imequalize(img), img)\n\n        # test equalize with randomly sampled image.\n        for _ in range(nb_rand_test):\n            img = np.clip(np.random.normal(0, 1, (256, 256, 3)) * 260, 0,\n                          255).astype(np.uint8)\n            equalized_img = mmcv.imequalize(img)\n            assert_array_equal(equalized_img, _imequalize(img))\n\n    def test_adjust_brightness(self, nb_rand_test=100):\n\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n        # test case with factor 1.0\n        assert_array_equal(mmcv.adjust_brightness(img, 1.), img)\n        # test case with factor 0.0\n        assert_array_equal(mmcv.adjust_brightness(img, 0.), np.zeros_like(img))\n        # test adjust_brightness with randomly sampled images and factors.\n        for _ in range(nb_rand_test):\n            img = np.clip(\n                np.random.uniform(0, 1, (1000, 1200, 3)) * 260, 0,\n                255).astype(np.uint8)\n            factor = np.random.uniform() + np.random.choice([0, 1])\n            np.testing.assert_allclose(\n                mmcv.adjust_brightness(img, factor).astype(np.int32),\n                mmcv.adjust_brightness(img, factor,\n                                       backend='pillow').astype(np.int32),\n                rtol=0,\n                atol=1)\n\n        # the input type must be uint8 for pillow backend\n       
 with pytest.raises(AssertionError):\n            mmcv.adjust_brightness(img.astype(np.float32), backend='pillow')\n\n        # backend must be 'cv2' or 'pillow'\n        with pytest.raises(ValueError):\n            mmcv.adjust_brightness(img.astype(np.uint8), backend='not support')\n\n    def test_adjust_contrast(self, nb_rand_test=100):\n\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n        # test case with factor 1.0\n        assert_array_equal(mmcv.adjust_contrast(img, 1.), img)\n        # test case with factor 0.0\n        assert_array_equal(\n            mmcv.adjust_contrast(img, 0.),\n            mmcv.adjust_contrast(img, 0., backend='pillow'))\n        # test adjust_contrast with randomly sampled images and factors.\n        for _ in range(nb_rand_test):\n            img = np.clip(\n                np.random.uniform(0, 1, (1200, 1000, 3)) * 260, 0,\n                255).astype(np.uint8)\n            factor = np.random.uniform() + np.random.choice([0, 1])\n            # Note the gap (less_equal 1) between PIL.ImageEnhance.Contrast\n            # and mmcv.adjust_contrast comes from the gap that converts from\n            # a color image to gray image using mmcv or PIL.\n            np.testing.assert_allclose(\n                mmcv.adjust_contrast(img, factor).astype(np.int32),\n                mmcv.adjust_contrast(img, factor,\n                                     backend='pillow').astype(np.int32),\n                rtol=0,\n                atol=1)\n\n        # the input type must be uint8 pillow backend\n        with pytest.raises(AssertionError):\n            mmcv.adjust_contrast(img.astype(np.float32), backend='pillow')\n\n        # backend must be 'cv2' or 'pillow'\n        with pytest.raises(ValueError):\n            mmcv.adjust_contrast(img.astype(np.uint8), backend='not support')\n\n    def test_auto_contrast(self, nb_rand_test=100):\n\n 
       def _auto_contrast(img, cutoff=0):\n            from PIL import Image\n            from PIL.ImageOps import autocontrast\n\n            # Image.fromarray defaultly supports RGB, not BGR.\n            # convert from BGR to RGB\n            img = Image.fromarray(img[..., ::-1], mode='RGB')\n            contrasted_img = autocontrast(img, cutoff)\n            # convert from RGB to BGR\n            return np.asarray(contrasted_img)[..., ::-1]\n\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n\n        # test case without cut-off\n        assert_array_equal(mmcv.auto_contrast(img), _auto_contrast(img))\n        # test case with cut-off as int\n        assert_array_equal(\n            mmcv.auto_contrast(img, 10), _auto_contrast(img, 10))\n        # test case with cut-off as float\n        assert_array_equal(\n            mmcv.auto_contrast(img, 12.5), _auto_contrast(img, 12.5))\n        # test case with cut-off as tuple\n        assert_array_equal(\n            mmcv.auto_contrast(img, (10, 10)), _auto_contrast(img, 10))\n        # test case with cut-off with sum over 100\n        assert_array_equal(\n            mmcv.auto_contrast(img, 60), _auto_contrast(img, 60))\n\n        # test auto_contrast with randomly sampled images and factors.\n        for _ in range(nb_rand_test):\n            img = np.clip(\n                np.random.uniform(0, 1, (1200, 1000, 3)) * 260, 0,\n                255).astype(np.uint8)\n            # cut-offs are not set as tuple since in `build.yml`, pillow 6.2.2\n            # is installed, which does not support setting low cut-off and high\n            #  cut-off differently.\n            # With pillow above 8.0.0, cutoff can be set as tuple\n            cutoff = np.random.rand() * 100\n            assert_array_equal(\n                mmcv.auto_contrast(img, cutoff), _auto_contrast(img, cutoff))\n\n    def 
test_adjust_sharpness(self, nb_rand_test=100):\n\n        def _adjust_sharpness(img, factor):\n            # adjust the sharpness of image using\n            # PIL.ImageEnhance.Sharpness\n            from PIL import Image\n            from PIL.ImageEnhance import Sharpness\n            img = Image.fromarray(img)\n            sharpened_img = Sharpness(img).enhance(factor)\n            return np.asarray(sharpened_img)\n\n        img = np.array([[0, 128, 255], [1, 127, 254], [2, 129, 253]],\n                       dtype=np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n\n        # test case with invalid type of kernel\n        with pytest.raises(AssertionError):\n            mmcv.adjust_sharpness(img, 1., kernel=1.)\n        # test case with invalid shape of kernel\n        kernel = np.ones((3, 3, 3))\n        with pytest.raises(AssertionError):\n            mmcv.adjust_sharpness(img, 1., kernel=kernel)\n        # test case with all-zero kernel, factor 0.0\n        kernel = np.zeros((3, 3))\n        assert_array_equal(\n            mmcv.adjust_sharpness(img, 0., kernel=kernel), np.zeros_like(img))\n\n        # test case with factor 1.0\n        assert_array_equal(mmcv.adjust_sharpness(img, 1.), img)\n        # test adjust_sharpness with randomly sampled images and factors.\n        for _ in range(nb_rand_test):\n            img = np.clip(\n                np.random.uniform(0, 1, (1000, 1200, 3)) * 260, 0,\n                255).astype(np.uint8)\n            factor = np.random.uniform()\n            # Note the gap between PIL.ImageEnhance.Sharpness and\n            # mmcv.adjust_sharpness mainly comes from the difference ways of\n            # handling img edges when applying filters\n            np.testing.assert_allclose(\n                mmcv.adjust_sharpness(img, factor).astype(np.int32)[1:-1,\n                                                                    1:-1],\n                _adjust_sharpness(img, factor).astype(np.int32)[1:-1, 1:-1],\n          
      rtol=0,\n                atol=1)\n\n    def test_adjust_lighting(self):\n        img = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.uint8)\n        img = np.stack([img, img, img], axis=-1)\n\n        # eigval and eigvec must be np.ndarray\n        with pytest.raises(AssertionError):\n            mmcv.adjust_lighting(img, 1, np.ones((3, 1)))\n        with pytest.raises(AssertionError):\n            mmcv.adjust_lighting(img, np.array([1]), (1, 1, 1))\n        # we must have the same number of eigval and eigvec\n        with pytest.raises(AssertionError):\n            mmcv.adjust_lighting(img, np.array([1]), np.eye(2))\n        with pytest.raises(AssertionError):\n            mmcv.adjust_lighting(img, np.array([1]), np.array([1]))\n\n        img_adjusted = mmcv.adjust_lighting(\n            img,\n            np.random.normal(0, 1, 2),\n            np.random.normal(0, 1, (3, 2)),\n            alphastd=0.)\n        assert_array_equal(img_adjusted, img)\n\n    def test_lut_transform(self):\n        lut_table = np.array(list(range(256)))\n\n        # test assertion image values should between 0 and 255.\n        with pytest.raises(AssertionError):\n            mmcv.lut_transform(np.array([256]), lut_table)\n        with pytest.raises(AssertionError):\n            mmcv.lut_transform(np.array([-1]), lut_table)\n\n        # test assertion lut_table should be ndarray with shape (256, )\n        with pytest.raises(AssertionError):\n            mmcv.lut_transform(np.array([0]), list(range(256)))\n        with pytest.raises(AssertionError):\n            mmcv.lut_transform(np.array([1]), np.array(list(range(257))))\n\n        img = mmcv.lut_transform(self.img, lut_table)\n        baseline = cv2.LUT(self.img, lut_table)\n        assert np.allclose(img, baseline)\n\n        input_img = np.array(\n            [[[0, 128, 255], [255, 128, 0]], [[0, 128, 255], [255, 128, 0]]],\n            dtype=float)\n        img = mmcv.lut_transform(input_img, lut_table)\n        
baseline = cv2.LUT(np.array(input_img, dtype=np.uint8), lut_table)\n        assert np.allclose(img, baseline)\n\n        input_img = np.random.randint(0, 256, size=(7, 8, 9, 10, 11))\n        img = mmcv.lut_transform(input_img, lut_table)\n        baseline = cv2.LUT(np.array(input_img, dtype=np.uint8), lut_table)\n        assert np.allclose(img, baseline)\n\n    def test_clahe(self):\n\n        def _clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):\n            clahe = cv2.createCLAHE(clip_limit, tile_grid_size)\n            return clahe.apply(np.array(img, dtype=np.uint8))\n\n        # test assertion image should have the right shape\n        with pytest.raises(AssertionError):\n            mmcv.clahe(self.img)\n\n        # test assertion tile_grid_size should be a tuple with 2 integers\n        with pytest.raises(AssertionError):\n            mmcv.clahe(self.img[:, :, 0], tile_grid_size=(8.0, 8.0))\n        with pytest.raises(AssertionError):\n            mmcv.clahe(self.img[:, :, 0], tile_grid_size=(8, 8, 8))\n        with pytest.raises(AssertionError):\n            mmcv.clahe(self.img[:, :, 0], tile_grid_size=[8, 8])\n\n        # test with different channels\n        for i in range(self.img.shape[-1]):\n            img = mmcv.clahe(self.img[:, :, i])\n            img_std = _clahe(self.img[:, :, i])\n            assert np.allclose(img, img_std)\n            assert id(img) != id(self.img[:, :, i])\n            assert id(img_std) != id(self.img[:, :, i])\n\n        # test case with clip_limit=1.2\n        for i in range(self.img.shape[-1]):\n            img = mmcv.clahe(self.img[:, :, i], 1.2)\n            img_std = _clahe(self.img[:, :, i], 1.2)\n            assert np.allclose(img, img_std)\n            assert id(img) != id(self.img[:, :, i])\n            assert id(img_std) != id(self.img[:, :, i])\n\n    def test_adjust_hue(self):\n        # test case with img is not ndarray\n        from PIL import Image\n        pil_img = Image.fromarray(self.img)\n\n        
with pytest.raises(TypeError):\n            mmcv.adjust_hue(pil_img, hue_factor=0.0)\n\n        # test case with hue_factor > 0.5 or hue_factor < -0.5\n        with pytest.raises(ValueError):\n            mmcv.adjust_hue(self.img, hue_factor=-0.6)\n        with pytest.raises(ValueError):\n            mmcv.adjust_hue(self.img, hue_factor=0.6)\n\n        for i in np.arange(-0.5, 0.5, 0.2):\n            pil_res = mmcv.adjust_hue(self.img, hue_factor=i, backend='pillow')\n            pil_res = np.array(pil_res)\n            cv2_res = mmcv.adjust_hue(self.img, hue_factor=i)\n            assert np.allclose(pil_res, cv2_res, atol=10.0)\n\n        # test pillow backend\n        with pytest.raises(AssertionError):\n            mmcv.adjust_hue(\n                self.img.astype(np.float32), hue_factor=0, backend='pillow')\n\n        # backend must be 'cv2' or 'pillow'\n        with pytest.raises(ValueError):\n            mmcv.adjust_hue(\n                self.img.astype(np.uint8), hue_factor=0, backend='not support')\n"
  },
  {
    "path": "tests/test_ops/test_active_rotated_filter.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import active_rotated_filter\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\nnp_feature = np.array([[[[[-1.4934e-01, 1.1341e+00, -1.6241e-01],\n                          [-1.0986e+00, -1.1463e+00, -1.3176e+00],\n                          [1.4808e+00, 7.6572e-01, -1.4548e+00]]]],\n                       [[[[1.9370e+00, 6.2799e-01, 2.5834e-02],\n                          [-1.4242e+00, 7.6566e-01, 1.0015e+00],\n                          [9.8669e-01, 4.1356e-01, 6.1068e-01]]]],\n                       [[[[1.4565e+00, 1.4960e+00, 2.4339e-01],\n                          [-2.2484e-01, 7.5942e-01, -8.1184e-01],\n                          [-1.7077e+00, 1.0658e+00, 3.8311e-01]]]],\n                       [[[[8.4734e-01, 1.0904e+00, 2.4356e+00],\n                          [9.5822e-01, 2.2260e-01, -2.4450e-01],\n                          [-1.5078e+00, 7.0902e-02, -1.5921e+00]]]],\n                       [[[[2.1173e+00, -7.3524e-01, 1.8888e+00],\n                          [1.0169e+00, 4.7033e-01, -1.0875e+00],\n                          [-1.0736e+00, -5.2245e-01, -2.8733e-01]]]],\n                       [[[[-5.6433e-01, 1.5835e+00, -1.5826e+00],\n                          [-8.8974e-01, -4.3128e-01, -2.2423e-01],\n                          [1.6552e-03, -1.7292e+00, 2.6639e-01]]]],\n                       [[[[-1.2951e-01, 1.3493e+00, -1.9329e+00],\n                          [5.6248e-01, -5.1189e-01, 1.3614e+00],\n                          [3.3680e-01, -8.7148e-01, 5.0592e-01]]]],\n                       [[[[1.6781e-02, -8.3929e-01, 1.2060e+00],\n                          [-1.0764e+00, 4.7821e-01, 1.5342e+00],\n                          [-4.4542e-01, -1.8606e+00, 3.0827e-01]]]]])\n\nnp_indices = np.array([[[[1, 2, 3, 6, 9, 8, 7, 4], [2, 3, 6, 9, 8, 7, 4, 1],\n                         [3, 6, 9, 8, 7, 4, 1, 2]],\n         
               [[4, 1, 2, 3, 6, 9, 8, 7], [5, 5, 5, 5, 5, 5, 5, 5],\n                         [6, 9, 8, 7, 4, 1, 2, 3]],\n                        [[7, 4, 1, 2, 3, 6, 9, 8], [8, 7, 4, 1, 2, 3, 6, 9],\n                         [9, 8, 7, 4, 1, 2, 3, 6]]]])\n\nexpected_output = np.array([[[[-1.4934e-01, 1.1341e+00, -1.6241e-01],\n                              [-1.0986e+00, -1.1463e+00, -1.3176e+00],\n                              [1.4808e+00, 7.6572e-01, -1.4548e+00]]],\n                            [[[-1.0986e+00, -1.4934e-01, 1.1341e+00],\n                              [1.4808e+00, -1.1463e+00, -1.6241e-01],\n                              [7.6572e-01, -1.4548e+00, -1.3176e+00]]],\n                            [[[1.4808e+00, -1.0986e+00, -1.4934e-01],\n                              [7.6572e-01, -1.1463e+00, 1.1341e+00],\n                              [-1.4548e+00, -1.3176e+00, -1.6241e-01]]],\n                            [[[7.6572e-01, 1.4808e+00, -1.0986e+00],\n                              [-1.4548e+00, -1.1463e+00, -1.4934e-01],\n                              [-1.3176e+00, -1.6241e-01, 1.1341e+00]]],\n                            [[[-1.4548e+00, 7.6572e-01, 1.4808e+00],\n                              [-1.3176e+00, -1.1463e+00, -1.0986e+00],\n                              [-1.6241e-01, 1.1341e+00, -1.4934e-01]]],\n                            [[[-1.3176e+00, -1.4548e+00, 7.6572e-01],\n                              [-1.6241e-01, -1.1463e+00, 1.4808e+00],\n                              [1.1341e+00, -1.4934e-01, -1.0986e+00]]],\n                            [[[-1.6241e-01, -1.3176e+00, -1.4548e+00],\n                              [1.1341e+00, -1.1463e+00, 7.6572e-01],\n                              [-1.4934e-01, -1.0986e+00, 1.4808e+00]]],\n                            [[[1.1341e+00, -1.6241e-01, -1.3176e+00],\n                              [-1.4934e-01, -1.1463e+00, -1.4548e+00],\n                              [-1.0986e+00, 1.4808e+00, 7.6572e-01]]],\n                       
     [[[1.9370e+00, 6.2799e-01, 2.5834e-02],\n                              [-1.4242e+00, 7.6566e-01, 1.0015e+00],\n                              [9.8669e-01, 4.1356e-01, 6.1068e-01]]],\n                            [[[-1.4242e+00, 1.9370e+00, 6.2799e-01],\n                              [9.8669e-01, 7.6566e-01, 2.5834e-02],\n                              [4.1356e-01, 6.1068e-01, 1.0015e+00]]],\n                            [[[9.8669e-01, -1.4242e+00, 1.9370e+00],\n                              [4.1356e-01, 7.6566e-01, 6.2799e-01],\n                              [6.1068e-01, 1.0015e+00, 2.5834e-02]]],\n                            [[[4.1356e-01, 9.8669e-01, -1.4242e+00],\n                              [6.1068e-01, 7.6566e-01, 1.9370e+00],\n                              [1.0015e+00, 2.5834e-02, 6.2799e-01]]],\n                            [[[6.1068e-01, 4.1356e-01, 9.8669e-01],\n                              [1.0015e+00, 7.6566e-01, -1.4242e+00],\n                              [2.5834e-02, 6.2799e-01, 1.9370e+00]]],\n                            [[[1.0015e+00, 6.1068e-01, 4.1356e-01],\n                              [2.5834e-02, 7.6566e-01, 9.8669e-01],\n                              [6.2799e-01, 1.9370e+00, -1.4242e+00]]],\n                            [[[2.5834e-02, 1.0015e+00, 6.1068e-01],\n                              [6.2799e-01, 7.6566e-01, 4.1356e-01],\n                              [1.9370e+00, -1.4242e+00, 9.8669e-01]]],\n                            [[[6.2799e-01, 2.5834e-02, 1.0015e+00],\n                              [1.9370e+00, 7.6566e-01, 6.1068e-01],\n                              [-1.4242e+00, 9.8669e-01, 4.1356e-01]]],\n                            [[[1.4565e+00, 1.4960e+00, 2.4339e-01],\n                              [-2.2484e-01, 7.5942e-01, -8.1184e-01],\n                              [-1.7077e+00, 1.0658e+00, 3.8311e-01]]],\n                            [[[-2.2484e-01, 1.4565e+00, 1.4960e+00],\n                              [-1.7077e+00, 7.5942e-01, 
2.4339e-01],\n                              [1.0658e+00, 3.8311e-01, -8.1184e-01]]],\n                            [[[-1.7077e+00, -2.2484e-01, 1.4565e+00],\n                              [1.0658e+00, 7.5942e-01, 1.4960e+00],\n                              [3.8311e-01, -8.1184e-01, 2.4339e-01]]],\n                            [[[1.0658e+00, -1.7077e+00, -2.2484e-01],\n                              [3.8311e-01, 7.5942e-01, 1.4565e+00],\n                              [-8.1184e-01, 2.4339e-01, 1.4960e+00]]],\n                            [[[3.8311e-01, 1.0658e+00, -1.7077e+00],\n                              [-8.1184e-01, 7.5942e-01, -2.2484e-01],\n                              [2.4339e-01, 1.4960e+00, 1.4565e+00]]],\n                            [[[-8.1184e-01, 3.8311e-01, 1.0658e+00],\n                              [2.4339e-01, 7.5942e-01, -1.7077e+00],\n                              [1.4960e+00, 1.4565e+00, -2.2484e-01]]],\n                            [[[2.4339e-01, -8.1184e-01, 3.8311e-01],\n                              [1.4960e+00, 7.5942e-01, 1.0658e+00],\n                              [1.4565e+00, -2.2484e-01, -1.7077e+00]]],\n                            [[[1.4960e+00, 2.4339e-01, -8.1184e-01],\n                              [1.4565e+00, 7.5942e-01, 3.8311e-01],\n                              [-2.2484e-01, -1.7077e+00, 1.0658e+00]]],\n                            [[[8.4734e-01, 1.0904e+00, 2.4356e+00],\n                              [9.5822e-01, 2.2260e-01, -2.4450e-01],\n                              [-1.5078e+00, 7.0902e-02, -1.5921e+00]]],\n                            [[[9.5822e-01, 8.4734e-01, 1.0904e+00],\n                              [-1.5078e+00, 2.2260e-01, 2.4356e+00],\n                              [7.0902e-02, -1.5921e+00, -2.4450e-01]]],\n                            [[[-1.5078e+00, 9.5822e-01, 8.4734e-01],\n                              [7.0902e-02, 2.2260e-01, 1.0904e+00],\n                              [-1.5921e+00, -2.4450e-01, 2.4356e+00]]],\n      
                      [[[7.0902e-02, -1.5078e+00, 9.5822e-01],\n                              [-1.5921e+00, 2.2260e-01, 8.4734e-01],\n                              [-2.4450e-01, 2.4356e+00, 1.0904e+00]]],\n                            [[[-1.5921e+00, 7.0902e-02, -1.5078e+00],\n                              [-2.4450e-01, 2.2260e-01, 9.5822e-01],\n                              [2.4356e+00, 1.0904e+00, 8.4734e-01]]],\n                            [[[-2.4450e-01, -1.5921e+00, 7.0902e-02],\n                              [2.4356e+00, 2.2260e-01, -1.5078e+00],\n                              [1.0904e+00, 8.4734e-01, 9.5822e-01]]],\n                            [[[2.4356e+00, -2.4450e-01, -1.5921e+00],\n                              [1.0904e+00, 2.2260e-01, 7.0902e-02],\n                              [8.4734e-01, 9.5822e-01, -1.5078e+00]]],\n                            [[[1.0904e+00, 2.4356e+00, -2.4450e-01],\n                              [8.4734e-01, 2.2260e-01, -1.5921e+00],\n                              [9.5822e-01, -1.5078e+00, 7.0902e-02]]],\n                            [[[2.1173e+00, -7.3524e-01, 1.8888e+00],\n                              [1.0169e+00, 4.7033e-01, -1.0875e+00],\n                              [-1.0736e+00, -5.2245e-01, -2.8733e-01]]],\n                            [[[1.0169e+00, 2.1173e+00, -7.3524e-01],\n                              [-1.0736e+00, 4.7033e-01, 1.8888e+00],\n                              [-5.2245e-01, -2.8733e-01, -1.0875e+00]]],\n                            [[[-1.0736e+00, 1.0169e+00, 2.1173e+00],\n                              [-5.2245e-01, 4.7033e-01, -7.3524e-01],\n                              [-2.8733e-01, -1.0875e+00, 1.8888e+00]]],\n                            [[[-5.2245e-01, -1.0736e+00, 1.0169e+00],\n                              [-2.8733e-01, 4.7033e-01, 2.1173e+00],\n                              [-1.0875e+00, 1.8888e+00, -7.3524e-01]]],\n                            [[[-2.8733e-01, -5.2245e-01, -1.0736e+00],\n                  
            [-1.0875e+00, 4.7033e-01, 1.0169e+00],\n                              [1.8888e+00, -7.3524e-01, 2.1173e+00]]],\n                            [[[-1.0875e+00, -2.8733e-01, -5.2245e-01],\n                              [1.8888e+00, 4.7033e-01, -1.0736e+00],\n                              [-7.3524e-01, 2.1173e+00, 1.0169e+00]]],\n                            [[[1.8888e+00, -1.0875e+00, -2.8733e-01],\n                              [-7.3524e-01, 4.7033e-01, -5.2245e-01],\n                              [2.1173e+00, 1.0169e+00, -1.0736e+00]]],\n                            [[[-7.3524e-01, 1.8888e+00, -1.0875e+00],\n                              [2.1173e+00, 4.7033e-01, -2.8733e-01],\n                              [1.0169e+00, -1.0736e+00, -5.2245e-01]]],\n                            [[[-5.6433e-01, 1.5835e+00, -1.5826e+00],\n                              [-8.8974e-01, -4.3128e-01, -2.2423e-01],\n                              [1.6552e-03, -1.7292e+00, 2.6639e-01]]],\n                            [[[-8.8974e-01, -5.6433e-01, 1.5835e+00],\n                              [1.6552e-03, -4.3128e-01, -1.5826e+00],\n                              [-1.7292e+00, 2.6639e-01, -2.2423e-01]]],\n                            [[[1.6552e-03, -8.8974e-01, -5.6433e-01],\n                              [-1.7292e+00, -4.3128e-01, 1.5835e+00],\n                              [2.6639e-01, -2.2423e-01, -1.5826e+00]]],\n                            [[[-1.7292e+00, 1.6552e-03, -8.8974e-01],\n                              [2.6639e-01, -4.3128e-01, -5.6433e-01],\n                              [-2.2423e-01, -1.5826e+00, 1.5835e+00]]],\n                            [[[2.6639e-01, -1.7292e+00, 1.6552e-03],\n                              [-2.2423e-01, -4.3128e-01, -8.8974e-01],\n                              [-1.5826e+00, 1.5835e+00, -5.6433e-01]]],\n                            [[[-2.2423e-01, 2.6639e-01, -1.7292e+00],\n                              [-1.5826e+00, -4.3128e-01, 1.6552e-03],\n                 
             [1.5835e+00, -5.6433e-01, -8.8974e-01]]],\n                            [[[-1.5826e+00, -2.2423e-01, 2.6639e-01],\n                              [1.5835e+00, -4.3128e-01, -1.7292e+00],\n                              [-5.6433e-01, -8.8974e-01, 1.6552e-03]]],\n                            [[[1.5835e+00, -1.5826e+00, -2.2423e-01],\n                              [-5.6433e-01, -4.3128e-01, 2.6639e-01],\n                              [-8.8974e-01, 1.6552e-03, -1.7292e+00]]],\n                            [[[-1.2951e-01, 1.3493e+00, -1.9329e+00],\n                              [5.6248e-01, -5.1189e-01, 1.3614e+00],\n                              [3.3680e-01, -8.7148e-01, 5.0592e-01]]],\n                            [[[5.6248e-01, -1.2951e-01, 1.3493e+00],\n                              [3.3680e-01, -5.1189e-01, -1.9329e+00],\n                              [-8.7148e-01, 5.0592e-01, 1.3614e+00]]],\n                            [[[3.3680e-01, 5.6248e-01, -1.2951e-01],\n                              [-8.7148e-01, -5.1189e-01, 1.3493e+00],\n                              [5.0592e-01, 1.3614e+00, -1.9329e+00]]],\n                            [[[-8.7148e-01, 3.3680e-01, 5.6248e-01],\n                              [5.0592e-01, -5.1189e-01, -1.2951e-01],\n                              [1.3614e+00, -1.9329e+00, 1.3493e+00]]],\n                            [[[5.0592e-01, -8.7148e-01, 3.3680e-01],\n                              [1.3614e+00, -5.1189e-01, 5.6248e-01],\n                              [-1.9329e+00, 1.3493e+00, -1.2951e-01]]],\n                            [[[1.3614e+00, 5.0592e-01, -8.7148e-01],\n                              [-1.9329e+00, -5.1189e-01, 3.3680e-01],\n                              [1.3493e+00, -1.2951e-01, 5.6248e-01]]],\n                            [[[-1.9329e+00, 1.3614e+00, 5.0592e-01],\n                              [1.3493e+00, -5.1189e-01, -8.7148e-01],\n                              [-1.2951e-01, 5.6248e-01, 3.3680e-01]]],\n                       
     [[[1.3493e+00, -1.9329e+00, 1.3614e+00],\n                              [-1.2951e-01, -5.1189e-01, 5.0592e-01],\n                              [5.6248e-01, 3.3680e-01, -8.7148e-01]]],\n                            [[[1.6781e-02, -8.3929e-01, 1.2060e+00],\n                              [-1.0764e+00, 4.7821e-01, 1.5342e+00],\n                              [-4.4542e-01, -1.8606e+00, 3.0827e-01]]],\n                            [[[-1.0764e+00, 1.6781e-02, -8.3929e-01],\n                              [-4.4542e-01, 4.7821e-01, 1.2060e+00],\n                              [-1.8606e+00, 3.0827e-01, 1.5342e+00]]],\n                            [[[-4.4542e-01, -1.0764e+00, 1.6781e-02],\n                              [-1.8606e+00, 4.7821e-01, -8.3929e-01],\n                              [3.0827e-01, 1.5342e+00, 1.2060e+00]]],\n                            [[[-1.8606e+00, -4.4542e-01, -1.0764e+00],\n                              [3.0827e-01, 4.7821e-01, 1.6781e-02],\n                              [1.5342e+00, 1.2060e+00, -8.3929e-01]]],\n                            [[[3.0827e-01, -1.8606e+00, -4.4542e-01],\n                              [1.5342e+00, 4.7821e-01, -1.0764e+00],\n                              [1.2060e+00, -8.3929e-01, 1.6781e-02]]],\n                            [[[1.5342e+00, 3.0827e-01, -1.8606e+00],\n                              [1.2060e+00, 4.7821e-01, -4.4542e-01],\n                              [-8.3929e-01, 1.6781e-02, -1.0764e+00]]],\n                            [[[1.2060e+00, 1.5342e+00, 3.0827e-01],\n                              [-8.3929e-01, 4.7821e-01, -1.8606e+00],\n                              [1.6781e-02, -1.0764e+00, -4.4542e-01]]],\n                            [[[-8.3929e-01, 1.2060e+00, 1.5342e+00],\n                              [1.6781e-02, 4.7821e-01, 3.0827e-01],\n                              [-1.0764e+00, -4.4542e-01, -1.8606e+00]]]])\n\nexpected_grad = np.array([[[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          
[[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          [[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          [[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          [[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          [[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          [[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]],\n                          [[[[8., 8., 8.], [8., 8., 8.], [8., 8., 8.]]]]])\n\n\n@pytest.mark.parametrize('device', [\n    'cpu',\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_active_rotated_filter(device):\n    feature = torch.tensor(\n        np_feature, dtype=torch.float, device=device, requires_grad=True)\n    indices = torch.tensor(np_indices, dtype=torch.int, device=device)\n    output = active_rotated_filter(feature, indices)\n    output.backward(torch.ones_like(output))\n    assert np.allclose(output.data.cpu().numpy(), expected_output, atol=1e-3)\n    assert np.allclose(\n        feature.grad.data.cpu().numpy(), expected_grad, atol=1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_assign_score_withk.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import assign_score_withk\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_paconv_assign_scores(device):\n    scores = torch.tensor(\n        [[[[0.06947571, 0.6065746], [0.28462553, 0.8378516],\n           [0.7595994, 0.97220325], [0.519155, 0.766185]],\n          [[0.15348864, 0.6051019], [0.21510637, 0.31916398],\n           [0.00236845, 0.5842595], [0.6783676, 0.5216348]]],\n         [[[0.23089725, 0.5568468], [0.7405102, 0.06438422],\n           [0.6887394, 0.22089851], [0.0502342, 0.79228795]],\n          [[0.44883424, 0.15427643], [0.13817799, 0.34856772],\n           [0.7989621, 0.33788306], [0.15699774, 0.7693662]]]],\n        device=device).float()\n    points = torch.tensor(\n        [[[[0.06001121, 0.92963666, 0.5753327, 0.7251477],\n           [0.53563064, 0.23129565, 0.92366195, 0.44261628]],\n          [[0.5770022, 0.56625944, 0.23560429, 0.11178821],\n           [0.7735967, 0.95678777, 0.25468266, 0.02895975]],\n          [[0.0589869, 0.09017515, 0.5977862, 0.02797985],\n           [0.603862, 0.35991007, 0.85761684, 0.3096559]],\n          [[0.22359002, 0.13983732, 0.5544243, 0.68863827],\n           [0.85646236, 0.75651926, 0.8638947, 0.83600986]],\n          [[0.45424145, 0.27458847, 0.6456112, 0.47162914],\n           [0.15773582, 0.47645122, 0.79964715, 0.3323908]],\n          [[0.8351399, 0.84696376, 0.9431732, 0.29418713],\n           
[0.77168906, 0.6996871, 0.19354361, 0.03392768]],\n          [[0.30976456, 0.7074133, 0.581795, 0.976677],\n           [0.69656056, 0.07199162, 0.4708506, 0.29117996]],\n          [[0.5829035, 0.30201727, 0.76556486, 0.0935446],\n           [0.88030535, 0.16129416, 0.9242525, 0.49545723]]],\n         [[[0.50899494, 0.06482804, 0.44939405, 0.37704808],\n           [0.47028124, 0.11969638, 0.62823206, 0.28560323]],\n          [[0.40690207, 0.689753, 0.51636654, 0.23040164],\n           [0.06935787, 0.00488842, 0.22462702, 0.09182382]],\n          [[0.26611632, 0.00184339, 0.7730655, 0.5228131],\n           [0.87776035, 0.77895886, 0.2787183, 0.16620636]],\n          [[0.502574, 0.04039001, 0.5368497, 0.98379374],\n           [0.40973026, 0.3238272, 0.9733018, 0.13988364]],\n          [[0.04586202, 0.20983845, 0.20662665, 0.22270602],\n           [0.60387236, 0.5155574, 0.51237285, 0.6528438]],\n          [[0.45735973, 0.86821306, 0.61054605, 0.8370336],\n           [0.45193362, 0.3734138, 0.7825672, 0.5699416]],\n          [[0.44591594, 0.12447512, 0.09282011, 0.7055254],\n           [0.25223452, 0.46696228, 0.7051136, 0.892151]],\n          [[0.49615085, 0.47321403, 0.93138885, 0.7652197],\n           [0.38766378, 0.30332977, 0.23131835, 0.02863514]]]],\n        device=device).float()\n    centers = torch.tensor(\n        [[[[0.83878064, 0.96658987, 0.8033424, 0.9598312],\n           [0.45035273, 0.8768925, 0.977736, 0.54547966]],\n          [[0.01041394, 0.597893, 0.36212963, 0.4410367],\n           [0.94879234, 0.8372817, 0.21237361, 0.67945415]],\n          [[0.5096087, 0.26401454, 0.60034937, 0.5417416],\n           [0.87591463, 0.546456, 0.4096033, 0.16373193]],\n          [[0.79547447, 0.1482386, 0.12840575, 0.45384115],\n           [0.5640288, 0.944541, 0.5745328, 0.73229736]],\n          [[0.93011934, 0.7406011, 0.62621707, 0.8677915],\n           [0.91563636, 0.3595413, 0.6678378, 0.6085383]],\n          [[0.22431666, 0.65617776, 0.7483924, 0.6263364],\n    
       [0.30968404, 0.78204364, 0.14899081, 0.09628749]],\n          [[0.73675203, 0.72104895, 0.4648038, 0.6101647],\n           [0.7817645, 0.16572917, 0.3311919, 0.43407398]],\n          [[0.8193154, 0.09559608, 0.05978829, 0.90262103],\n           [0.4256065, 0.8165596, 0.8206446, 0.6604721]]],\n         [[[0.7159653, 0.18600845, 0.21433902, 0.3159626],\n           [0.3921569, 0.33221376, 0.5061177, 0.7961841]],\n          [[0.95338356, 0.04785997, 0.67185795, 0.6538394],\n           [0.4729132, 0.33404195, 0.17750603, 0.8445621]],\n          [[0.6755793, 0.16193843, 0.75943846, 0.92123103],\n           [0.2781859, 0.03114432, 0.710638, 0.52729136]],\n          [[0.8376105, 0.10858494, 0.13208169, 0.365772],\n           [0.5930795, 0.27390373, 0.14036089, 0.170403]],\n          [[0.3479789, 0.89855295, 0.04844379, 0.9871029],\n           [0.29781651, 0.0244137, 0.9179047, 0.8081611]],\n          [[0.12460887, 0.44991326, 0.19382608, 0.35037738],\n           [0.2773472, 0.4362057, 0.36757517, 0.5993509]],\n          [[0.29630446, 0.90046406, 0.5417113, 0.13510644],\n           [0.09623539, 0.04226565, 0.32001644, 0.44358212]],\n          [[0.5274848, 0.82096446, 0.9415489, 0.7123748],\n           [0.7537517, 0.8086482, 0.85345286, 0.7472754]]]],\n        device=device).float()\n    if device == 'cuda' or device == 'musa':\n        points.requires_grad_()\n        scores.requires_grad_()\n        centers.requires_grad_()\n    knn_idx = torch.tensor(\n        [[[6, 7, 4, 6], [2, 4, 2, 4]], [[7, 1, 3, 2], [6, 0, 2, 6]]],\n        device=device).long()\n    aggregate = 'sum'\n    expected_output = torch.tensor(\n        [[[[-0.08134781, 0.03877336, -0.8212776, -0.2869547],\n           [-0.23378491, -0.24112664, -0.1600166, -0.4121864]],\n          [[-0.05780616, -0.12298299, -0.0370461, -0.07889931],\n           [-0.13956165, -0.02006848, -0.10940295, -0.0293439]],\n          [[0.09284145, 0.58250105, 0.5927749, 0.16774094],\n           [0.27070042, 0.13422406, 
0.2617501, 0.23416464]],\n          [[-0.06121218, -0.09561322, -0.20408826, 0.08079343],\n           [0.00944228, 0.03874819, 0.08404065, 0.04041629]]],\n         [[[-0.2110898, -0.13335688, -0.09315082, 0.08512095],\n           [0.09121774, 0.15976946, 0.23994486, 0.14350912]],\n          [[-0.36167958, -0.14891288, -0.64470863, -0.0646704],\n           [-0.28276974, -0.08847666, -0.46904767, 0.20491874]],\n          [[-0.34877953, -0.35533834, -0.25225785, -0.4638189],\n           [-0.1420663, 0.09467781, 0.17088932, 0.22580585]],\n          [[-0.3879708, -0.3991068, 0.05276498, -0.46989647],\n           [0.32522714, -0.02163534, 0.21604237, 0.4346682]]]]).float()\n\n    # test forward\n    output = assign_score_withk(scores, points, centers, knn_idx, aggregate)\n    assert torch.allclose(output.detach().cpu(), expected_output, atol=1e-6)\n\n    # test backward\n    if device == 'cuda' or device == 'musa':\n        loss = output.sum()\n        loss.backward()\n        expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683],\n                                               [-0.78873926, 0.7485497],\n                                               [-0.6866992, 0.05346543],\n                                               [0.04288036, -0.18217683]],\n                                              [[-1.1407862, 0.13533896],\n                                               [-0.06964391, -0.22948086],\n                                               [-1.1407862, 0.13533896],\n                                               [-0.06964391, -0.22948086]]],\n                                             [[[-0.3363995, -2.212181],\n                                               [-1.1589496, -2.7724311],\n                                               [-0.9387654, -1.3163853],\n                                               [-1.4385346, -1.0614843]],\n                                              [[-0.5048497, 1.4143617],\n                                               
[-0.47332114, 0.6017133],\n                                               [-0.30974793, 1.1995442],\n                                               [-0.5048497,\n                                                1.4143617]]]]).float()\n        expected_points_grad = torch.tensor(\n            [[[[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0.15585709, 0.15585709, 0.15585709, 0.15585709],\n               [1.1893613, 1.1893613, 1.1893613, 1.1893613]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[1.6530733, 1.6530733, 1.6530733, 1.6530733],\n               [1.8130021, 1.8130021, 1.8130021, 1.8130021]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0.58863074, 0.58863074, 0.58863074, 0.58863074],\n               [1.3727596, 1.3727596, 1.3727596, 1.3727596]],\n              [[0.28462553, 0.28462553, 0.28462553, 0.28462553],\n               [0.8378516, 0.8378516, 0.8378516, 0.8378516]]],\n             [[[0.13817799, 0.13817799, 0.13817799, 0.13817799],\n               [0.34856772, 0.34856772, 0.34856772, 0.34856772]],\n              [[0.7405102, 0.7405102, 0.7405102, 0.7405102],\n               [0.06438422, 0.06438422, 0.06438422, 0.06438422]],\n              [[0.8491963, 0.8491963, 0.8491963, 0.8491963],\n               [1.1301711, 1.1301711, 1.1301711, 1.1301711]],\n              [[0.6887394, 0.6887394, 0.6887394, 0.6887394],\n               [0.22089851, 0.22089851, 0.22089851, 0.22089851]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0.605832, 0.605832, 0.605832, 0.605832],\n               [0.92364264, 0.92364264, 0.92364264, 0.92364264]],\n              [[0.23089725, 0.23089725, 0.23089725, 0.23089725],\n               [0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float()\n        expected_centers_grad = torch.tensor(\n            [[[[0., 0., 0., 0.], [0., 0., 0., 0.]],\n         
     [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[-1.0493311, -1.0493311, -1.0493311, -1.0493311],\n               [-2.0301602, -2.0301602, -2.0301602, -2.0301602]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[-1.6328557, -1.6328557, -1.6328557, -1.6328557],\n               [-3.1828144, -3.1828144, -3.1828144, -3.1828144]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]]],\n             [[[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[0., 0., 0., 0.], [0., 0., 0., 0.]],\n              [[-1.5429721, -1.5429721, -1.5429721, -1.5429721],\n               [-1.6100934, -1.6100934, -1.6100934, -1.6100934]],\n              [[-1.7103812, -1.7103812, -1.7103812, -1.7103812],\n               [-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float()\n        assert torch.allclose(\n            scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6)\n        assert torch.allclose(\n            points.grad.detach().cpu(), expected_points_grad, atol=1e-6)\n        assert torch.allclose(\n            centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6)\n"
  },
  {
    "path": "tests/test_ops/test_ball_query.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import ball_query\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_ball_query(device):\n    new_xyz = torch.tensor(\n        [[[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],\n          [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],\n          [-0.0740, 1.3147, -1.3625]],\n         [[-2.0289, 2.4952, -0.1708], [-2.0668, 6.0278, -0.4875],\n          [0.4066, 1.4211, -0.2947], [-2.0289, 2.4952, -0.1708],\n          [-2.0289, 2.4952, -0.1708]]],\n        device=device)\n\n    xyz = torch.tensor(\n        [[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],\n          [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],\n          [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],\n          [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645],\n          [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496]],\n         [[-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096],\n          [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],\n          [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],\n          [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],\n          [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]]],\n        
device=device)\n\n    idx = ball_query(0, 0.2, 5, xyz, new_xyz)\n    expected_idx = torch.tensor(\n        [[[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],\n          [0, 0, 0, 0, 0]],\n         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],\n          [0, 0, 0, 0, 0]]],\n        device=device)\n    assert torch.all(idx == expected_idx)\n\n    # test dilated ball query\n    idx = ball_query(0.2, 0.4, 5, xyz, new_xyz)\n    expected_idx = torch.tensor(\n        [[[0, 5, 7, 0, 0], [6, 6, 6, 6, 6], [2, 3, 2, 2, 2], [0, 5, 7, 0, 0],\n          [0, 5, 7, 0, 0]],\n         [[0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7], [0, 0, 0, 0, 0],\n          [0, 0, 0, 0, 0]]],\n        device=device)\n    assert torch.all(idx == expected_idx)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_stack_ball_query(device):\n    new_xyz = torch.tensor(\n        [[-0.0740, 1.3147, -1.3625], [-2.2769, 2.7817, -0.2334],\n         [-0.4003, 2.4666, -0.5116], [-0.0740, 1.3147, -1.3625],\n         [-0.0740, 1.3147, -1.3625], [-2.0289, 2.4952, -0.1708],\n         [-2.0668, 6.0278, -0.4875], [0.4066, 1.4211, -0.2947],\n         [-2.0289, 2.4952, -0.1708], [-2.0289, 2.4952, -0.1708]],\n        device=device)\n    new_xyz_batch_cnt = torch.tensor([5, 5], dtype=torch.int32, device=device)\n    xyz = torch.tensor([[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],\n                        [-0.4003, 2.4666, -0.5116], [-0.5251, 2.4379, -0.8466],\n                        [-0.9691, 1.1418, -1.3733], [-0.2232, 0.9561, -1.3626],\n          
              [-2.2769, 2.7817, -0.2334], [-0.2822, 1.3192, -1.3645],\n                        [0.1533, 1.5024, -1.0432], [0.4917, 1.1529, -1.3496],\n                        [-2.0289, 2.4952, -0.1708], [-0.7188, 0.9956, -0.5096],\n                        [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],\n                        [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],\n                        [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],\n                        [0.3220, 1.4447, 0.3548], [-0.9744, 2.3856, -1.2000]],\n                       device=device)\n    xyz_batch_cnt = torch.tensor([10, 10], dtype=torch.int32, device=device)\n    idx = ball_query(0, 0.2, 5, xyz, new_xyz, xyz_batch_cnt, new_xyz_batch_cnt)\n    expected_idx = torch.tensor(\n        [[0, 0, 0, 0, 0], [6, 6, 6, 6, 6], [2, 2, 2, 2, 2], [0, 0, 0, 0, 0],\n         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [2, 2, 2, 2, 2], [7, 7, 7, 7, 7],\n         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],\n        device=device)\n    assert torch.all(idx == expected_idx)\n\n    if device == 'cuda' or device == 'npu':\n        xyz = xyz.double()\n        new_xyz = new_xyz.double()\n        expected_idx = expected_idx.double()\n        idx = ball_query(0, 0.2, 5, xyz, new_xyz, xyz_batch_cnt,\n                         new_xyz_batch_cnt)\n        assert torch.all(idx == expected_idx)\n\n    xyz = xyz.half()\n    new_xyz = new_xyz.half()\n    expected_idx = expected_idx.half()\n    idx = ball_query(0, 0.2, 5, xyz, new_xyz, xyz_batch_cnt, new_xyz_batch_cnt)\n    assert torch.all(idx == expected_idx)\n"
  },
  {
    "path": "tests/test_ops/test_bbox.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\nfrom mmengine.utils import digit_version\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE,\n                        IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE)\n\n\nclass TestBBox:\n\n    def _test_bbox_overlaps(self, device='cpu', dtype=torch.float):\n        from mmcv.ops import bbox_overlaps\n        b1 = torch.tensor([[1.0, 1.0, 3.0, 4.0], [2.0, 2.0, 3.0, 4.0],\n                           [7.0, 7.0, 8.0, 8.0]]).to(device).type(dtype)\n        b2 = torch.tensor([[0.0, 2.0, 2.0, 5.0], [2.0, 1.0, 3.0,\n                                                  3.0]]).to(device).type(dtype)\n        should_output = np.array([[0.33333334, 0.5], [0.2, 0.5], [0.0, 0.0]])\n        out = bbox_overlaps(b1, b2, offset=1)\n        assert np.allclose(out.cpu().numpy(), should_output, 1e-2)\n\n        b1 = torch.tensor([[1.0, 1.0, 3.0, 4.0], [2.0, 2.0, 3.0,\n                                                  4.0]]).to(device).type(dtype)\n        b2 = torch.tensor([[0.0, 2.0, 2.0, 5.0], [2.0, 1.0, 3.0,\n                                                  3.0]]).to(device).type(dtype)\n        should_output = np.array([0.33333334, 0.5])\n        out = bbox_overlaps(b1, b2, aligned=True, offset=1)\n        assert np.allclose(out.cpu().numpy(), should_output, 1e-2)\n\n        b1 = torch.tensor([[0.0, 0.0, 3.0, 3.0]]).to(device).type(dtype)\n        b2 = torch.tensor([[4.0, 0.0, 5.0, 3.0], [3.0, 0.0, 4.0, 3.0],\n                           [2.0, 0.0, 3.0, 3.0], [1.0, 0.0, 2.0,\n                                                  3.0]]).to(device).type(dtype)\n        should_output = np.array([0, 0.2, 0.5, 0.5])\n        out = bbox_overlaps(b1, b2, offset=1)\n        assert np.allclose(out.cpu().numpy(), should_output, 1e-2)\n\n        b1 = torch.tensor([[10.0 + i, 10.0 + i, 30.0 + i, 30.0 + i]\n                           for i in 
range(1000)]).to(device).type(dtype)\n        b2 = torch.tensor([[20.0 + i, 20.0 + i, 40.0 + i, 40.0 + i]\n                           for i in range(1000)]).to(device).type(dtype)\n        should_output = np.array([1 / 7] * 1000)\n        out = bbox_overlaps(b1, b2, aligned=True)\n        assert np.allclose(out.cpu().numpy(), should_output, 1e-2)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'mps',\n            marks=pytest.mark.skipif(\n                not IS_MPS_AVAILABLE\n                or digit_version(torch.__version__) >= digit_version('2.1.0'),\n                reason='requires MPS support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_bbox_overlaps_float(self, device):\n        self._test_bbox_overlaps(device, dtype=torch.float)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n         
       not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_bbox_overlaps_half(self, device):\n        self._test_bbox_overlaps(device, dtype=torch.half)\n"
  },
  {
    "path": "tests/test_ops/test_bezier_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\ninputs = ([[[\n    [1., 2., 5., 6.],\n    [3., 4., 7., 8.],\n    [9., 10., 13., 14.],\n    [11., 12., 15., 16.],\n]]], [[0., 0., 0., 1, 0., 2., 0., 3., 0., 3., 3., 2., 3., 1., 3., 0., 3.]])\noutputs = ([[[[1., 1.75, 3.5, 5.25], [2.5, 3.25, 5., 6.75],\n              [6., 6.75, 8.5, 10.25],\n              [9.5, 10.25, 12., 13.75]]]], [[[[1.5625, 1.5625, 1.5625, 0.3125],\n                                              [1.5625, 1.5625, 1.5625, 0.3125],\n                                              [1.5625, 1.5625, 1.5625, 0.3125],\n                                              [0.3125, 0.3125, 0.3125,\n                                               0.0625]]]])\n\n\n@pytest.mark.parametrize('device', [\n    'cpu',\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\n@pytest.mark.parametrize('dtype', [\n    torch.float,\n    pytest.param(\n        torch.double,\n        marks=[\n            pytest.mark.skipif(\n                IS_MUSA_AVAILABLE,\n                reason='MUSA does not support for 64-bit floating point')\n        ]), torch.half\n])\ndef test_bezieralign(device, dtype):\n    try:\n        from mmcv.ops import bezier_align\n    except ModuleNotFoundError:\n        pytest.skip('test requires compilation')\n    pool_h = 4\n    pool_w = 4\n    spatial_scale = 1.0\n    sampling_ratio = 1\n    np_input = np.array(inputs[0])\n    np_rois = np.array(inputs[1])\n    np_output = np.array(outputs[0])\n    np_grad = np.array(outputs[1])\n\n    x = torch.tensor(np_input, dtype=dtype, device=device, requires_grad=True)\n    rois = torch.tensor(np_rois, dtype=dtype, 
device=device)\n\n    output = bezier_align(x, rois, (pool_h, pool_w), spatial_scale,\n                          sampling_ratio, False)\n    output.backward(torch.ones_like(output))\n    assert np.allclose(\n        output.data.type(torch.float).cpu().numpy(), np_output, atol=1e-3)\n    assert np.allclose(\n        x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_bias_act.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import bias_act\nfrom mmcv.ops.bias_act import EasyDict\nfrom mmcv.utils import IS_MUSA_AVAILABLE\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck, gradgradcheck\n    _USING_PARROTS = False\n\n\nclass TestBiasAct:\n\n    @classmethod\n    def setup_class(cls):\n        cls.input_tensor = torch.randn((1, 3), requires_grad=True)\n        cls.bias = torch.randn(3, requires_grad=True)\n\n    def test_bias_act_cpu(self):\n        out = bias_act(self.input_tensor, self.bias)\n        assert out.shape == (1, 3)\n\n        # test with different dim\n        input_tensor = torch.randn((1, 1, 3), requires_grad=True)\n        bias = torch.randn(3, requires_grad=True)\n        out = bias_act(input_tensor, bias, dim=2)\n        assert out.shape == (1, 1, 3)\n\n        # test with different act\n        out = bias_act(self.input_tensor, self.bias, act='relu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='lrelu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='tanh')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='sigmoid')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='elu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='selu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='softplus')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor, self.bias, act='swish')\n        assert out.shape == (1, 3)\n\n        # test with different alpha\n        out = bias_act(self.input_tensor, self.bias, act='lrelu', alpha=0.1)\n        assert out.shape == (1, 3)\n\n        # test with 
different gain\n        out1 = bias_act(self.input_tensor, self.bias, act='lrelu', gain=0.2)\n        out2 = bias_act(self.input_tensor, self.bias, act='lrelu', gain=0.1)\n        assert torch.allclose(out1, out2 * 2)\n\n        # test with different clamp\n        out1 = bias_act(self.input_tensor, self.bias, act='lrelu', clamp=0.5)\n        out2 = bias_act(self.input_tensor, self.bias, act='lrelu', clamp=0.2)\n        assert out1.max() <= 0.5\n        assert out2.max() <= 0.5\n\n    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')\n    def test_bias_act_cuda(self):\n        if _USING_PARROTS:\n            gradcheck(\n                bias_act, (self.input_tensor.cuda(), self.bias.cuda()),\n                delta=1e-4,\n                pt_atol=1e-3)\n        else:\n            gradcheck(\n                bias_act, (self.input_tensor.cuda(), self.bias.cuda()),\n                eps=1e-4,\n                atol=1e-3)\n\n            gradgradcheck(\n                bias_act, (self.input_tensor.cuda(), self.bias.cuda()),\n                eps=1e-4,\n                atol=1e-3)\n\n        out = bias_act(self.input_tensor.cuda(), self.bias.cuda())\n        assert out.shape == (1, 3)\n\n        # test with different dim\n        input_tensor = torch.randn((1, 1, 3), requires_grad=True).cuda()\n        bias = torch.randn(3, requires_grad=True).cuda()\n        out = bias_act(input_tensor, bias, dim=2)\n        assert out.shape == (1, 1, 3)\n\n        # test with different act\n        out = bias_act(self.input_tensor.cuda(), self.bias.cuda(), act='relu')\n        assert out.shape == (1, 3)\n\n        out = bias_act(self.input_tensor.cuda(), self.bias.cuda(), act='lrelu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.cuda(), self.bias.cuda(), act='tanh')\n        assert out.shape == (1, 3)\n        out = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='sigmoid')\n        assert out.shape == (1, 3)\n   
     out = bias_act(self.input_tensor.cuda(), self.bias.cuda(), act='elu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.cuda(), self.bias.cuda(), act='selu')\n        assert out.shape == (1, 3)\n        out = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='softplus')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.cuda(), self.bias.cuda(), act='swish')\n        assert out.shape == (1, 3)\n\n        # test with different alpha\n        out = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='lrelu', alpha=0.1)\n        assert out.shape == (1, 3)\n\n        # test with different gain\n        out1 = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='lrelu', gain=0.2)\n        out2 = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='lrelu', gain=0.1)\n        assert torch.allclose(out1, out2 * 2)\n\n        # test with different clamp\n        out1 = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='lrelu', clamp=0.5)\n        out2 = bias_act(\n            self.input_tensor.cuda(), self.bias.cuda(), act='lrelu', clamp=0.2)\n        assert out1.max() <= 0.5\n        assert out2.max() <= 0.5\n\n    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')\n    def test_bias_act_musa(self):\n        if _USING_PARROTS:\n            gradcheck(\n                bias_act, (self.input_tensor.musa(), self.bias.musa()),\n                delta=1e-4,\n                pt_atol=1e-3)\n        else:\n            gradcheck(\n                bias_act, (self.input_tensor.musa(), self.bias.musa()),\n                eps=1e-4,\n                atol=1e-3)\n\n            gradgradcheck(\n                bias_act, (self.input_tensor.musa(), self.bias.musa()),\n                eps=1e-4,\n                atol=1e-3)\n\n        out = bias_act(self.input_tensor.musa(), self.bias.musa())\n        assert out.shape == (1, 
3)\n\n        # test with different dim\n        input_tensor = torch.randn((1, 1, 3), requires_grad=True).musa()\n        bias = torch.randn(3, requires_grad=True).musa()\n        out = bias_act(input_tensor, bias, dim=2)\n        assert out.shape == (1, 1, 3)\n\n        # test with different act\n        out = bias_act(self.input_tensor.musa(), self.bias.musa(), act='relu')\n        assert out.shape == (1, 3)\n\n        out = bias_act(self.input_tensor.musa(), self.bias.musa(), act='lrelu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.musa(), self.bias.musa(), act='tanh')\n        assert out.shape == (1, 3)\n        out = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='sigmoid')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.musa(), self.bias.musa(), act='elu')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.musa(), self.bias.musa(), act='selu')\n        assert out.shape == (1, 3)\n        out = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='softplus')\n        assert out.shape == (1, 3)\n        out = bias_act(self.input_tensor.musa(), self.bias.musa(), act='swish')\n        assert out.shape == (1, 3)\n\n        # test with different alpha\n        out = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='lrelu', alpha=0.1)\n        assert out.shape == (1, 3)\n\n        # test with different gain\n        out1 = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='lrelu', gain=0.2)\n        out2 = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='lrelu', gain=0.1)\n        assert torch.allclose(out1, out2 * 2)\n\n        # test with different clamp\n        out1 = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='lrelu', clamp=0.5)\n        out2 = bias_act(\n            self.input_tensor.musa(), self.bias.musa(), act='lrelu', clamp=0.2)\n   
     assert out1.max() <= 0.5\n        assert out2.max() <= 0.5\n\n    def test_easy_dict(self):\n        easy_dict = EasyDict(\n            func=lambda x, **_: x,\n            def_alpha=0,\n            def_gain=1,\n            cuda_idx=1,\n            ref='',\n            has_2nd_grad=False)\n        _ = easy_dict.def_alpha\n        easy_dict.def_alpha = 1\n        del easy_dict.def_alpha\n\n    def test_easy_dict_musa(self):\n        easy_dict = EasyDict(\n            func=lambda x, **_: x,\n            def_alpha=0,\n            def_gain=1,\n            musa_idx=1,\n            ref='',\n            has_2nd_grad=False)\n        _ = easy_dict.def_alpha\n        easy_dict.def_alpha = 1\n        del easy_dict.def_alpha\n"
  },
  {
    "path": "tests/test_ops/test_bilinear_grid_sample.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\n\n\nclass TestBilinearGridSample:\n\n    def _test_bilinear_grid_sample(self,\n                                   dtype=torch.float,\n                                   align_corners=False,\n                                   multiplier=1,\n                                   precision=1e-3):\n        from mmcv.ops.point_sample import bilinear_grid_sample\n\n        input = torch.rand(1, 1, 20, 20, dtype=dtype)\n        grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])\n        grid = F.affine_grid(\n            grid, (1, 1, 15, 15), align_corners=align_corners).type_as(input)\n        grid *= multiplier\n\n        out = bilinear_grid_sample(input, grid, align_corners=align_corners)\n        ref_out = F.grid_sample(input, grid, align_corners=align_corners)\n\n        assert np.allclose(out.data.detach().cpu().numpy(),\n                           ref_out.data.detach().cpu().numpy(), precision)\n\n    def test_bilinear_grid_sample(self):\n        self._test_bilinear_grid_sample(torch.double, False)\n        self._test_bilinear_grid_sample(torch.double, True)\n        self._test_bilinear_grid_sample(torch.float, False)\n        self._test_bilinear_grid_sample(torch.float, True)\n        self._test_bilinear_grid_sample(torch.float, False)\n        self._test_bilinear_grid_sample(torch.float, True, 5)\n        self._test_bilinear_grid_sample(torch.float, False, 10)\n        self._test_bilinear_grid_sample(torch.float, True, -6)\n        self._test_bilinear_grid_sample(torch.float, False, -10)\n        self._test_bilinear_grid_sample(torch.double, True, 5)\n        self._test_bilinear_grid_sample(torch.double, False, 10)\n        self._test_bilinear_grid_sample(torch.double, True, -6)\n        self._test_bilinear_grid_sample(torch.double, False, -10)\n"
  },
  {
    "path": "tests/test_ops/test_border_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\n\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_MUSA_AVAILABLE\n\n# [1,4c,h,w]\ninput_arr = [[[[1., 2., 3., 4.], [5., 6., 7., 8.], [9., 10., 11., 12.]],\n              [[6, 7, 5, 8], [2, 1, 3, 4], [12, 9, 11, 10]],\n              [[-2, -3, 2, 0], [-4, -5, 1, -1], [-1, -1, -1, -1]],\n              [[0, -1, 2, 1], [-4, -3, -2, -1], [-1, -2, -3, -4]]]]\n# [1,h*w,4]\nboxes_arr = [[[0, 0, 2, 1], [1, 0, 3, 1], [1, 0, 2, 1], [0, 0, 3, 1],\n              [0, 0, 1, 2], [0, 0, 2, 2], [1, 0, 2, 1], [1, 0, 3, 1],\n              [0, 1, 1, 2], [0, 0, 3, 2], [1, 0, 3, 2], [2, 0, 3, 2]]]\noutput_dict = {\n    # [1,c,h*w,4] for each value,\n    # the output is manually checked for its correctness\n\n    # pool_size=1\n    1: [[[[3., 6., 1., 2.], [4., 7., -1., 1.], [3., 7., 1., 2.],\n          [4., 6., -1., 1.], [2., 12., -1., -1.], [3., 12., -1., 2.],\n          [3., 7., 1., 2.], [4., 7., -1., 1.], [6., 12., -1., -2.],\n          [4., 12., -1., 1.], [4., 9., -1., 1.], [4., 11., -1., 1.]]]],\n\n    # pool_size=2\n    2: [[[[3., 6., 1., 2.], [4., 7., 1., 1.], [3., 7., 1., 2.],\n          [4., 6., -1., 1.], [2., 12., -1., -1.], [3., 12., -1., 2.],\n          [3., 7., 1., 2.], [4., 7., 1., 1.], [6., 12., -1., -2.],\n          [4., 12., -1., 1.], [4., 9., -1., 1.], [4., 11., -1., 1.]]]],\n}\ninput_grad_dict = {\n    # [1,4c,h,w] for each value\n    # the grad is manually checked for its correctness\n\n    # pool_size=1\n    1: [[[[0., 1., 4., 6.], [0., 1., 0., 0.], [0., 0., 0., 0.]],\n         [[2., 4., 0., 0.], [0., 0., 0., 0.], [4., 1., 1., 0.]],\n         [[0., 0., 0., 0.], [0., 0., 3., 3.], [0., 2., 1., 3.]],\n         [[0., 1., 4., 6.], [0., 0., 0., 0.], [0., 1., 0., 0.]]]],\n\n    # pool_size=2\n    2: [[[[0., 1., 4., 6.], [0., 1., 0., 0.], [0., 0., 0., 0.]],\n         [[2., 4., 0., 0.], [0., 0., 0., 0.], [4., 1., 1., 0.]],\n         [[0., 0., 0., 0.], [0., 0., 5., 1.], 
[0., 2., 1., 3.]],\n         [[0., 1., 4., 6.], [0., 0., 0., 0.], [0., 1., 0., 0.]]]],\n}\n\n\ndef _test_border_align_allclose(device, dtype, pool_size):\n    if not torch.cuda.is_available() and device == 'cuda':\n        pytest.skip('test requires GPU')\n    elif not IS_MUSA_AVAILABLE and device == 'musa':\n        pytest.skip('test requires GPU')\n    try:\n        from mmcv.ops import BorderAlign, border_align\n    except ModuleNotFoundError:\n        pytest.skip('BorderAlign op is not successfully compiled')\n\n    np_input = np.array(input_arr)\n    np_boxes = np.array(boxes_arr)\n    np_output = np.array(output_dict[pool_size])\n    np_grad = np.array(input_grad_dict[pool_size])\n\n    input = torch.tensor(\n        np_input, dtype=dtype, device=device, requires_grad=True)\n    boxes = torch.tensor(np_boxes, dtype=dtype, device=device)\n\n    # test for border_align\n    input_cp = copy.deepcopy(input)\n    output = border_align(input_cp, boxes, pool_size)\n    output.backward(torch.ones_like(output))\n    assert np.allclose(\n        output.data.type(dtype).cpu().numpy(), np_output, atol=1e-5)\n    assert np.allclose(\n        input_cp.grad.data.type(dtype).cpu().numpy(), np_grad, atol=1e-5)\n\n    # test for BorderAlign\n    pool_module = BorderAlign(pool_size)\n    output = pool_module(input, boxes)\n    output.backward(torch.ones_like(output))\n    assert np.allclose(\n        output.data.type(dtype).cpu().numpy(), np_output, atol=1e-5)\n    assert np.allclose(\n        input.grad.data.type(dtype).cpu().numpy(), np_grad, atol=1e-5)\n\n\n@pytest.mark.parametrize('device', ['cuda', 'musa'])\n@pytest.mark.parametrize('dtype', [\n    torch.float,\n    torch.half,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MUSA_AVAILABLE,\n            reason='MUSA does not support for 64-bit floating point')),\n])\n@pytest.mark.parametrize('pool_size', [1, 2])\ndef test_border_align(device, dtype, pool_size):\n    
_test_border_align_allclose(device, dtype, pool_size)\n"
  },
  {
    "path": "tests/test_ops/test_box_iou_quadri.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\nclass TestBoxIoUQuadri:\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_box_iou_quadri_cuda(self, device):\n        from mmcv.ops import box_iou_quadri\n        np_boxes1 = np.asarray([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0],\n                                [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0],\n                                [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]],\n                               dtype=np.float32)\n        np_boxes2 = np.asarray([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0],\n                                [2.0, 1.0, 2.0, 4.0, 4.0, 4.0, 4.0, 1.0],\n                                [7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]],\n                               dtype=np.float32)\n        np_expect_ious = np.asarray(\n            [[0.0714, 1.0000, 0.0000], [0.0000, 0.5000, 0.0000],\n             [0.0000, 0.0000, 0.5000]],\n            dtype=np.float32)\n        np_expect_ious_aligned = np.asarray([0.0714, 0.5000, 0.5000],\n                                            dtype=np.float32)\n\n        boxes1 = torch.from_numpy(np_boxes1).to(device)\n        boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n        ious = box_iou_quadri(boxes1, boxes2)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_quadri(boxes1, 
boxes2, aligned=True)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_box_iou_quadri_iof_cuda(self, device):\n        from mmcv.ops import box_iou_quadri\n        np_boxes1 = np.asarray([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0],\n                                [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0],\n                                [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0]],\n                               dtype=np.float32)\n        np_boxes2 = np.asarray([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0],\n                                [2.0, 1.0, 2.0, 4.0, 4.0, 4.0, 4.0, 1.0],\n                                [7.0, 6.0, 7.0, 8.0, 9.0, 8.0, 9.0, 6.0]],\n                               dtype=np.float32)\n        np_expect_ious = np.asarray(\n            [[0.1111, 1.0000, 0.0000], [0.0000, 1.0000, 0.0000],\n             [0.0000, 0.0000, 1.0000]],\n            dtype=np.float32)\n        np_expect_ious_aligned = np.asarray([0.1111, 1.0000, 1.0000],\n                                            dtype=np.float32)\n\n        boxes1 = torch.from_numpy(np_boxes1).to(device)\n        boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n        ious = box_iou_quadri(boxes1, boxes2, mode='iof')\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_quadri(boxes1, boxes2, mode='iof', aligned=True)\n        assert np.allclose(\n            
ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n"
  },
  {
    "path": "tests/test_ops/test_box_iou_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import box_iou_rotated\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\nclass TestBoxIoURotated:\n\n    def test_box_iou_rotated_cpu(self):\n        np_boxes1 = np.asarray(\n            [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],\n             [7.0, 7.0, 8.0, 8.0, 0.4]],\n            dtype=np.float32)\n        np_boxes2 = np.asarray(\n            [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],\n             [5.0, 5.0, 6.0, 7.0, 0.4]],\n            dtype=np.float32)\n        np_expect_ious = np.asarray(\n            [[0.3708, 0.4351, 0.0000], [0.1104, 0.4487, 0.0424],\n             [0.0000, 0.0000, 0.3622]],\n            dtype=np.float32)\n        np_expect_ious_aligned = np.asarray([0.3708, 0.4487, 0.3622],\n                                            dtype=np.float32)\n\n        boxes1 = torch.from_numpy(np_boxes1)\n        boxes2 = torch.from_numpy(np_boxes2)\n\n        # test cw angle definition\n        ious = box_iou_rotated(boxes1, boxes2)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_rotated(boxes1, boxes2, aligned=True)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n        # test ccw angle definition\n        boxes1[..., -1] *= -1\n        boxes2[..., -1] *= -1\n        ious = box_iou_rotated(boxes1, boxes2, clockwise=False)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_rotated(boxes1, boxes2, aligned=True, clockwise=False)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not 
IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_box_iou_rotated(self, device):\n        np_boxes1 = np.asarray(\n            [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],\n             [7.0, 7.0, 8.0, 8.0, 0.4]],\n            dtype=np.float32)\n        np_boxes2 = np.asarray(\n            [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],\n             [5.0, 5.0, 6.0, 7.0, 0.4]],\n            dtype=np.float32)\n        np_expect_ious = np.asarray(\n            [[0.3708, 0.4351, 0.0000], [0.1104, 0.4487, 0.0424],\n             [0.0000, 0.0000, 0.3622]],\n            dtype=np.float32)\n        np_expect_ious_aligned = np.asarray([0.3708, 0.4487, 0.3622],\n                                            dtype=np.float32)\n\n        boxes1 = torch.from_numpy(np_boxes1).to(device)\n        boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n        # test cw angle definition\n        ious = box_iou_rotated(boxes1, boxes2)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_rotated(boxes1, boxes2, aligned=True)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n        # test ccw angle definition\n        boxes1[..., -1] *= -1\n        boxes2[..., -1] *= -1\n        ious = box_iou_rotated(boxes1, boxes2, clockwise=False)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_rotated(boxes1, boxes2, aligned=True, clockwise=False)\n        
assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n    def test_box_iou_rotated_iof_cpu(self):\n        np_boxes1 = np.asarray(\n            [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],\n             [7.0, 7.0, 8.0, 8.0, 0.4]],\n            dtype=np.float32)\n        np_boxes2 = np.asarray(\n            [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],\n             [5.0, 5.0, 6.0, 7.0, 0.4]],\n            dtype=np.float32)\n        np_expect_ious = np.asarray(\n            [[0.4959, 0.5306, 0.0000], [0.1823, 0.5420, 0.1832],\n             [0.0000, 0.0000, 0.4404]],\n            dtype=np.float32)\n        np_expect_ious_aligned = np.asarray([0.4959, 0.5420, 0.4404],\n                                            dtype=np.float32)\n\n        boxes1 = torch.from_numpy(np_boxes1)\n        boxes2 = torch.from_numpy(np_boxes2)\n\n        # test cw angle definition\n        ious = box_iou_rotated(boxes1, boxes2, mode='iof')\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n        ious = box_iou_rotated(boxes1, boxes2, mode='iof', aligned=True)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n        # test ccw angle definition\n        boxes1[..., -1] *= -1\n        boxes2[..., -1] *= -1\n        ious = box_iou_rotated(boxes1, boxes2, mode='iof', clockwise=False)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n        ious = box_iou_rotated(\n            boxes1, boxes2, mode='iof', aligned=True, clockwise=False)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not 
IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n    ])\n    def test_box_iou_rotated_iof(self, device):\n        np_boxes1 = np.asarray(\n            [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],\n             [7.0, 7.0, 8.0, 8.0, 0.4]],\n            dtype=np.float32)\n        np_boxes2 = np.asarray(\n            [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],\n             [5.0, 5.0, 6.0, 7.0, 0.4]],\n            dtype=np.float32)\n        np_expect_ious = np.asarray(\n            [[0.4959, 0.5306, 0.0000], [0.1823, 0.5420, 0.1832],\n             [0.0000, 0.0000, 0.4404]],\n            dtype=np.float32)\n        np_expect_ious_aligned = np.asarray([0.4959, 0.5420, 0.4404],\n                                            dtype=np.float32)\n\n        boxes1 = torch.from_numpy(np_boxes1).to(device)\n        boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n        # test cw angle definition\n        ious = box_iou_rotated(boxes1, boxes2, mode='iof')\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_rotated(boxes1, boxes2, mode='iof', aligned=True)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n\n        # test ccw angle definition\n        boxes1[..., -1] *= -1\n        boxes2[..., -1] *= -1\n        ious = box_iou_rotated(boxes1, boxes2, mode='iof', clockwise=False)\n        assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n        ious = box_iou_rotated(\n            boxes1, boxes2, mode='iof', aligned=True, clockwise=False)\n        assert np.allclose(\n            ious.cpu().numpy(), np_expect_ious_aligned, atol=1e-4)\n"
  },
  {
    "path": "tests/test_ops/test_carafe.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\nfrom torch.autograd import gradcheck\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\n\nclass TestCarafe:\n\n    def test_carafe_naive_gradcheck(self):\n        if (not torch.cuda.is_available()) and (not IS_MUSA_AVAILABLE):\n            return\n        from mmcv.ops import CARAFENaive\n        if IS_CUDA_AVAILABLE:\n            feat = torch.randn(\n                2, 64, 3, 3, requires_grad=True, device='cuda').double()\n            mask = torch.randn(\n                2, 100, 6, 6, requires_grad=True,\n                device='cuda').sigmoid().double()\n            gradcheck(CARAFENaive(5, 4, 2), (feat, mask), atol=1e-4, eps=1e-4)\n\n    def test_carafe_gradcheck(self):\n        if (not torch.cuda.is_available()) and (not IS_MUSA_AVAILABLE):\n            return\n        from mmcv.ops import CARAFE\n        if IS_CUDA_AVAILABLE:\n            feat = torch.randn(\n                2, 64, 3, 3, requires_grad=True, device='cuda').double()\n            mask = torch.randn(\n                2, 100, 6, 6, requires_grad=True,\n                device='cuda').sigmoid().double()\n            gradcheck(CARAFE(5, 4, 2), (feat, mask), atol=1e-4, eps=1e-4)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_carafe_allclose(self, device):\n        try:\n            from mmcv.ops import CARAFE\n        except ModuleNotFoundError:\n            pytest.skip('test 
requires compilation')\n\n        np_feat = np.fromfile(\n            'tests/data/for_carafe/carafe_feat.bin', dtype=np.float32)\n        np_mask = np.fromfile(\n            'tests/data/for_carafe/carafe_mask.bin', dtype=np.float32)\n        np_output = np.fromfile(\n            'tests/data/for_carafe/carafe_output.bin', dtype=np.float32)\n        np_feat_grad = np.fromfile(\n            'tests/data/for_carafe/carafe_feat_grad.bin', dtype=np.float32)\n        np_mask_grad = np.fromfile(\n            'tests/data/for_carafe/carafe_mask_grad.bin', dtype=np.float32)\n\n        np_feat = np_feat.reshape((2, 64, 3, 3))\n        np_mask = np_mask.reshape((2, 100, 6, 6))\n        np_output = np_output.reshape((2, 64, 6, 6))\n        np_feat_grad = np_feat_grad.reshape((2, 64, 3, 3))\n        np_mask_grad = np_mask_grad.reshape((2, 100, 6, 6))\n\n        # feat = torch.tensor(\n        #     np_feat, dtype=torch.float, device=device, requires_grad=True)\n        # mask = torch.tensor(\n        #     np_mask, dtype=torch.float, device=device, requires_grad=True)\n\n        # feat = torch.tensor(\n        #     np_feat, dtype=torch.float, requires_grad=True).to(device)\n        # mask = torch.tensor(\n        #     np_mask, dtype=torch.float, requires_grad=True).to(device)\n        # feat = torch.tensor(\n        #     np_feat, dtype=torch.float).to(device)\n        # mask = torch.tensor(\n        #     np_mask, dtype=torch.float).to(device)\n        # feat_cpu = torch.from_numpy(np_feat).to(torch.float)\n        # mask_cpu = torch.from_numpy(np_mask).to(torch.float)\n\n        # if device == 'musa':\n        #     feat =feat_cpu.musa()\n        #     mask =mask_cpu.musa()\n        # else:\n        #     feat =feat_cpu.to(device)\n        #     mask =mask_cpu.to(device)\n        # feat.requires_grad = True\n        # mask.requires_grad = True\n        feat_cpu = torch.FloatTensor(np_feat)\n        mask_cpu = torch.FloatTensor(np_mask)\n        feat = feat_cpu.to(device)\n     
   mask = mask_cpu.to(device)\n        feat.requires_grad = True\n        mask.requires_grad = True\n        # pytest.set_trace()\n\n        carafe = CARAFE(5, 4, 2)\n        carafe.to(device)\n        carafe.train()\n        output = carafe(feat, mask)\n\n        output.backward(torch.ones_like(output))\n        assert np.allclose(\n            output.data.type(torch.float).cpu().numpy(), np_output, atol=1e-3)\n        assert np.allclose(\n            feat.grad.data.type(torch.float).cpu().numpy(),\n            np_feat_grad,\n            atol=1e-3)\n        assert np.allclose(\n            mask.grad.data.type(torch.float).cpu().numpy(),\n            np_mask_grad,\n            atol=1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_cc_attention.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.utils import IS_MUSA_AVAILABLE\n\n\nclass Loss(nn.Module):\n\n    def __init__(self):\n        super().__init__()\n\n    def forward(self, input, target):\n        input = input.view(-1)\n        target = target.view(-1)\n        return torch.mean(input - target)\n\n\nclass TestCrissCrossAttention:\n\n    def test_cc_attention(self):\n        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n\n        if IS_MUSA_AVAILABLE:\n            device = torch.device('musa:0')\n\n        from mmcv.ops import CrissCrossAttention\n        loss_func = Loss()\n\n        input = np.fromfile(\n            'tests/data/for_ccattention/ccattention_input.bin',\n            dtype=np.float32)\n        output = np.fromfile(\n            'tests/data/for_ccattention/ccattention_output.bin',\n            dtype=np.float32)\n        input = input.reshape((1, 32, 45, 45))\n        output = output.reshape((1, 32, 45, 45))\n        label = torch.ones((1, 32, 45, 45))\n\n        input = torch.FloatTensor(input)\n        output = torch.FloatTensor(output)\n\n        input.requires_grad = True\n\n        shape = input.shape\n        channel = shape[1]\n\n        cca = CrissCrossAttention(channel)\n        cca.to(device)\n        input = input.to(device)\n        label = label.to(device)\n        cca.train()\n        test_output = cca(input)\n        test_loss = loss_func(test_output, label)\n        test_loss.backward()\n        test_output = test_output.detach().cpu().numpy()\n        output = output.numpy()\n\n        assert np.allclose(test_output, output)\n        assert test_output.shape == shape\n"
  },
  {
    "path": "tests/test_ops/test_chamfer_distance.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import chamfer_distance\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\ndef chamfer_distance_forward_groundtruth(xyz1, xyz2, dtype):\n    bs, ns, ss = xyz1.shape\n    dist1 = np.zeros((bs, ns)).astype(torch_to_np_type(dtype))\n    dist2 = np.zeros((bs, ns)).astype(torch_to_np_type(dtype))\n    idx1 = np.zeros((bs, ns)).astype('int32')\n    idx2 = np.zeros((bs, ns)).astype('int32')\n    for b1 in range(bs):\n        for n1 in range(ns):\n            x1, y1 = xyz1[b1][n1]\n            dist1[b1][n1] = 10000000\n            for n2 in range(ns):\n                x2, y2 = xyz2[b1][n2]\n                dst = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)\n                if dist1[b1][n1] > dst:\n                    dist1[b1][n1] = dst\n                    idx1[b1][n1] = n2\n    for b1 in range(bs):\n        for n1 in range(ns):\n            x1, y1 = xyz2[b1][n1]\n            dist2[b1][n1] = 10000000\n            for n2 in range(ns):\n                x2, y2 = xyz1[b1][n2]\n                dst = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)\n                if dist2[b1][n1] > dst:\n                    dist2[b1][n1] = dst\n                    idx2[b1][n1] = n2\n    return [dist1, dist2, idx1, idx2]\n\n\ndef torch_to_np_type(dtype):\n    if dtype == torch.half:\n        return np.float16\n    elif dtype == torch.float32:\n        return np.float32\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA 
support'))\n])\n@pytest.mark.parametrize('dtype', [torch.half, torch.float32])\n@pytest.mark.parametrize('shape', [(2, 600, 2), (1, 1, 2), (7, 7, 2)])\ndef test_chamfer_distance_npu_dynamic_shape(dtype, device, shape):\n    bs = shape[0]\n    ns = shape[1]\n    xyz1 = np.random.uniform(-10.0, 10.0,\n                             (bs, ns, 2)).astype(torch_to_np_type(dtype))\n    xyz2 = np.random.uniform(-10.0, 10.0,\n                             (bs, ns, 2)).astype(torch_to_np_type(dtype))\n    xyz1_npu = torch.tensor(xyz1, dtype=dtype).to(device)\n    xyz2_npu = torch.tensor(xyz2, dtype=dtype).to(device)\n    expected_output = chamfer_distance_forward_groundtruth(xyz1, xyz2, dtype)\n    output = chamfer_distance(xyz1_npu, xyz2_npu)\n    assert np.allclose(output[0].cpu().numpy(), expected_output[0], 1e-3, 1e-4)\n    assert np.allclose(output[1].cpu().numpy(), expected_output[1], 1e-3, 1e-4)\n    assert np.allclose(output[2].cpu().numpy(), expected_output[2], 1e-3, 1e-4)\n    assert np.allclose(output[3].cpu().numpy(), expected_output[3], 1e-3, 1e-4)\n"
  },
  {
    "path": "tests/test_ops/test_contour_expand.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport torch\n\n\ndef test_contour_expand():\n    from mmcv.ops import contour_expand\n\n    np_internal_kernel_label = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                         [0, 0, 1, 1, 0, 0, 0, 0, 2, 0],\n                                         [0, 0, 1, 1, 0, 0, 0, 0, 2, 0],\n                                         [0, 0, 1, 1, 0, 0, 0, 0, 2, 0],\n                                         [0, 0, 1, 1, 0, 0, 0, 0, 2, 0],\n                                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                         [0, 0, 0, 0, 0, 0, 0, 0, 0,\n                                          0]]).astype(np.int32)\n    np_kernel_mask1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 1, 1, 1, 1, 1, 1, 1, 0],\n                                [0, 0, 1, 1, 1, 1, 1, 1, 1, 0],\n                                [0, 0, 1, 1, 1, 1, 1, 1, 1, 0],\n                                [0, 0, 1, 1, 1, 1, 1, 1, 1, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0,\n                                 0]]).astype(np.uint8)\n    np_kernel_mask2 = (np_internal_kernel_label > 0).astype(np.uint8)\n\n    np_kernel_mask = np.stack([np_kernel_mask1, np_kernel_mask2])\n    min_area = 1\n    kernel_region_num = 3\n    result = contour_expand(np_kernel_mask, np_internal_kernel_label, min_area,\n                            kernel_region_num)\n    gt = 
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 2, 2, 2, 0],\n          [0, 0, 1, 1, 1, 1, 2, 2, 2, 0], [0, 0, 1, 1, 1, 1, 2, 2, 2, 0],\n          [0, 0, 1, 1, 1, 1, 2, 2, 2, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n    assert np.allclose(result, gt)\n\n    np_kernel_mask_t = torch.from_numpy(np_kernel_mask)\n    np_internal_kernel_label_t = torch.from_numpy(np_internal_kernel_label)\n    result = contour_expand(np_kernel_mask_t, np_internal_kernel_label_t,\n                            min_area, kernel_region_num)\n    assert np.allclose(result, gt)\n"
  },
  {
    "path": "tests/test_ops/test_conv_gradfix.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import gradcheck, gradgradcheck\n\nfrom mmcv.ops import conv2d, conv_transpose2d\nfrom mmcv.utils import IS_MUSA_AVAILABLE\n\n\nclass TestCond2d:\n\n    @classmethod\n    def setup_class(cls):\n        cls.input = torch.randn((1, 3, 32, 32), requires_grad=True)\n        cls.weight = nn.Parameter(torch.randn(1, 3, 3, 3))\n\n    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')\n    def test_conv2d_cuda(self):\n        x = self.input.cuda()\n        weight = self.weight.cuda()\n        res = conv2d(x, weight, None, 1, 1)\n        assert res.shape == (1, 1, 32, 32)\n        gradcheck(conv2d, (x, weight, None, 1, 1), eps=1e-2, atol=0.1)\n        gradgradcheck(conv2d, (x, weight, None, 1, 1), eps=1e-2, atol=0.1)\n\n    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')\n    def test_conv2d_musa(self):\n        x = self.input.musa()\n        weight = self.weight.musa()\n        res = conv2d(x, weight, None, 1, 1)\n        assert res.shape == (1, 1, 32, 32)\n        gradcheck(conv2d, (x, weight, None, 1, 1), eps=1e-2, atol=0.1)\n        gradgradcheck(conv2d, (x, weight, None, 1, 1), eps=1e-2, atol=0.1)\n\n\nclass TestCond2dTansposed:\n\n    @classmethod\n    def setup_class(cls):\n        cls.input = torch.randn((1, 3, 32, 32), requires_grad=True)\n        cls.weight = nn.Parameter(torch.randn(3, 1, 3, 3))\n\n    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')\n    def test_conv2d_transposed_cuda(self):\n        x = self.input.cuda()\n        weight = self.weight.cuda()\n        res = conv_transpose2d(x, weight, None, 1, 1)\n        assert res.shape == (1, 1, 32, 32)\n        gradcheck(\n            conv_transpose2d, (x, weight, None, 1, 1), eps=1e-2, atol=1e-2)\n        gradgradcheck(\n            conv_transpose2d, (x, weight, None, 1, 1), eps=1e-2, atol=1e-2)\n\n    
@pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')\n    def test_conv2d_transposed_musa(self):\n        x = self.input.musa()\n        weight = self.weight.musa()\n        res = conv_transpose2d(x, weight, None, 1, 1)\n        assert res.shape == (1, 1, 32, 32)\n        gradcheck(\n            conv_transpose2d, (x, weight, None, 1, 1), eps=1e-2, atol=1e-2)\n        gradgradcheck(\n            conv_transpose2d, (x, weight, None, 1, 1), eps=1e-2, atol=1e-2)\n"
  },
  {
    "path": "tests/test_ops/test_convex_iou.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import convex_giou, convex_iou\nfrom mmcv.utils import IS_MUSA_AVAILABLE\n\nnp_pointsets = np.asarray([[\n    1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0, 1.0, 2.0, 3.0, 3.0,\n    2.0, 1.5, 1.5\n],\n                           [\n                               1.5, 1.5, 2.5, 2.5, 1.5, 2.5, 2.5, 1.5, 1.5,\n                               3.5, 3.5, 1.5, 2.5, 3.5, 3.5, 2.5, 2.0, 2.0\n                           ]])\n\nnp_polygons = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 1.0],\n                          [1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 1.0]])\n\nnp_expected_iou = np.asarray([[0.2857, 0.8750], [0.0588, 0.4286]])\n\nnp_expected_giou = np.asarray([0.2857, 0.3831])\n\nnp_expected_grad = np.asarray([[\n    0.0204, 0.0408, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0612,\n    -0.0408, -0.0408, 0.0816, -0.0408, -0.0816, -0.0816, -0.0408, 0.0000,\n    0.0000\n],\n                               [\n                                   -0.1848, -0.1848, 0.0000, 0.0000, 0.0000,\n                                   0.0000, 0.0000, 0.0000, -0.1076, -0.0801,\n                                   -0.0801, -0.1076, -0.0367, -0.0734, -0.0734,\n                                   -0.0367, 0.0000, 0.0000\n                               ]])\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(), reason='requires CUDA support')\ndef test_convex_iou():\n    pointsets = torch.from_numpy(np_pointsets).cuda().float()\n    polygons = torch.from_numpy(np_polygons).cuda().float()\n    expected_iou = torch.from_numpy(np_expected_iou).cuda().float()\n    assert torch.allclose(\n        convex_iou(pointsets, polygons), expected_iou, atol=1e-3)\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(), reason='requires CUDA support')\ndef test_convex_giou():\n    pointsets = torch.from_numpy(np_pointsets).cuda().float()\n    polygons = 
torch.from_numpy(np_polygons).cuda().float()\n    expected_giou = torch.from_numpy(np_expected_giou).cuda().float()\n    expected_grad = torch.from_numpy(np_expected_grad).cuda().float()\n    giou, grad = convex_giou(pointsets, polygons)\n    assert torch.allclose(giou, expected_giou, atol=1e-3)\n    assert torch.allclose(grad, expected_grad, atol=1e-3)\n\n\n@pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')\ndef test_convex_miou():\n    pointsets = torch.from_numpy(np_pointsets).musa().float()\n    polygons = torch.from_numpy(np_polygons).musa().float()\n    expected_iou = torch.from_numpy(np_expected_iou).musa().float()\n    assert torch.allclose(\n        convex_iou(pointsets, polygons), expected_iou, atol=1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_corner_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\"\"\"\nCommandLine:\n    pytest tests/test_ops/test_corner_pool.py\n\"\"\"\nimport pytest\nimport torch\n\nfrom mmcv.ops import CornerPool\n\n\ndef test_corner_pool_device_and_dtypes_cpu():\n    \"\"\"\n    CommandLine:\n        xdoctest -m tests/test_ops/test_corner_pool.py \\\n            test_corner_pool_device_and_dtypes_cpu\n    \"\"\"\n    with pytest.raises(AssertionError):\n        # pool mode must be in ['bottom', 'left', 'right', 'top']\n        pool = CornerPool('corner')\n\n    lr_tensor = torch.tensor([[[[0, 0, 0, 0, 0], [2, 1, 3, 0, 2],\n                                [5, 4, 1, 1, 6], [0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0]]]])\n    tb_tensor = torch.tensor([[[[0, 3, 1, 0, 0], [0, 1, 1, 0, 0],\n                                [0, 3, 4, 0, 0], [0, 2, 2, 0, 0],\n                                [0, 0, 2, 0, 0]]]])\n    # Left Pool\n    left_answer = torch.tensor([[[[0, 0, 0, 0, 0], [3, 3, 3, 2, 2],\n                                  [6, 6, 6, 6, 6], [0, 0, 0, 0, 0],\n                                  [0, 0, 0, 0, 0]]]])\n    pool = CornerPool('left')\n    left_tensor = pool(lr_tensor)\n    assert left_tensor.type() == lr_tensor.type()\n    assert torch.equal(left_tensor, left_answer)\n    # Right Pool\n    right_answer = torch.tensor([[[[0, 0, 0, 0, 0], [2, 2, 3, 3, 3],\n                                   [5, 5, 5, 5, 6], [0, 0, 0, 0, 0],\n                                   [0, 0, 0, 0, 0]]]])\n    pool = CornerPool('right')\n    right_tensor = pool(lr_tensor)\n    assert right_tensor.type() == lr_tensor.type()\n    assert torch.equal(right_tensor, right_answer)\n    # Top Pool\n    top_answer = torch.tensor([[[[0, 3, 4, 0, 0], [0, 3, 4, 0, 0],\n                                 [0, 3, 4, 0, 0], [0, 2, 2, 0, 0],\n                                 [0, 0, 2, 0, 0]]]])\n    pool = CornerPool('top')\n    top_tensor = pool(tb_tensor)\n    assert top_tensor.type() == tb_tensor.type()\n    
assert torch.equal(top_tensor, top_answer)\n    # Bottom Pool\n    bottom_answer = torch.tensor([[[[0, 3, 1, 0, 0], [0, 3, 1, 0, 0],\n                                    [0, 3, 4, 0, 0], [0, 3, 4, 0, 0],\n                                    [0, 3, 4, 0, 0]]]])\n    pool = CornerPool('bottom')\n    bottom_tensor = pool(tb_tensor)\n    assert bottom_tensor.type() == tb_tensor.type()\n    assert torch.equal(bottom_tensor, bottom_answer)\n"
  },
  {
    "path": "tests/test_ops/test_correlation.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import Correlation\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\n_input1 = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]]\n_input2 = [[[[1., 2., 3.], [3., 1., 2.], [8., 5., 2.]]]]\n\ngt_out_shape = (1, 1, 1, 3, 3)\n_gt_out = [[[[[1., 4., 9.], [0., 1., 4.], [24., 25., 4.]]]]]\ngt_input1_grad = [[[[1., 2., 3.], [3., 1., 2.], [8., 5., 2.]]]]\n\n\ndef assert_equal_tensor(tensor_a, tensor_b):\n\n    assert tensor_a.eq(tensor_b).all()\n\n\nclass TestCorrelation:\n\n    def _test_correlation(self, dtype=torch.float):\n\n        layer = Correlation(max_displacement=0)\n\n        if IS_CUDA_AVAILABLE:\n            input1 = torch.tensor(_input1, dtype=dtype).cuda()\n            input2 = torch.tensor(_input2, dtype=dtype).cuda()\n        elif IS_MUSA_AVAILABLE:\n            input1 = torch.tensor(_input1, dtype=dtype).musa()\n            input2 = torch.tensor(_input2, dtype=dtype).musa()\n        input1.requires_grad = True\n        input2.requires_grad = True\n        out = layer(input1, input2)\n        out.backward(torch.ones_like(out))\n\n        # `eq_cpu` is not implemented for 'Half' in torch1.5.0,\n        # so we need to make a comparison for cuda/musa tensor\n        # rather than cpu tensor\n        if IS_CUDA_AVAILABLE:\n            gt_out = torch.tensor(_gt_out, dtype=dtype).cuda()\n        elif IS_MUSA_AVAILABLE:\n            gt_out = torch.tensor(_gt_out, dtype=dtype).musa()\n        assert_equal_tensor(out, gt_out)\n        assert_equal_tensor(input1.grad.detach(), input2)\n        assert_equal_tensor(input2.grad.detach(), input1)\n\n    @pytest.mark.skipif(\n        (not torch.cuda.is_available()) and (not IS_MUSA_AVAILABLE),\n        reason='requires CUDA/MUSA support')\n    def test_correlation(self):\n        self._test_correlation(torch.float)\n        if IS_CUDA_AVAILABLE:\n            self._test_correlation(torch.double)\n       
 self._test_correlation(torch.half)\n"
  },
  {
    "path": "tests/test_ops/test_deform_conv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\nif IS_MLU_AVAILABLE:\n    torch.backends.cnnl.allow_tf32 = False\n\nif IS_MUSA_AVAILABLE:\n    try:\n        from torch_musa.core.amp import autocast\n    except ImportError:\n        pass\nelse:\n    try:\n        # If PyTorch version >= 1.6.0 and fp16 is enabled\n        # torch.cuda.amp.autocast would be imported and used\n        # we should test if our modules support it.\n        from torch.cuda.amp import autocast\n    except ImportError:\n        pass\n\ninput = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]]\noffset_weight = [[[0.1, 0.4, 0.6, 0.1]], [[0.3, 0.2, 0.1, 0.3]],\n                 [[0.5, 0.5, 0.2, 0.8]], [[0.8, 0.3, 0.9, 0.1]],\n                 [[0.3, 0.1, 0.2, 0.5]], [[0.3, 0.7, 0.5, 0.3]],\n                 [[0.6, 0.2, 0.5, 0.3]], [[0.4, 0.1, 0.8, 0.4]]]\noffset_bias = [0.7, 0.1, 0.8, 0.5, 0.6, 0.5, 0.4, 0.7]\ndeform_weight = [[[0.4, 0.2, 0.1, 0.9]]]\n\ngt_out = [[[[1.650, 0.], [0.000, 0.]]]]\ngt_x_grad = [[[[-0.666, 0.204, 0.000], [0.030, -0.416, 0.012],\n               [0.000, 0.252, 0.129]]]]\ngt_offset_weight_grad = [[[[1.44, 2.88], [0.00, 1.44]]],\n                         [[[-0.72, -1.44], [0.00, -0.72]]],\n                         [[[0.00, 0.00], [0.00, 0.00]]],\n                         [[[0.00, 0.00], [0.00, 0.00]]],\n                         [[[-0.10, -0.20], [0.00, -0.10]]],\n                         [[[-0.08, -0.16], [0.00, -0.08]]],\n                         [[[-0.54, -1.08], [0.00, -0.54]]],\n                         [[[-0.54, -1.08], [0.00, -0.54]]]]\ngt_offset_bias_grad = [1.44, -0.72, 0., 0., -0.10, -0.08, -0.54, -0.54],\ngt_deform_weight_grad = [[[[3.62, 0.], [0.40, 0.18]]]]\n\n\nclass TestDeformconv:\n\n    def _test_deformconv(self,\n       
                  dtype=torch.float,\n                         threshold=1e-3,\n                         device='cuda',\n                         batch_size=10,\n                         im2col_step=2):\n        if not torch.cuda.is_available() and device == 'cuda':\n            pytest.skip('test requires GPU')\n        if device == 'mlu':\n            from mmcv.ops import DeformConv2dPack_MLU as DeformConv2dPack\n        else:\n            from mmcv.ops import DeformConv2dPack\n        c_in = 1\n        c_out = 1\n        batch_size = 10\n        repeated_input = np.repeat(input, batch_size, axis=0)\n        repeated_gt_out = np.repeat(gt_out, batch_size, axis=0)\n        repeated_gt_x_grad = np.repeat(gt_x_grad, batch_size, axis=0)\n        x = torch.tensor(repeated_input, device=device, dtype=dtype)\n        x.requires_grad = True\n        model = DeformConv2dPack(\n            in_channels=c_in,\n            out_channels=c_out,\n            kernel_size=2,\n            stride=1,\n            padding=0,\n            im2col_step=im2col_step)\n        model.conv_offset.weight.data = torch.nn.Parameter(\n            torch.Tensor(offset_weight).reshape(8, 1, 2, 2))\n        model.conv_offset.bias.data = torch.nn.Parameter(\n            torch.Tensor(offset_bias).reshape(8))\n        model.weight.data = torch.nn.Parameter(\n            torch.Tensor(deform_weight).reshape(1, 1, 2, 2))\n        if device == 'cuda':\n            model.cuda()\n        elif device == 'mlu':\n            model.mlu()\n        elif device == 'musa':\n            model.musa()\n        model.type(dtype)\n\n        out = model(x)\n        out.backward(torch.ones_like(out))\n\n        assert np.allclose(out.data.detach().cpu().numpy(), repeated_gt_out,\n                           threshold)\n        assert np.allclose(x.grad.detach().cpu().numpy(), repeated_gt_x_grad,\n                           threshold)\n        # the batch size of the input is increased which results in\n        # a larger 
gradient so we need to divide by the batch_size\n        assert np.allclose(\n            model.conv_offset.weight.grad.detach().cpu().numpy() / batch_size,\n            gt_offset_weight_grad, threshold)\n        assert np.allclose(\n            model.conv_offset.bias.grad.detach().cpu().numpy() / batch_size,\n            gt_offset_bias_grad, threshold)\n        assert np.allclose(\n            model.weight.grad.detach().cpu().numpy() / batch_size,\n            gt_deform_weight_grad, threshold)\n\n        from mmcv.ops import DeformConv2d\n\n        # test bias\n        model = DeformConv2d(1, 1, 2, stride=1, padding=0)\n        assert not hasattr(model, 'bias')\n        # test bias=True\n        with pytest.raises(AssertionError):\n            model = DeformConv2d(1, 1, 2, stride=1, padding=0, bias=True)\n        # test in_channels % group != 0\n        with pytest.raises(AssertionError):\n            model = DeformConv2d(3, 2, 3, groups=2)\n        # test out_channels % group != 0\n        with pytest.raises(AssertionError):\n            model = DeformConv2d(3, 4, 3, groups=3)\n\n    def _test_amp_deformconv(self,\n                             input_dtype,\n                             threshold=1e-3,\n                             device='cuda',\n                             batch_size=10,\n                             im2col_step=2):\n        \"\"\"The function to test amp released on pytorch 1.6.0.\n\n        The type of input data might be torch.float or torch.half,\n        so we should test deform_conv in both cases. 
With amp, the\n        data type of model will NOT be set manually.\n\n        Args:\n            input_dtype: torch.float or torch.half.\n            threshold: the same as above function.\n        \"\"\"\n        if not torch.cuda.is_available() and device == 'cuda':\n            return\n        if device == 'mlu':\n            from mmcv.ops import DeformConv2dPack_MLU as DeformConv2dPack\n        else:\n            from mmcv.ops import DeformConv2dPack\n        c_in = 1\n        c_out = 1\n        repeated_input = np.repeat(input, batch_size, axis=0)\n        repeated_gt_out = np.repeat(gt_out, batch_size, axis=0)\n        repeated_gt_x_grad = np.repeat(gt_x_grad, batch_size, axis=0)\n        x = torch.Tensor(repeated_input).to(device).type(input_dtype)\n        x.requires_grad = True\n        model = DeformConv2dPack(\n            in_channels=c_in,\n            out_channels=c_out,\n            kernel_size=2,\n            stride=1,\n            padding=0,\n            im2col_step=im2col_step)\n        model.conv_offset.weight.data = torch.nn.Parameter(\n            torch.Tensor(offset_weight).reshape(8, 1, 2, 2))\n        model.conv_offset.bias.data = torch.nn.Parameter(\n            torch.Tensor(offset_bias).reshape(8))\n        model.weight.data = torch.nn.Parameter(\n            torch.Tensor(deform_weight).reshape(1, 1, 2, 2))\n        if device == 'cuda':\n            model.cuda()\n        elif device == 'mlu':\n            model.mlu()\n        elif device == 'musa':\n            model.musa()\n\n        out = model(x)\n        out.backward(torch.ones_like(out))\n\n        assert np.allclose(out.data.detach().cpu().numpy(), repeated_gt_out,\n                           threshold)\n        assert np.allclose(x.grad.detach().cpu().numpy(), repeated_gt_x_grad,\n                           threshold)\n        assert np.allclose(\n            model.conv_offset.weight.grad.detach().cpu().numpy() / batch_size,\n            gt_offset_weight_grad, threshold)\n        
assert np.allclose(\n            model.conv_offset.bias.grad.detach().cpu().numpy() / batch_size,\n            gt_offset_bias_grad, threshold)\n        assert np.allclose(\n            model.weight.grad.detach().cpu().numpy() / batch_size,\n            gt_deform_weight_grad, threshold)\n\n        from mmcv.ops import DeformConv2d\n\n        # test bias\n        model = DeformConv2d(1, 1, 2, stride=1, padding=0)\n        assert not hasattr(model, 'bias')\n        # test bias=True\n        with pytest.raises(AssertionError):\n            model = DeformConv2d(1, 1, 2, stride=1, padding=0, bias=True)\n        # test in_channels % group != 0\n        with pytest.raises(AssertionError):\n            model = DeformConv2d(3, 2, 3, groups=2)\n        # test out_channels % group != 0\n        with pytest.raises(AssertionError):\n            model = DeformConv2d(3, 4, 3, groups=3)\n\n    @pytest.mark.parametrize(\n        'device, threshold',\n        [('cpu', 1e-1),\n         pytest.param(\n             'cuda',\n             1e-3,\n             marks=pytest.mark.skipif(\n                 not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n         pytest.param(\n             'mlu',\n             1e-3,\n             marks=pytest.mark.skipif(\n                 not IS_MLU_AVAILABLE, reason='requires MLU support')),\n         pytest.param(\n             'musa',\n             1e-3,\n             marks=pytest.mark.skipif(\n                 not IS_MUSA_AVAILABLE, reason='requires MUSA support'))])\n    def test_deformconv_float(self, device, threshold):\n        self._test_deformconv(torch.float, device=device, threshold=threshold)\n        # test batch_size < im2col_step\n        self._test_deformconv(\n            torch.float, batch_size=1, im2col_step=2, device=device)\n        # test batch_size % im2col_step != 0\n        with pytest.raises(\n                AssertionError,\n                match='batch size must be divisible by im2col_step'):\n            
self._test_deformconv(\n                torch.float, batch_size=10, im2col_step=3, device=device)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    ])\n    def test_deformconv_double(self, device):\n        self._test_deformconv(torch.double, device=device)\n\n    @pytest.mark.parametrize('device, threshold', [\n        pytest.param(\n            'cuda',\n            1e-1,\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            1e-1,\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            1e-1,\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_deformconv_half(self, device, threshold):\n        self._test_deformconv(torch.half, device=device, threshold=threshold)\n        # test amp when torch version >= '1.6.0', the type of\n        # input data for deformconv might be torch.float or torch.half\n        if (TORCH_VERSION != 'parrots'\n                and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):\n            with autocast(enabled=True):\n                self._test_amp_deformconv(\n                    torch.float, device=device, threshold=threshold)\n                self._test_amp_deformconv(\n                    torch.half, device=device, threshold=threshold)\n"
  },
  {
    "path": "tests/test_ops/test_deform_roi_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\n\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n    _USING_PARROTS = False\n\ncur_dir = os.path.dirname(os.path.abspath(__file__))\n\ninputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,\n                                               1.]]]], [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],\n              [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]\noutputs = [([[[[1, 1.25], [1.5, 1.75]]]], [[[[3.0625, 0.4375],\n                                             [0.4375, 0.0625]]]]),\n           ([[[[1., 1.25], [1.5, 1.75]], [[4, 3.75],\n                                          [3.5, 3.25]]]], [[[[3.0625, 0.4375],\n                                                             [0.4375, 0.0625]],\n                                                            [[3.0625, 0.4375],\n                                                             [0.4375,\n                                                              0.0625]]]]),\n           ([[[[1.9375, 4.75],\n               [7.5625,\n                10.375]]]], [[[[0.47265625, 0.4296875, 0.4296875, 0.04296875],\n                               [0.4296875, 0.390625, 0.390625, 0.0390625],\n                               [0.4296875, 0.390625, 0.390625, 0.0390625],\n                               [0.04296875, 0.0390625, 0.0390625,\n                                0.00390625]]]])]\n\n\nclass TestDeformRoIPool:\n\n    def test_deform_roi_pool_gradcheck(self):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import DeformRoIPoolPack\n        pool_h = 
2\n        pool_w = 2\n        spatial_scale = 1.0\n        sampling_ratio = 2\n\n        for case in inputs:\n            np_input = np.array(case[0])\n            np_rois = np.array(case[1])\n\n            x = torch.tensor(\n                np_input, device='cuda', dtype=torch.float, requires_grad=True)\n            rois = torch.tensor(np_rois, device='cuda', dtype=torch.float)\n            output_c = x.size(1)\n\n            droipool = DeformRoIPoolPack((pool_h, pool_w),\n                                         output_c,\n                                         spatial_scale=spatial_scale,\n                                         sampling_ratio=sampling_ratio).cuda()\n\n            if _USING_PARROTS:\n                gradcheck(droipool, (x, rois), no_grads=[rois])\n            else:\n                gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)\n\n    def test_modulated_deform_roi_pool_gradcheck(self):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import ModulatedDeformRoIPoolPack\n        pool_h = 2\n        pool_w = 2\n        spatial_scale = 1.0\n        sampling_ratio = 2\n\n        for case in inputs:\n            np_input = np.array(case[0])\n            np_rois = np.array(case[1])\n\n            x = torch.tensor(\n                np_input, device='cuda', dtype=torch.float, requires_grad=True)\n            rois = torch.tensor(np_rois, device='cuda', dtype=torch.float)\n            output_c = x.size(1)\n\n            droipool = ModulatedDeformRoIPoolPack(\n                (pool_h, pool_w),\n                output_c,\n                spatial_scale=spatial_scale,\n                sampling_ratio=sampling_ratio).cuda()\n\n            if _USING_PARROTS:\n                gradcheck(droipool, (x, rois), no_grads=[rois])\n            else:\n                gradcheck(droipool, (x, rois), eps=1e-2, atol=1e-2)\n\n    def _test_deform_roi_pool_allclose(self, device, dtype=torch.float):\n        from mmcv.ops import 
DeformRoIPoolPack\n        pool_h = 2\n        pool_w = 2\n        spatial_scale = 1.0\n        sampling_ratio = 2\n\n        for case, output in zip(inputs, outputs):\n            np_input = np.array(case[0])\n            np_rois = np.array(case[1])\n            np_output = np.array(output[0])\n            np_grad = np.array(output[1])\n\n            x = torch.tensor(\n                np_input, device=device, dtype=torch.float, requires_grad=True)\n            rois = torch.tensor(np_rois, device=device, dtype=torch.float)\n            output_c = x.size(1)\n            droipool = DeformRoIPoolPack(\n                (pool_h, pool_w),\n                output_c,\n                spatial_scale=spatial_scale,\n                sampling_ratio=sampling_ratio).to(device)\n\n            output = droipool(x, rois)\n            output.backward(torch.ones_like(output))\n            assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)\n            assert np.allclose(x.grad.data.cpu().numpy(), np_grad, 1e-3)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n    ])\n    @pytest.mark.parametrize('dtype', [\n        torch.float,\n        pytest.param(\n            torch.double,\n            marks=pytest.mark.skipif(\n                IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,\n                reason='MLU, MUSA does not support for 64-bit floating point'),\n        ), 
torch.half\n    ])\n    def test_deform_roi_pool_allclose(self, device, dtype):\n        self._test_deform_roi_pool_allclose(device, dtype)\n"
  },
  {
    "path": "tests/test_ops/test_diff_iou_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import diff_iou_rotated_2d, diff_iou_rotated_3d\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\nif IS_MLU_AVAILABLE:\n    torch.backends.mlu.matmul.allow_tf32 = False\n\n\n# TODO @MTAI there are some bugs for musa!\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_diff_iou_rotated_2d(device):\n    np_boxes1 = np.asarray([[[0.5, 0.5, 1., 1., .0], [0.5, 0.5, 1., 1., .0],\n                             [0.5, 0.5, 1., 1., .0], [0.5, 0.5, 1., 1., .0],\n                             [0.5, 0.5, 1., 1., .0]]],\n                           dtype=np.float32)\n    np_boxes2 = np.asarray(\n        [[[0.5, 0.5, 1., 1., .0], [0.5, 0.5, 1., 1., np.pi / 2],\n          [0.5, 0.5, 1., 1., np.pi / 4], [1., 1., 1., 1., .0],\n          [1.5, 1.5, 1., 1., .0]]],\n        dtype=np.float32)\n\n    boxes1 = torch.from_numpy(np_boxes1).to(device)\n    boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n    np_expect_ious = np.asarray([[1., 1., .7071, 1 / 7, .0]])\n    ious = diff_iou_rotated_2d(boxes1, boxes2)\n    assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n       
 'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_diff_iou_rotated_3d(device):\n    np_boxes1 = np.asarray(\n        [[[.5, .5, .5, 1., 1., 1., .0], [.5, .5, .5, 1., 1., 1., .0],\n          [.5, .5, .5, 1., 1., 1., .0], [.5, .5, .5, 1., 1., 1., .0],\n          [.5, .5, .5, 1., 1., 1., .0]]],\n        dtype=np.float32)\n    np_boxes2 = np.asarray(\n        [[[.5, .5, .5, 1., 1., 1., .0], [.5, .5, .5, 1., 1., 2., np.pi / 2],\n          [.5, .5, .5, 1., 1., 1., np.pi / 4], [1., 1., 1., 1., 1., 1., .0],\n          [-1.5, -1.5, -1.5, 2.5, 2.5, 2.5, .0]]],\n        dtype=np.float32)\n\n    boxes1 = torch.from_numpy(np_boxes1).to(device)\n    boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n    np_expect_ious = np.asarray([[1., .5, .7071, 1 / 15, .0]])\n    ious = diff_iou_rotated_3d(boxes1, boxes2)\n    assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n"
  },
  {
    "path": "tests/test_ops/test_filtered_lrelu.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch\n\nfrom mmcv.ops import filtered_lrelu\nfrom mmcv.utils import IS_MUSA_AVAILABLE\n\n\nclass TestFilteredLrelu:\n\n    @classmethod\n    def setup_class(cls):\n        cls.input_tensor = torch.randn((1, 3, 16, 16), requires_grad=True)\n        cls.bias = torch.randn(3, requires_grad=True)\n        cls.filter_up = torch.randn((2, 2))\n        cls.filter_down = torch.randn((2, 2))\n\n    def test_filtered_lrelu_cpu(self):\n        out = filtered_lrelu(self.input_tensor, bias=self.bias)\n        assert out.shape == (1, 3, 16, 16)\n\n        out = filtered_lrelu(\n            self.input_tensor,\n            bias=self.bias,\n            filter_up=self.filter_up,\n            filter_down=self.filter_down,\n            up=2,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different filter_up\n        filter_up = torch.randn((4, 4))\n        out = filtered_lrelu(\n            self.input_tensor,\n            bias=self.bias,\n            filter_up=filter_up,\n            filter_down=self.filter_down,\n            up=2,\n            down=2,\n            padding=2,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different filter_down\n        filter_down = torch.randn((4, 4))\n        out = filtered_lrelu(\n            self.input_tensor,\n            bias=self.bias,\n            filter_up=self.filter_up,\n            filter_down=filter_down,\n            up=2,\n            down=2,\n            padding=2,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different b\n        input_tensor = torch.randn((1, 4, 16, 16), requires_grad=True)\n        bias = torch.randn(4, requires_grad=True)\n        out = 
filtered_lrelu(\n            input_tensor,\n            bias=bias,\n            filter_up=self.filter_up,\n            filter_down=self.filter_down,\n            up=2,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 4, 16, 16)\n\n        # test with different up\n        out = filtered_lrelu(\n            self.input_tensor,\n            bias=self.bias,\n            filter_up=self.filter_up,\n            filter_down=self.filter_down,\n            up=4,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 32, 32)\n\n        # test with different down\n        out = filtered_lrelu(\n            self.input_tensor,\n            bias=self.bias,\n            filter_up=self.filter_up,\n            filter_down=self.filter_down,\n            up=2,\n            down=4,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 8, 8)\n\n        # test with different gain\n        out1 = filtered_lrelu(self.input_tensor, bias=self.bias, gain=0.2)\n        out2 = filtered_lrelu(self.input_tensor, bias=self.bias, gain=0.1)\n        assert torch.allclose(out1, 2 * out2)\n\n        # test with different slope\n        out = filtered_lrelu(self.input_tensor, bias=self.bias, slope=0.2)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different clamp\n        out1 = filtered_lrelu(self.input_tensor, bias=self.bias, clamp=0.2)\n        out2 = filtered_lrelu(self.input_tensor, bias=self.bias, clamp=0.1)\n        assert out1.max() <= 0.2\n        assert out2.max() <= 0.1\n\n        # test with different flip_filter\n        out1 = filtered_lrelu(\n            self.input_tensor, bias=self.bias, flip_filter=True)\n        assert out.shape == (1, 3, 16, 16)\n\n    @pytest.mark.skipif(\n        not torch.cuda.is_available() or is_rocm_pytorch()\n        or digit_version(torch.version.cuda) < digit_version('10.2'),\n        reason='requires 
cuda>=10.2')\n    def test_filtered_lrelu_cuda(self):\n        out = filtered_lrelu(self.input_tensor.cuda(), bias=self.bias.cuda())\n        assert out.shape == (1, 3, 16, 16)\n\n        out = filtered_lrelu(\n            self.input_tensor.cuda(),\n            bias=self.bias.cuda(),\n            filter_up=self.filter_up.cuda(),\n            filter_down=self.filter_down.cuda(),\n            up=2,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different filter_up\n        filter_up = torch.randn((4, 4))\n        out = filtered_lrelu(\n            self.input_tensor.cuda(),\n            bias=self.bias.cuda(),\n            filter_up=filter_up.cuda(),\n            filter_down=self.filter_down.cuda(),\n            up=2,\n            down=2,\n            padding=2,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different filter_down\n        filter_down = torch.randn((4, 4))\n        out = filtered_lrelu(\n            self.input_tensor.cuda(),\n            bias=self.bias.cuda(),\n            filter_up=self.filter_up.cuda(),\n            filter_down=filter_down.cuda(),\n            up=2,\n            down=2,\n            padding=2,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different b\n        input_tensor = torch.randn((1, 4, 16, 16), requires_grad=True)\n        bias = torch.randn(4, requires_grad=True)\n        out = filtered_lrelu(\n            input_tensor.cuda(),\n            bias=bias.cuda(),\n            filter_up=self.filter_up.cuda(),\n            filter_down=self.filter_down.cuda(),\n            up=2,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 4, 16, 16)\n\n        # test with different up\n        out = filtered_lrelu(\n            self.input_tensor.cuda(),\n            bias=self.bias.cuda(),\n            
filter_up=self.filter_up.cuda(),\n            filter_down=self.filter_down.cuda(),\n            up=4,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 32, 32)\n\n        # test with different down\n        out = filtered_lrelu(\n            self.input_tensor.cuda(),\n            bias=self.bias.cuda(),\n            filter_up=self.filter_up.cuda(),\n            filter_down=self.filter_down.cuda(),\n            up=2,\n            down=4,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 8, 8)\n\n        # test with different gain\n        out1 = filtered_lrelu(\n            self.input_tensor.cuda(), bias=self.bias.cuda(), gain=0.2)\n        out2 = filtered_lrelu(\n            self.input_tensor.cuda(), bias=self.bias.cuda(), gain=0.1)\n        assert torch.allclose(out1, 2 * out2)\n\n        # test with different slope\n        out = filtered_lrelu(\n            self.input_tensor.cuda(), bias=self.bias.cuda(), slope=0.2)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different clamp\n        out1 = filtered_lrelu(\n            self.input_tensor.cuda(), bias=self.bias.cuda(), clamp=0.2)\n        out2 = filtered_lrelu(\n            self.input_tensor.cuda(), bias=self.bias.cuda(), clamp=0.1)\n        assert out1.max() <= 0.2\n        assert out2.max() <= 0.1\n\n        # test with different flip_filter\n        out1 = filtered_lrelu(\n            self.input_tensor.cuda(), bias=self.bias.cuda(), flip_filter=True)\n        assert out.shape == (1, 3, 16, 16)\n\n    @pytest.mark.skipif(not IS_MUSA_AVAILABLE, reason='requires musa')\n    def test_filtered_lrelu_musa(self):\n        out = filtered_lrelu(self.input_tensor.musa(), bias=self.bias.musa())\n        assert out.shape == (1, 3, 16, 16)\n\n        out = filtered_lrelu(\n            self.input_tensor.musa(),\n            bias=self.bias.musa(),\n            filter_up=self.filter_up.musa(),\n            
filter_down=self.filter_down.musa(),\n            up=2,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different filter_up\n        filter_up = torch.randn((4, 4))\n        out = filtered_lrelu(\n            self.input_tensor.musa(),\n            bias=self.bias.musa(),\n            filter_up=filter_up.musa(),\n            filter_down=self.filter_down.musa(),\n            up=2,\n            down=2,\n            padding=2,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different filter_down\n        filter_down = torch.randn((4, 4))\n        out = filtered_lrelu(\n            self.input_tensor.musa(),\n            bias=self.bias.musa(),\n            filter_up=self.filter_up.musa(),\n            filter_down=filter_down.musa(),\n            up=2,\n            down=2,\n            padding=2,\n            clamp=0.5)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different b\n        input_tensor = torch.randn((1, 4, 16, 16), requires_grad=True)\n        bias = torch.randn(4, requires_grad=True)\n        out = filtered_lrelu(\n            input_tensor.musa(),\n            bias=bias.musa(),\n            filter_up=self.filter_up.musa(),\n            filter_down=self.filter_down.musa(),\n            up=2,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 4, 16, 16)\n\n        # test with different up\n        out = filtered_lrelu(\n            self.input_tensor.musa(),\n            bias=self.bias.musa(),\n            filter_up=self.filter_up.musa(),\n            filter_down=self.filter_down.musa(),\n            up=4,\n            down=2,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 32, 32)\n\n        # test with different down\n        out = filtered_lrelu(\n            self.input_tensor.musa(),\n            bias=self.bias.musa(),\n    
        filter_up=self.filter_up.musa(),\n            filter_down=self.filter_down.musa(),\n            up=2,\n            down=4,\n            padding=1,\n            clamp=0.5)\n        assert out.shape == (1, 3, 8, 8)\n\n        # test with different gain\n        out1 = filtered_lrelu(\n            self.input_tensor.musa(), bias=self.bias.musa(), gain=0.2)\n        out2 = filtered_lrelu(\n            self.input_tensor.musa(), bias=self.bias.musa(), gain=0.1)\n        assert torch.allclose(out1, 2 * out2)\n\n        # test with different slope\n        out = filtered_lrelu(\n            self.input_tensor.musa(), bias=self.bias.musa(), slope=0.2)\n        assert out.shape == (1, 3, 16, 16)\n\n        # test with different clamp\n        out1 = filtered_lrelu(\n            self.input_tensor.musa(), bias=self.bias.musa(), clamp=0.2)\n        out2 = filtered_lrelu(\n            self.input_tensor.musa(), bias=self.bias.musa(), clamp=0.1)\n        assert out1.max() <= 0.2\n        assert out2.max() <= 0.1\n\n        # test with different flip_filter\n        out1 = filtered_lrelu(\n            self.input_tensor.musa(), bias=self.bias.musa(), flip_filter=True)\n        assert out.shape == (1, 3, 16, 16)\n"
  },
  {
    "path": "tests/test_ops/test_focal_loss.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n    _USING_PARROTS = False\n\n# torch.set_printoptions(precision=8, threshold=100)\n\ninputs = [\n    ([[1., 0], [0, 1.]], [0, 1]),\n    ([[1., 0, -1.], [0, 1., 2.]], [2, 1]),\n    ([[1e-6, 2e-6, 3e-6], [4e-6, 5e-5, 6e-4], [7e-3, 8e-2, 9e-1]], [1, 2, 0]),\n]\n\nsoftmax_outputs = [(0.00566451, [[-0.00657264, 0.00657264],\n                                 [0.00657264, -0.00657264]]),\n                   (0.34956908, [[0.10165970, 0.03739851, -0.13905823],\n                                 [0.01227554, -0.10298023, 0.09070466]]),\n                   (0.15754992, [[0.02590877, -0.05181759, 0.02590882],\n                                 [0.02589641, 0.02589760, -0.05179400],\n                                 [-0.07307514, 0.02234372, 0.05073142]])]\n\nsigmoid_outputs = [(0.13562961, [[-0.00657264, 0.11185755],\n                                 [0.11185755, -0.00657264]]),\n                   (1.10251057, [[0.28808805, 0.11185755, -0.09602935],\n                                 [0.11185755, -0.00657264, 0.40376765]]),\n                   (0.42287254, [[0.07457182, -0.02485716, 0.07457201],\n                                 [0.07457211, 0.07457669, -0.02483728],\n                                 [-0.02462499, 0.08277918, 0.18050370]])]\n\n\nclass Testfocalloss:\n\n    def _test_softmax(self, dtype=torch.float):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import softmax_focal_loss\n        alpha = 0.25\n        gamma = 2.0\n        for case, output in zip(inputs, softmax_outputs):\n            np_x = np.array(case[0])\n            np_y = np.array(case[1])\n     
       np_x_grad = np.array(output[1])\n\n            x = torch.from_numpy(np_x).cuda().type(dtype)\n            x.requires_grad_()\n            y = torch.from_numpy(np_y).cuda().long()\n\n            loss = softmax_focal_loss(x, y, gamma, alpha, None, 'mean')\n            loss.backward()\n\n            assert np.allclose(loss.data.cpu().numpy(), output[0], 1e-2)\n            assert np.allclose(x.grad.data.cpu(), np_x_grad, 1e-2)\n\n    def _test_sigmoid(self, device, dtype=torch.float):\n        from mmcv.ops import sigmoid_focal_loss\n        alpha = 0.25\n        gamma = 2.0\n        for case, output in zip(inputs, sigmoid_outputs):\n            np_x = np.array(case[0])\n            np_y = np.array(case[1])\n            np_x_grad = np.array(output[1])\n\n            x = torch.from_numpy(np_x).to(device).type(dtype)\n            x.requires_grad_()\n            y = torch.from_numpy(np_y).to(device).long()\n\n            loss = sigmoid_focal_loss(x, y, gamma, alpha, None, 'mean')\n            loss.backward()\n\n            assert np.allclose(loss.data.cpu().numpy(), output[0], 1e-2)\n            assert np.allclose(x.grad.data.cpu(), np_x_grad, 1e-2)\n\n    def _test_grad_softmax(self, dtype=torch.float):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import SoftmaxFocalLoss\n        alpha = 0.25\n        gamma = 2.0\n        for case in inputs:\n            np_x = np.array(case[0])\n            np_y = np.array(case[1])\n\n            x = torch.from_numpy(np_x).cuda().type(dtype)\n            x.requires_grad_()\n            y = torch.from_numpy(np_y).cuda().long()\n\n            floss = SoftmaxFocalLoss(gamma, alpha)\n            if _USING_PARROTS:\n                # gradcheck(floss, (x, y),\n                #           no_grads=[y])\n                pass\n            else:\n                gradcheck(floss, (x, y), eps=1e-2, atol=1e-2)\n\n    def _test_grad_sigmoid(self, dtype=torch.float):\n        if not 
torch.cuda.is_available():\n            return\n        from mmcv.ops import SigmoidFocalLoss\n        alpha = 0.25\n        gamma = 2.0\n        for case in inputs:\n            np_x = np.array(case[0])\n            np_y = np.array(case[1])\n\n            x = torch.from_numpy(np_x).cuda().type(dtype)\n            x.requires_grad_()\n            y = torch.from_numpy(np_y).cuda().long()\n\n            floss = SigmoidFocalLoss(gamma, alpha)\n            if _USING_PARROTS:\n                # gradcheck(floss, (x, y),\n                #           no_grads=[y])\n                pass\n            else:\n                gradcheck(floss, (x, y), eps=1e-2, atol=1e-2)\n\n    def test_softmax_float(self):\n        self._test_softmax(dtype=torch.float)\n\n    def test_softmax_half(self):\n        self._test_softmax(dtype=torch.half)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_sigmoid_float(self, device):\n        self._test_sigmoid(device=device, dtype=torch.float)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n           
 'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_sigmoid_half(self, device):\n        self._test_sigmoid(device, dtype=torch.half)\n\n    def test_grad_softmax_float(self):\n        self._test_grad_softmax(dtype=torch.float)\n\n    def test_grad_sigmoid_float(self):\n        self._test_grad_sigmoid(dtype=torch.float)\n"
  },
  {
    "path": "tests/test_ops/test_furthest_point_sample.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import furthest_point_sample, furthest_point_sample_with_dist\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_fps(device):\n    xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],\n                         [-0.8070, 2.4137,\n                          -0.5845], [-1.0001, 2.1982, -0.5859],\n                         [0.3841, 1.8983, -0.7431]],\n                        [[-1.0696, 3.0758,\n                          -0.1899], [-0.2559, 3.5521, -0.1402],\n                         [0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],\n                         [-0.0518, 3.7251, -0.3950]]]).to(device)\n\n    idx = furthest_point_sample(xyz, 3)\n    expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).to(device)\n    assert torch.all(idx == expected_idx)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_fps_with_dist(device):\n    xyz = torch.tensor([[[-0.2748, 1.0020, -1.1674], [0.1015, 1.3952, -1.2681],\n                         
[-0.8070, 2.4137,\n                          -0.5845], [-1.0001, 2.1982, -0.5859],\n                         [0.3841, 1.8983, -0.7431]],\n                        [[-1.0696, 3.0758,\n                          -0.1899], [-0.2559, 3.5521, -0.1402],\n                         [0.8164, 4.0081, -0.1839], [-1.1000, 3.0213, -0.8205],\n                         [-0.0518, 3.7251, -0.3950]]]).to(device)\n\n    expected_idx = torch.tensor([[0, 2, 4], [0, 2, 1]]).to(device)\n    xyz_square_dist = ((xyz.unsqueeze(dim=1) -\n                        xyz.unsqueeze(dim=2))**2).sum(-1)\n    idx = furthest_point_sample_with_dist(xyz_square_dist, 3)\n    assert torch.all(idx == expected_idx)\n\n    import numpy as np\n    fps_idx = np.load('tests/data/for_3d_ops/fps_idx.npy')\n    features_for_fps_distance = np.load(\n        'tests/data/for_3d_ops/features_for_fps_distance.npy')\n    expected_idx = torch.from_numpy(fps_idx).to(device)\n    features_for_fps_distance = torch.from_numpy(features_for_fps_distance).to(\n        device)\n\n    idx = furthest_point_sample_with_dist(features_for_fps_distance, 16)\n    assert torch.all(idx == expected_idx)\n"
  },
  {
    "path": "tests/test_ops/test_fused_bias_leakyrelu.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck, gradgradcheck\n    _USING_PARROTS = False\n\n\nclass TestFusedBiasLeakyReLU:\n\n    @classmethod\n    def setup_class(cls):\n        if not IS_CUDA_AVAILABLE and not IS_NPU_AVAILABLE \\\n           and not IS_MUSA_AVAILABLE:\n            return\n        if IS_CUDA_AVAILABLE:\n            cls.input_tensor = torch.randn((2, 2, 2, 2),\n                                           requires_grad=True).cuda()\n            cls.bias = torch.zeros(2, requires_grad=True).cuda()\n        elif IS_NPU_AVAILABLE:\n            cls.input_tensor = torch.randn((2, 2, 2, 2),\n                                           requires_grad=True).npu()\n            cls.bias = torch.zeros(2, requires_grad=True).npu()\n        elif IS_MUSA_AVAILABLE:\n            cls.input_tensor = torch.randn((2, 2, 2, 2),\n                                           requires_grad=True).musa()\n            cls.bias = torch.zeros(2, requires_grad=True).musa()\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_gradient(self, device):\n\n        from mmcv.ops import FusedBiasLeakyReLU\n        if _USING_PARROTS:\n            if IS_CUDA_AVAILABLE:\n                gradcheck(\n                    FusedBiasLeakyReLU(2).cuda(),\n            
        self.input_tensor,\n                    delta=1e-4,\n                    pt_atol=1e-3)\n        else:\n            gradcheck(\n                FusedBiasLeakyReLU(2).to(device),\n                self.input_tensor,\n                eps=1e-4,\n                atol=1e-3)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_gradgradient(self, device):\n\n        from mmcv.ops import FusedBiasLeakyReLU\n        gradgradcheck(\n            FusedBiasLeakyReLU(2).to(device),\n            self.input_tensor,\n            eps=1e-4,\n            atol=1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_gather_points.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import gather_points\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\nclass TestGatherPoints:\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_gather_points_all_close(self, device):\n        features = torch.tensor(\n            [[[\n                -1.6095, -0.1029, -0.8876, -1.2447, -2.4031, 0.3708, -1.1586,\n                -1.4967, -0.4800, 0.2252\n            ],\n              [\n                  1.9138, 3.4979, 1.6854, 1.5631, 3.6776, 3.1154, 2.1705,\n                  2.5221, 2.0411, 3.1446\n              ],\n              [\n                  -1.4173, 0.3073, -1.4339, -1.4340, -1.2770, -0.2867, -1.4162,\n                  -1.4044, -1.4245, -1.4074\n              ]],\n             [[\n                 0.2160, 0.0842, 0.3661, -0.2749, -0.4909, -0.6066, -0.8773,\n                 -0.0745, -0.9496, 0.1434\n             ],\n              [\n                  1.3644, 1.8087, 1.6855, 1.9563, 1.2746, 1.9662, 0.9566,\n                  1.8778, 1.1437, 1.3639\n              ],\n              [\n                  -0.7172, 0.1692, 0.2241, 0.0721, -0.7540, 0.0462, -0.6227,\n                  0.3223, -0.6944, -0.5294\n              ]]],\n            dtype=torch.float,\n            device=device)\n        idx = torch.tensor([[0, 1, 4, 0, 0, 0], [0, 5, 6, 0, 0, 0]],\n                           dtype=torch.int32,\n                           
device=device)\n        output = gather_points(features, idx)\n        expected_output = torch.tensor(\n            [[[-1.6095, -0.1029, -2.4031, -1.6095, -1.6095, -1.6095],\n              [1.9138, 3.4979, 3.6776, 1.9138, 1.9138, 1.9138],\n              [-1.4173, 0.3073, -1.2770, -1.4173, -1.4173, -1.4173]],\n             [[0.2160, -0.6066, -0.8773, 0.2160, 0.2160, 0.2160],\n              [1.3644, 1.9662, 0.9566, 1.3644, 1.3644, 1.3644],\n              [-0.7172, 0.0462, -0.6227, -0.7172, -0.7172, -0.7172]]],\n            dtype=torch.float,\n            device=device)\n\n        assert torch.allclose(output, expected_output)\n\n        # test fp16\n        output_half = gather_points(features.half(), idx)\n        assert torch.allclose(output_half, expected_output.half())\n"
  },
  {
    "path": "tests/test_ops/test_group_points.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import grouping_operation\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\n@pytest.mark.parametrize('dtype', [\n    torch.half,\n    torch.float,\n    pytest.param(\n        torch.double,\n        marks=[\n            pytest.mark.skipif(\n                IS_MUSA_AVAILABLE,\n                reason='MUSA does not support for 64-bit floating point')\n        ]),\n])\ndef test_grouping_points(dtype, device):\n    idx = torch.tensor([[[0, 0, 0], [3, 3, 3], [8, 8, 8], [0, 0, 0], [0, 0, 0],\n                         [0, 0, 0]],\n                        [[0, 0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0], [0, 0, 0],\n                         [0, 0, 0]]]).int().to(device)\n    features = torch.tensor([[[\n        0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,\n        0.9268, 0.8414\n    ],\n                              [\n                                  5.4247, 1.5113, 2.3944, 1.4740, 5.0300,\n                                  5.1030, 1.9360, 2.1939, 2.1581, 3.4666\n                              ],\n                              [\n                                  -1.6266, -1.0281, -1.0393, -1.6931, -1.3982,\n                                  -0.5732, -1.0830, -1.7561, -1.6786, -1.6967\n                              ]],\n                             [[\n                                 -0.0380, -0.1880, -1.5724, 0.6905, -0.3190,\n                                
 0.7798, -0.3693, -0.9457, -0.2942, -1.8527\n                             ],\n                              [\n                                  1.1773, 1.5009, 2.6399, 5.9242, 1.0962,\n                                  2.7346, 6.0865, 1.5555, 4.3303, 2.8229\n                              ],\n                              [\n                                  -0.6646, -0.6870, -0.1125, -0.2224, -0.3445,\n                                  -1.4049, 0.4990, -0.7037, -0.9924, 0.0386\n                              ]]],\n                            dtype=dtype).to(device)\n    features.requires_grad = True\n\n    output = grouping_operation(features, idx)\n    output.backward(output)\n    grad_features = features.grad\n    expected_output = torch.tensor(\n        [[[[0.5798, 0.5798, 0.5798], [-1.3311, -1.3311, -1.3311],\n           [0.9268, 0.9268, 0.9268], [0.5798, 0.5798, 0.5798],\n           [0.5798, 0.5798, 0.5798], [0.5798, 0.5798, 0.5798]],\n          [[5.4247, 5.4247, 5.4247], [1.4740, 1.4740, 1.4740],\n           [2.1581, 2.1581, 2.1581], [5.4247, 5.4247, 5.4247],\n           [5.4247, 5.4247, 5.4247], [5.4247, 5.4247, 5.4247]],\n          [[-1.6266, -1.6266, -1.6266], [-1.6931, -1.6931, -1.6931],\n           [-1.6786, -1.6786, -1.6786], [-1.6266, -1.6266, -1.6266],\n           [-1.6266, -1.6266, -1.6266], [-1.6266, -1.6266, -1.6266]]],\n         [[[-0.0380, -0.0380, -0.0380], [-0.3693, -0.3693, -0.3693],\n           [-1.8527, -1.8527, -1.8527], [-0.0380, -0.0380, -0.0380],\n           [-0.0380, -0.0380, -0.0380], [-0.0380, -0.0380, -0.0380]],\n          [[1.1773, 1.1773, 1.1773], [6.0865, 6.0865, 6.0865],\n           [2.8229, 2.8229, 2.8229], [1.1773, 1.1773, 1.1773],\n           [1.1773, 1.1773, 1.1773], [1.1773, 1.1773, 1.1773]],\n          [[-0.6646, -0.6646, -0.6646], [0.4990, 0.4990, 0.4990],\n           [0.0386, 0.0386, 0.0386], [-0.6646, -0.6646, -0.6646],\n           [-0.6646, -0.6646, -0.6646], [-0.6646, -0.6646, -0.6646]]]],\n        
dtype=dtype).to(device)\n    expected_grad_features = torch.tensor(\n        [[[\n            6.9576, 0.0000, 0.0000, -3.9933, 0.0000, 0.0000, 0.0000, 0.0000,\n            2.7804, 0.0000\n        ],\n          [\n              65.0964, 0.0000, 0.0000, 4.4220, 0.0000, 0.0000, 0.0000, 0.0000,\n              6.4743, 0.0000\n          ],\n          [\n              -19.5192, 0.0000, 0.0000, -5.0793, 0.0000, 0.0000, 0.0000,\n              0.0000, -5.0358, 0.0000\n          ]],\n         [[\n             -0.4560, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, -1.1079, 0.0000,\n             0.0000, -5.5581\n         ],\n          [\n              14.1276, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 18.2595, 0.0000,\n              0.0000, 8.4687\n          ],\n          [\n              -7.9752, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.4970, 0.0000,\n              0.0000, 0.1158\n          ]]],\n        dtype=dtype).to(device)\n    assert torch.allclose(output, expected_output)\n    assert torch.allclose(grad_features, expected_grad_features)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\n@pytest.mark.parametrize('dtype', [\n    torch.half,\n    torch.float,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MUSA_AVAILABLE,\n            reason='MUSA does not support for 64-bit floating point')),\n])\ndef test_stack_grouping_points(dtype, device):\n    if device == 'npu' and dtype == torch.double:\n        return\n    idx = torch.tensor([[0, 0, 0], [3, 3, 3], [8, 8, 8], [1, 1, 1], [0, 0, 0],\n                        [2, 2, 2], [0, 
0, 0], [6, 6, 6], [9, 9, 9], [0, 0, 0],\n                        [1, 1, 1], [0, 0, 0]]).int().to(device)\n    features = torch.tensor([[\n        0.5798, -0.7981, -0.9280, -1.3311, 1.3687, 0.9277, -0.4164, -1.8274,\n        0.9268, 0.8414\n    ],\n                             [\n                                 5.4247, 1.5113, 2.3944, 1.4740, 5.0300,\n                                 5.1030, 1.9360, 2.1939, 2.1581, 3.4666\n                             ],\n                             [\n                                 -1.6266, -1.0281, -1.0393, -1.6931, -1.3982,\n                                 -0.5732, -1.0830, -1.7561, -1.6786, -1.6967\n                             ],\n                             [\n                                 -0.0380, -0.1880, -1.5724, 0.6905, -0.3190,\n                                 0.7798, -0.3693, -0.9457, -0.2942, -1.8527\n                             ],\n                             [\n                                 1.1773, 1.5009, 2.6399, 5.9242, 1.0962,\n                                 2.7346, 6.0865, 1.5555, 4.3303, 2.8229\n                             ],\n                             [\n                                 -0.6646, -0.6870, -0.1125, -0.2224, -0.3445,\n                                 -1.4049, 0.4990, -0.7037, -0.9924, 0.0386\n                             ]],\n                            dtype=dtype).to(device)\n    features_batch_cnt = torch.tensor([3, 3]).int().to(device)\n    indices_batch_cnt = torch.tensor([6, 6]).int().to(device)\n    output = grouping_operation(features, idx, features_batch_cnt,\n                                indices_batch_cnt)\n    expected_output = torch.tensor(\n        [[[0.5798, 0.5798, 0.5798], [-0.7981, -0.7981, -0.7981],\n          [-0.9280, -0.9280, -0.9280], [-1.3311, -1.3311, -1.3311],\n          [1.3687, 1.3687, 1.3687], [0.9277, 0.9277, 0.9277],\n          [-0.4164, -0.4164, -0.4164], [-1.8274, -1.8274, -1.8274],\n          [0.9268, 0.9268, 0.9268], [0.8414, 0.8414, 
0.8414]],\n         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],\n         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],\n         [[5.4247, 5.4247, 5.4247], [1.5113, 1.5113, 1.5113],\n          [2.3944, 2.3944, 2.3944], [1.4740, 1.4740, 1.4740],\n          [5.0300, 5.0300, 5.0300], [5.1030, 5.1030, 5.1030],\n          [1.9360, 1.9360, 1.9360], [2.1939, 2.1939, 2.1939],\n          [2.1581, 2.1581, 2.1581], [3.4666, 3.4666, 3.4666]],\n         [[0.5798, 0.5798, 0.5798], [-0.7981, -0.7981, -0.7981],\n          [-0.9280, -0.9280, -0.9280], [-1.3311, -1.3311, -1.3311],\n          [1.3687, 1.3687, 1.3687], [0.9277, 0.9277, 0.9277],\n          [-0.4164, -0.4164, -0.4164], [-1.8274, -1.8274, -1.8274],\n          [0.9268, 0.9268, 0.9268], [0.8414, 0.8414, 0.8414]],\n         [[-1.6266, -1.6266, -1.6266], [-1.0281, -1.0281, -1.0281],\n          [-1.0393, -1.0393, -1.0393], [-1.6931, -1.6931, -1.6931],\n          [-1.3982, -1.3982, -1.3982], [-0.5732, -0.5732, -0.5732],\n          [-1.0830, -1.0830, -1.0830], [-1.7561, -1.7561, -1.7561],\n          [-1.6786, -1.6786, -1.6786], [-1.6967, -1.6967, -1.6967]],\n         [[-0.0380, -0.0380, -0.0380], [-0.1880, -0.1880, -0.1880],\n          [-1.5724, -1.5724, -1.5724], [0.6905, 0.6905, 0.6905],\n          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],\n          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],\n          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]],\n         [[0.0000, 
0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],\n         [[0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000],\n          [0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000]],\n         [[-0.0380, -0.0380, -0.0380], [-0.1880, -0.1880, -0.1880],\n          [-1.5724, -1.5724, -1.5724], [0.6905, 0.6905, 0.6905],\n          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],\n          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],\n          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]],\n         [[1.1773, 1.1773, 1.1773], [1.5009, 1.5009, 1.5009],\n          [2.6399, 2.6399, 2.6399], [5.9242, 5.9242, 5.9242],\n          [1.0962, 1.0962, 1.0962], [2.7346, 2.7346, 2.7346],\n          [6.0865, 6.0865, 6.0865], [1.5555, 1.5555, 1.5555],\n          [4.3303, 4.3303, 4.3303], [2.8229, 2.8229, 2.8229]],\n         [[-0.0380, -0.0380, -0.0380], [-0.1880, -0.1880, -0.1880],\n          [-1.5724, -1.5724, -1.5724], [0.6905, 0.6905, 0.6905],\n          [-0.3190, -0.3190, -0.3190], [0.7798, 0.7798, 0.7798],\n          [-0.3693, -0.3693, -0.3693], [-0.9457, -0.9457, -0.9457],\n          [-0.2942, -0.2942, -0.2942], [-1.8527, -1.8527, -1.8527]]],\n        dtype=dtype).to(device)\n    assert torch.allclose(output, expected_output)\n"
  },
  {
    "path": "tests/test_ops/test_info.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\n\n\nclass TestInfo:\n\n    def test_info(self):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import get_compiler_version, get_compiling_cuda_version\n        cv = get_compiler_version()\n        ccv = get_compiling_cuda_version()\n        assert cv is not None\n        assert ccv is not None\n"
  },
  {
    "path": "tests/test_ops/test_iou3d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import boxes_iou3d, boxes_overlap_bev, nms3d, nms3d_normal\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))\n])\ndef test_boxes_overlap_bev(device):\n    np_boxes1 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],\n                            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],\n                            [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0]],\n                           dtype=np.float32)\n    np_boxes2 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],\n                            [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, np.pi / 2],\n                            [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, np.pi / 4]],\n                           dtype=np.float32)\n    np_expect_overlaps = np.asarray(\n        [[4.0, 4.0, (8 + 8 * 2**0.5) /\n          (3 + 2 * 2**0.5)], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0]],\n        dtype=np.float32)\n\n    boxes1 = torch.from_numpy(np_boxes1).to(device)\n    boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n    # test for 3 boxes\n    overlaps = boxes_overlap_bev(boxes1, boxes2)\n    assert np.allclose(overlaps.cpu().numpy(), np_expect_overlaps, atol=1e-4)\n\n    # test for many boxes\n    boxes2 = boxes2.repeat_interleave(555, 0)\n\n    overlaps = boxes_overlap_bev(boxes1, boxes2)\n    assert np.allclose(\n        overlaps.cpu().numpy(), np_expect_overlaps.repeat(555, 1), atol=1e-4)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, 
reason='requires MUSA support'))\n])\ndef test_boxes_iou3d(device):\n    np_boxes1 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],\n                            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],\n                            [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0]],\n                           dtype=np.float32)\n    np_boxes2 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],\n                            [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, np.pi / 2],\n                            [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, np.pi / 4]],\n                           dtype=np.float32)\n    np_expect_ious = np.asarray(\n        [[1.0, 1.0, 1.0 / 2**0.5], [1.0 / 15, 1.0 / 15, 1.0 / 15],\n         [0.0, 0.0, 0.0]],\n        dtype=np.float32)\n\n    boxes1 = torch.from_numpy(np_boxes1).to(device)\n    boxes2 = torch.from_numpy(np_boxes2).to(device)\n\n    ious = boxes_iou3d(boxes1, boxes2)\n    assert np.allclose(ious.cpu().numpy(), np_expect_ious, atol=1e-4)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_nms3d(device):\n    # test for 5 boxes\n    np_boxes = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],\n                           [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],\n                           [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.3],\n                           [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0],\n                           [3.0, 3.2, 3.2, 3.0, 2.0, 2.0, 0.3]],\n                          dtype=np.float32)\n    np_scores = 
np.array([0.6, 0.9, 0.1, 0.2, 0.15], dtype=np.float32)\n    np_inds = np.array([1, 0, 3])\n    boxes = torch.from_numpy(np_boxes)\n    scores = torch.from_numpy(np_scores)\n    inds = nms3d(boxes.to(device), scores.to(device), iou_threshold=0.3)\n\n    assert np.allclose(inds.cpu().numpy(), np_inds)\n\n    # test for many boxes\n    # In the float data type calculation process, float will be converted to\n    # double in CUDA kernel (https://github.com/open-mmlab/mmcv/blob\n    # /master/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp#L61),\n    # always use float in MLU kernel. The difference between the mentioned\n    # above leads to different results.\n    if device != 'mlu':\n        np.random.seed(42)\n        np_boxes = np.random.rand(555, 7).astype(np.float32)\n        np_scores = np.random.rand(555).astype(np.float32)\n        boxes = torch.from_numpy(np_boxes)\n        scores = torch.from_numpy(np_scores)\n        inds = nms3d(boxes.to(device), scores.to(device), iou_threshold=0.3)\n\n        assert len(inds.cpu().numpy()) == 176\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_nms3d_normal(device):\n    # test for 5 boxes\n    np_boxes = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0],\n                           [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0],\n                           [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.3],\n                           [3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 0.0],\n                           [3.0, 3.2, 3.2, 3.0, 2.0, 2.0, 0.3]],\n                          dtype=np.float32)\n    np_scores = np.array([0.6, 0.9, 0.1, 0.2, 
0.15], dtype=np.float32)\n    np_inds = np.array([1, 0, 3])\n    boxes = torch.from_numpy(np_boxes)\n    scores = torch.from_numpy(np_scores)\n    inds = nms3d_normal(boxes.to(device), scores.to(device), iou_threshold=0.3)\n\n    assert np.allclose(inds.cpu().numpy(), np_inds)\n\n    # test for many boxes\n    np.random.seed(42)\n    np_boxes = np.random.rand(555, 7).astype(np.float32)\n    np_scores = np.random.rand(555).astype(np.float32)\n    boxes = torch.from_numpy(np_boxes)\n    scores = torch.from_numpy(np_scores)\n    inds = nms3d_normal(boxes.to(device), scores.to(device), iou_threshold=0.3)\n\n    assert len(inds.cpu().numpy()) == 148\n"
  },
  {
    "path": "tests/test_ops/test_knn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import knn\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_knn(device):\n    new_xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625],\n                             [-2.2769, 2.7817, -0.2334],\n                             [-0.4003, 2.4666, -0.5116],\n                             [-0.0740, 1.3147, -1.3625],\n                             [-0.0740, 1.3147, -1.3625]],\n                            [[-2.0289, 2.4952, -0.1708],\n                             [-2.0668, 6.0278, -0.4875],\n                             [0.4066, 1.4211, -0.2947],\n                             [-2.0289, 2.4952, -0.1708],\n                             [-2.0289, 2.4952, -0.1708]]]).to(device)\n\n    xyz = torch.tensor([[[-0.0740, 1.3147, -1.3625], [0.5555, 1.0399, -1.3634],\n                         [-0.4003, 2.4666,\n                          -0.5116], [-0.5251, 2.4379, -0.8466],\n                         [-0.9691, 1.1418,\n                          -1.3733], [-0.2232, 0.9561, -1.3626],\n                         [-2.2769, 2.7817, -0.2334],\n                         [-0.2822, 1.3192, -1.3645], [0.1533, 1.5024, -1.0432],\n                         [0.4917, 1.1529, -1.3496]],\n                        [[-2.0289, 2.4952,\n                          -0.1708], [-0.7188, 0.9956, -0.5096],\n                         [-2.0668, 6.0278, -0.4875], [-1.9304, 3.3092, 0.6610],\n                         [0.0949, 1.4332, 0.3140], [-1.2879, 2.0008, -0.7791],\n                         [-0.7252, 0.9611, -0.6371], [0.4066, 1.4211, -0.2947],\n                         
[0.3220, 1.4447, 0.3548], [-0.9744, 2.3856,\n                                                    -1.2000]]]).to(device)\n\n    idx = knn(5, xyz, new_xyz)\n    new_xyz_ = new_xyz.unsqueeze(2).repeat(1, 1, xyz.shape[1], 1)\n    xyz_ = xyz.unsqueeze(1).repeat(1, new_xyz.shape[1], 1, 1)\n    dist = ((new_xyz_ - xyz_) * (new_xyz_ - xyz_)).sum(-1)\n    expected_idx = dist.topk(k=5, dim=2, largest=False)[1].transpose(2, 1)\n    assert torch.all(idx == expected_idx)\n\n    idx = knn(5,\n              xyz.transpose(1, 2).contiguous(),\n              new_xyz.transpose(1, 2).contiguous(), True)\n    assert torch.all(idx == expected_idx)\n\n    idx = knn(5, xyz, xyz)\n    xyz_ = xyz.unsqueeze(2).repeat(1, 1, xyz.shape[1], 1)\n    xyz__ = xyz.unsqueeze(1).repeat(1, xyz.shape[1], 1, 1)\n    dist = ((xyz_ - xyz__) * (xyz_ - xyz__)).sum(-1)\n    expected_idx = dist.topk(k=5, dim=2, largest=False)[1].transpose(2, 1)\n    assert torch.all(idx == expected_idx)\n"
  },
  {
    "path": "tests/test_ops/test_masked_conv2d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\nif IS_MLU_AVAILABLE:\n    torch.backends.cnnl.allow_tf32 = False\n    torch.backends.mlu.matmul.allow_tf32 = False\n\n\nclass TestMaskedConv2d:\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_masked_conv2d_all_close(self, device):\n        from mmcv.ops import MaskedConv2d\n        np_input = np.load(\n            'tests/data/for_masked_conv2d/masked_conv2d_for_input.npy')\n        np_mask = np.load(\n            'tests/data/for_masked_conv2d/masked_conv2d_for_mask.npy')\n        np_weight = np.load(\n            'tests/data/for_masked_conv2d/masked_conv2d_for_weight.npy')\n        np_bias = np.load(\n            'tests/data/for_masked_conv2d/masked_conv2d_for_bias.npy')\n        np_output = np.load(\n            'tests/data/for_masked_conv2d/masked_conv2d_for_output.npy')\n        input = torch.tensor(np_input, dtype=torch.float, device=device)\n        mask = torch.tensor(np_mask, dtype=torch.float, device=device)\n        weight = torch.tensor(np_weight, dtype=torch.float, device=device)\n        bias = torch.tensor(np_bias, dtype=torch.float, device=device)\n        conv = MaskedConv2d(3, 3, 3, 1, 1).to(device)\n        conv.weight = torch.nn.Parameter(weight)\n        conv.bias = torch.nn.Parameter(bias)\n        output = conv(input, mask)\n        assert 
np.allclose(output.data.cpu().numpy(), np_output, 1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_merge_cells.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\n\"\"\"\nCommandLine:\n    pytest tests/test_merge_cells.py\n\"\"\"\nimport math\n\nimport pytest\nimport torch\nimport torch.nn.functional as F\n\nfrom mmcv.ops.merge_cells import (BaseMergeCell, ConcatCell, GlobalPoolingCell,\n                                  SumCell)\n\n\n# All size (14, 7) below is to test the situation that\n# the input size can't be divisible by the target size.\n@pytest.mark.parametrize(\n    'inputs_x, inputs_y',\n    [(torch.randn([2, 256, 16, 16]), torch.randn([2, 256, 32, 32])),\n     (torch.randn([2, 256, 14, 7]), torch.randn([2, 256, 32, 32]))])\ndef test_sum_cell(inputs_x, inputs_y):\n    sum_cell = SumCell(256, 256)\n    output = sum_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:])\n    assert output.size() == inputs_x.size()\n    output = sum_cell(inputs_x, inputs_y, out_size=inputs_y.shape[-2:])\n    assert output.size() == inputs_y.size()\n    output = sum_cell(inputs_x, inputs_y)\n    assert output.size() == inputs_y.size()\n\n\n@pytest.mark.parametrize(\n    'inputs_x, inputs_y',\n    [(torch.randn([2, 256, 16, 16]), torch.randn([2, 256, 32, 32])),\n     (torch.randn([2, 256, 14, 7]), torch.randn([2, 256, 32, 32]))])\ndef test_concat_cell(inputs_x, inputs_y):\n    concat_cell = ConcatCell(256, 256)\n    output = concat_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:])\n    assert output.size() == inputs_x.size()\n    output = concat_cell(inputs_x, inputs_y, out_size=inputs_y.shape[-2:])\n    assert output.size() == inputs_y.size()\n    output = concat_cell(inputs_x, inputs_y)\n    assert output.size() == inputs_y.size()\n\n\n@pytest.mark.parametrize(\n    'inputs_x, inputs_y',\n    [(torch.randn([2, 256, 16, 16]), torch.randn([2, 256, 32, 32])),\n     (torch.randn([2, 256, 14, 7]), torch.randn([2, 256, 32, 32]))])\ndef test_global_pool_cell(inputs_x, inputs_y):\n    gp_cell = GlobalPoolingCell(with_out_conv=False)\n    gp_cell_out = gp_cell(inputs_x, 
inputs_y, out_size=inputs_x.shape[-2:])\n    assert (gp_cell_out.size() == inputs_x.size())\n    gp_cell = GlobalPoolingCell(256, 256)\n    gp_cell_out = gp_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:])\n    assert (gp_cell_out.size() == inputs_x.size())\n\n\n@pytest.mark.parametrize('target_size', [(256, 256), (128, 128), (64, 64),\n                                         (14, 7)])\ndef test_resize_methods(target_size):\n    inputs_x = torch.randn([2, 256, 128, 128])\n    h, w = inputs_x.shape[-2:]\n    target_h, target_w = target_size\n    if (h <= target_h) or w <= target_w:\n        rs_mode = 'upsample'\n    else:\n        rs_mode = 'downsample'\n\n    if rs_mode == 'upsample':\n        upsample_methods_list = ['nearest', 'bilinear']\n        for method in upsample_methods_list:\n            merge_cell = BaseMergeCell(upsample_mode=method)\n            merge_cell_out = merge_cell._resize(inputs_x, target_size)\n            gt_out = F.interpolate(inputs_x, size=target_size, mode=method)\n            assert merge_cell_out.equal(gt_out)\n    elif rs_mode == 'downsample':\n        merge_cell = BaseMergeCell()\n        merge_cell_out = merge_cell._resize(inputs_x, target_size)\n        if h % target_h != 0 or w % target_w != 0:\n            pad_h = math.ceil(h / target_h) * target_h - h\n            pad_w = math.ceil(w / target_w) * target_w - w\n            pad_l = pad_w // 2\n            pad_r = pad_w - pad_l\n            pad_t = pad_h // 2\n            pad_b = pad_h - pad_t\n            pad = (pad_l, pad_r, pad_t, pad_b)\n            inputs_x = F.pad(inputs_x, pad, mode='constant', value=0.0)\n        kernel_size = (inputs_x.shape[-2] // target_h,\n                       inputs_x.shape[-1] // target_w)\n        gt_out = F.max_pool2d(\n            inputs_x, kernel_size=kernel_size, stride=kernel_size)\n        print(merge_cell_out.shape, gt_out.shape)\n        assert (merge_cell_out == gt_out).all()\n        assert merge_cell_out.shape[-2:] == 
target_size\n"
  },
  {
    "path": "tests/test_ops/test_min_area_polygons.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import min_area_polygons\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\nnp_pointsets = np.asarray([[\n    1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0, 1.0, 2.0, 3.0, 3.0,\n    2.0, 1.5, 1.5\n],\n                           [\n                               1.0, 1.0, 8.0, 8.0, 1.0, 2.0, 2.0, 1.0, 1.0,\n                               3.0, 3.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.5, 1.5\n                           ]])\n\nexpected_polygons = np.asarray(\n    [[3.0000, 1.0000, 1.0000, 1.0000, 1.0000, 3.0000, 3.0000, 3.0000],\n     [8.0, 8.0, 2.3243, 0.0541, 0.0541, 1.6757, 5.7297, 9.6216]])\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_min_area_polygons(device):\n    pointsets = torch.from_numpy(np_pointsets).to(device).float()\n\n    assert np.allclose(\n        min_area_polygons(pointsets).cpu().numpy(),\n        expected_polygons,\n        atol=1e-4)\n"
  },
  {
    "path": "tests/test_ops/test_modulated_deform_conv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\n\nimport numpy\nimport pytest\nimport torch\nfrom mmengine.utils import digit_version\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\nif IS_MUSA_AVAILABLE:\n    try:\n        from torch_musa.core.amp import autocast\n    except ImportError:\n        pass\nelse:\n    try:\n        # If PyTorch version >= 1.6.0 and fp16 is enabled,\n        # torch.cuda.amp.autocast would be imported and used;\n        # we should test if our modules support it.\n        from torch.cuda.amp import autocast\n    except ImportError:\n        pass\n\ncur_dir = os.path.dirname(os.path.abspath(__file__))\n\ninput_t = [[[[1., 2., 3.], [1., 2., 3.], [1., 2., 3.]]]]\noutput_t = [[[[0.5, 1.5, 2.5, 1.5], [1.0, 3.0, 5.0, 3.0], [1.0, 3.0, 5.0, 3.0],\n              [0.5, 1.5, 2.5, 1.5]]]]\ninput_grad = [[[[2., 2., 2.], [2., 2., 2.], [2., 2., 2.]]]]\ndcn_w_grad = [[[[9., 9.], [9., 9.]]]]\ndcn_offset_w_grad = [[[[-7.0, -4.0], [0.0, 0.0]]], [[[-9.0, 7.5], [-6.0,\n                                                                   5.0]]],\n                     [[[-4.0, -7.0], [0.0, 0.0]]],\n                     [[[-7.5, -9.0], [-5.0, -6.0]]],\n                     [[[-7.0, -4.0], [-7.0, -4.0]]],\n                     [[[-6.0, 5.0], [-9.0, 7.5]]],\n                     [[[-4.0, -7.0], [-4.0, -7.0]]],\n                     [[[-5.0, -6.0], [-7.5, -9.0]]], [[[10.5, 6.0], [7.0,\n                                                                     4.0]]],\n                     [[[6.0, 10.5], [4.0, 7.0]]], [[[7.0, 4.0], [10.5, 6.0]]],\n                     [[[4.0, 7.0], [6.0, 10.5]]]]\ndcn_offset_b_grad = [\n    -3.0, -1.5, -3.0, -1.5, -3.0, -1.5, -3.0, -1.5, 4.5, 4.5, 4.5, 4.5\n]\n\n\nclass TestMdconv:\n\n    def _test_mdconv(self, dtype=torch.float, device='cuda'):\n        if not torch.cuda.is_available() and device == 'cuda':\n            
pytest.skip('test requires GPU')\n        if device == 'mlu':\n            from mmcv.ops import \\\n                ModulatedDeformConv2dPack_MLU as ModulatedDeformConv2dPack\n        else:\n            from mmcv.ops import ModulatedDeformConv2dPack\n\n        input = torch.tensor(input_t, dtype=dtype, device=device)\n        input.requires_grad = True\n\n        dcn = ModulatedDeformConv2dPack(\n            1,\n            1,\n            kernel_size=(2, 2),\n            stride=1,\n            padding=1,\n            deform_groups=1,\n            bias=False).to(device)\n\n        dcn.weight.data.fill_(1.)\n        dcn.type(dtype)\n        output = dcn(input)\n        output.sum().backward()\n        assert numpy.allclose(output.cpu().detach().numpy(), output_t, 1e-2)\n        assert numpy.allclose(input.grad.cpu().detach().numpy(), input_grad,\n                              1e-2)\n        assert numpy.allclose(dcn.weight.grad.cpu().detach().numpy(),\n                              dcn_w_grad, 1e-2)\n        assert numpy.allclose(\n            dcn.conv_offset.weight.grad.cpu().detach().numpy(),\n            dcn_offset_w_grad, 1e-2)\n        assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),\n                              dcn_offset_b_grad, 1e-2)\n\n    def _test_amp_mdconv(self, input_dtype=torch.float, device='cuda'):\n        \"\"\"The function to test amp released on pytorch 1.6.0.\n\n        The type of input data might be torch.float or torch.half,\n        so we should test mdconv in both cases. 
With amp, the data\n        type of model will NOT be set manually.\n\n        Args:\n            input_dtype: torch.float or torch.half.\n        \"\"\"\n        if not torch.cuda.is_available() and device == 'cuda':\n            return\n        if device == 'mlu':\n            from mmcv.ops import \\\n                ModulatedDeformConv2dPack_MLU as ModulatedDeformConv2dPack\n        else:\n            from mmcv.ops import ModulatedDeformConv2dPack\n\n        input = torch.tensor(input_t).to(device).type(input_dtype)\n        input.requires_grad = True\n\n        dcn = ModulatedDeformConv2dPack(\n            1,\n            1,\n            kernel_size=(2, 2),\n            stride=1,\n            padding=1,\n            deform_groups=1,\n            bias=False).to(device)\n        dcn.weight.data.fill_(1.)\n        output = dcn(input)\n        output.sum().backward()\n        assert numpy.allclose(output.cpu().detach().numpy(), output_t, 1e-2)\n        assert numpy.allclose(input.grad.cpu().detach().numpy(), input_grad,\n                              1e-2)\n        assert numpy.allclose(dcn.weight.grad.cpu().detach().numpy(),\n                              dcn_w_grad, 1e-2)\n        assert numpy.allclose(\n            dcn.conv_offset.weight.grad.cpu().detach().numpy(),\n            dcn_offset_w_grad, 1e-2)\n        assert numpy.allclose(dcn.conv_offset.bias.grad.cpu().detach().numpy(),\n                              dcn_offset_b_grad, 1e-2)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    
])\n    def test_mdconv_float(self, device):\n        self._test_mdconv(dtype=torch.float, device=device)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    ])\n    def test_mdconv_double(self, device):\n        self._test_mdconv(dtype=torch.double, device=device)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_mdconv_half(self, device):\n        self._test_mdconv(torch.half, device=device)\n        # test amp when torch version >= '1.6.0', the type of\n        # input data for mdconv might be torch.float or torch.half\n        if (TORCH_VERSION != 'parrots'\n                and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):\n            with autocast(enabled=True):\n                self._test_amp_mdconv(torch.float, device=device)\n                self._test_amp_mdconv(torch.half, device=device)\n"
  },
  {
    "path": "tests/test_ops/test_ms_deformable_attn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops.multi_scale_deform_attn import (\n    MultiScaleDeformableAttention, MultiScaleDeformableAttnFunction,\n    multi_scale_deformable_attn_pytorch)\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n_USING_PARROTS = True\n_IS_AUTOCAST_AVAILABLE = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n    _USING_PARROTS = False\n\nif IS_MUSA_AVAILABLE:\n    try:\n        from torch.musa.amp import autocast\n    except ImportError:\n        _IS_AUTOCAST_AVAILABLE = False\n        pass\nelse:\n    try:\n        # If PyTorch version >= 1.6.0 and fp16 is enabled,\n        # torch.cuda.amp.autocast would be imported and used;\n        # we should test if our modules support it.\n        from torch.cuda.amp import autocast\n    except ImportError:\n        _IS_AUTOCAST_AVAILABLE = False\n        pass\n\n\n@pytest.mark.parametrize('device', [\n    'cpu',\n    pytest.param(\n        'cuda:0',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_multiscale_deformable_attention(device):\n    with pytest.raises(ValueError):\n        # embed_dims must be divisible by num_heads,\n        MultiScaleDeformableAttention(\n            embed_dims=256,\n            num_heads=7,\n        )\n    device = torch.device(device)\n    msda = MultiScaleDeformableAttention(\n        embed_dims=3, num_levels=2, num_heads=3)\n    msda.init_weights()\n    num_query = 5\n    bs = 1\n    embed_dims = 3\n    query = 
torch.rand(num_query, bs, embed_dims).to(device)\n    key = torch.rand(num_query, bs, embed_dims).to(device)\n    spatial_shapes = torch.Tensor([[2, 2], [1, 1]]).long().to(device)\n    level_start_index = torch.Tensor([0, 4]).long().to(device)\n    reference_points = torch.rand(bs, num_query, 2, 2).to(device)\n    msda.to(device)\n    msda(\n        query,\n        key,\n        key,\n        reference_points=reference_points,\n        spatial_shapes=spatial_shapes,\n        level_start_index=level_start_index)\n\n    # test with value_proj_ratio\n    embed_dims = 6\n    value_proj_ratio = 0.5\n    query = torch.rand(num_query, bs, embed_dims).to(device)\n    key = torch.rand(num_query, bs, embed_dims).to(device)\n    msda = MultiScaleDeformableAttention(\n        embed_dims=embed_dims,\n        num_levels=2,\n        num_heads=3,\n        value_proj_ratio=value_proj_ratio)\n    msda.init_weights()\n    msda.to(device)\n    msda(\n        query,\n        key,\n        key,\n        reference_points=reference_points,\n        spatial_shapes=spatial_shapes,\n        level_start_index=level_start_index)\n\n\ndef test_forward_multi_scale_deformable_attn_pytorch():\n    N, M, D = 1, 2, 2\n    Lq, L, P = 2, 2, 2\n    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)\n    S = sum((H * W).item() for H, W in shapes)\n\n    torch.manual_seed(3)\n    value = torch.rand(N, S, M, D) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2)\n    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n            -2, keepdim=True)\n\n    multi_scale_deformable_attn_pytorch(value.double(), shapes,\n                                        sampling_locations.double(),\n                                        attention_weights.double()).detach()\n\n\n@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')\ndef test_forward_equal_with_pytorch_double():\n    N, M, D = 1, 
2, 2\n    Lq, L, P = 2, 2, 2\n    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)\n    level_start_index = torch.cat((shapes.new_zeros(\n        (1, )), shapes.prod(1).cumsum(0)[:-1]))\n    S = sum((H * W).item() for H, W in shapes)\n\n    torch.manual_seed(3)\n    value = torch.rand(N, S, M, D) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2)\n    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n            -2, keepdim=True)\n    im2col_step = 2\n    output_pytorch = multi_scale_deformable_attn_pytorch(\n        value.double(), shapes, sampling_locations.double(),\n        attention_weights.double()).detach().cpu()\n\n    output_cuda = MultiScaleDeformableAttnFunction.apply(\n        value.cuda().double(), shapes.cuda(), level_start_index.cuda(),\n        sampling_locations.cuda().double(),\n        attention_weights.cuda().double(), im2col_step).detach().cpu()\n    assert torch.allclose(output_cuda, output_pytorch)\n    max_abs_err = (output_cuda - output_pytorch).abs().max()\n    max_rel_err = ((output_cuda - output_pytorch).abs() /\n                   output_pytorch.abs()).max()\n    assert max_abs_err < 1e-18\n    assert max_rel_err < 1e-15\n\n\n@pytest.mark.skipif(not IS_NPU_AVAILABLE, reason='requires NPU support')\ndef test_forward_equal_with_pytorch_npu():\n    N, M, D = 6, 4, 8\n    Lq, L, P = 10000, 4, 8\n    shapes = torch.as_tensor([(60, 40), (30, 20), (16, 24), (53, 32)],\n                             dtype=torch.int32)\n    level_start_index = torch.cat((shapes.new_zeros(\n        (1, )), shapes.prod(1).cumsum(0)[:-1]))\n    S = sum((H * W).item() for H, W in shapes)\n\n    torch.manual_seed(3)\n    value = torch.rand(N, S, M, D) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2)\n    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n         
   -2, keepdim=True)\n    im2col_step = 2\n    output_pytorch = multi_scale_deformable_attn_pytorch(\n        value.float(), shapes, sampling_locations.float(),\n        attention_weights.float()).detach().cpu()\n\n    output_npu = MultiScaleDeformableAttnFunction.apply(\n        value.npu().float(), shapes.npu(), level_start_index.npu(),\n        sampling_locations.npu().float(),\n        attention_weights.npu().float(), im2col_step).detach().cpu()\n    assert torch.allclose(output_npu, output_pytorch)\n    max_abs_err = (output_npu - output_pytorch).abs().max()\n    max_rel_err = ((output_npu - output_pytorch).abs() /\n                   output_pytorch.abs()).max()\n    assert max_abs_err < 1e-18\n    assert max_rel_err < 1e-15\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_forward_equal_with_pytorch_float(device):\n    N, M, D = 1, 2, 2\n    Lq, L, P = 2, 2, 2\n    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)\n    level_start_index = torch.cat((shapes.new_zeros(\n        (1, )), shapes.prod(1).cumsum(0)[:-1]))\n    S = sum((H * W).item() for H, W in shapes)\n\n    torch.manual_seed(3)\n    value = torch.rand(N, S, M, D) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2)\n    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n            -2, keepdim=True)\n    im2col_step = 2\n    output_pytorch = multi_scale_deformable_attn_pytorch(\n        value, shapes, sampling_locations, attention_weights).detach().cpu()\n\n    output_device 
= MultiScaleDeformableAttnFunction.apply(\n        value.to(device), shapes.to(device), level_start_index.to(device),\n        sampling_locations.to(device), attention_weights.to(device),\n        im2col_step).detach().cpu()\n    assert torch.allclose(output_device, output_pytorch, rtol=1e-2, atol=1e-3)\n    max_abs_err = (output_device - output_pytorch).abs().max()\n    max_rel_err = ((output_device - output_pytorch).abs() /\n                   output_pytorch.abs()).max()\n    assert max_abs_err < 1e-9\n    assert max_rel_err < 1e-6\n\n\n@pytest.mark.skipif(\n    not _IS_AUTOCAST_AVAILABLE, reason='requires autocast support')\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_forward_equal_with_autocast(device):\n    N, M, D = 1, 2, 2\n    Lq, L, P = 2, 2, 2\n    shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)\n    level_start_index = torch.cat((shapes.new_zeros(\n        (1, )), shapes.prod(1).cumsum(0)[:-1]))\n    S = sum((H * W).item() for H, W in shapes)\n\n    torch.manual_seed(3)\n    value = torch.rand(N, S, M, D) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2)\n    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n            -2, keepdim=True)\n    im2col_step = 2\n    output_pytorch = multi_scale_deformable_attn_pytorch(\n        value, shapes, sampling_locations, attention_weights).detach().cpu()\n\n    # float test\n    dtype = torch.float\n    with autocast(enabled=True):\n        output_device = MultiScaleDeformableAttnFunction.apply(\n            value.to(device).type(dtype), shapes.to(device),\n            level_start_index.to(device), 
sampling_locations.to(device),\n            attention_weights.to(device), im2col_step).detach().cpu()\n    assert torch.allclose(output_device, output_pytorch, rtol=1e-2, atol=1e-3)\n    max_abs_err = (output_device - output_pytorch).abs().max()\n    max_rel_err = ((output_device - output_pytorch).abs() /\n                   output_pytorch.abs()).max()\n    assert max_abs_err < 1e-9\n    assert max_rel_err < 1e-6\n\n    # half test\n    dtype = torch.half\n    with autocast(enabled=True):\n        output_device = MultiScaleDeformableAttnFunction.apply(\n            value.to(device).type(dtype), shapes.to(device),\n            level_start_index.to(device), sampling_locations.to(device),\n            attention_weights.to(device), im2col_step).detach().cpu()\n    assert torch.allclose(\n        output_device, output_pytorch.half(), rtol=1e-2, atol=1e-3)\n    max_abs_err = (output_device - output_pytorch).abs().max()\n    max_rel_err = ((output_device - output_pytorch).abs() /\n                   output_pytorch.abs()).max()\n    assert max_abs_err < 1e-5\n    assert max_rel_err < 1e-2\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\n@pytest.mark.parametrize('dtype', [\n    torch.float,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,\n            reason='MLU, MUSA does not support for 64-bit floating point')),\n    torch.half\n])\n@pytest.mark.parametrize('channels', [\n    4,\n    30,\n    32,\n    64,\n    71,\n    1025,\n])\ndef test_gradient_numerical(channels,\n                
            device,\n                            dtype,\n                            grad_value=True,\n                            grad_sampling_loc=True,\n                            grad_attn_weight=True):\n\n    N, M, _ = 1, 2, 2\n    Lq, L, P = 2, 2, 2\n    shapes = torch.as_tensor([(3, 2), (2, 1)], dtype=torch.long).to(device)\n    level_start_index = torch.cat((shapes.new_zeros(\n        (1, )), shapes.prod(1).cumsum(0)[:-1]))\n    S = sum((H * W).item() for H, W in shapes)\n\n    value = torch.rand(N, S, M, channels).to(device) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2).to(device)\n    attention_weights = torch.rand(N, Lq, M, L, P).to(device) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n            -2, keepdim=True)\n    im2col_step = 2\n\n    func = MultiScaleDeformableAttnFunction.apply\n\n    value.requires_grad = grad_value\n    sampling_locations.requires_grad = grad_sampling_loc\n    attention_weights.requires_grad = grad_attn_weight\n    if device == 'cuda':\n        dtype = torch.double\n        eps = 1e-6\n    elif device == 'mlu':\n        dtype = torch.float\n        eps = 1e-4\n    elif device == 'musa':\n        dtype = torch.float\n        eps = 1e-4\n    if _USING_PARROTS:\n        assert gradcheck(\n            func, (value.to(dtype), shapes, level_start_index,\n                   sampling_locations.to(dtype), attention_weights.to(dtype),\n                   im2col_step),\n            no_grads=[shapes, level_start_index],\n            eps=eps)\n    else:\n        assert gradcheck(\n            func, (value.to(dtype), shapes, level_start_index,\n                   sampling_locations.to(dtype), attention_weights.to(dtype),\n                   im2col_step),\n            eps=eps,\n            atol=1e-2)\n\n\n@pytest.mark.skipif(not IS_NPU_AVAILABLE, reason='requires NPU support')\ndef test_backward_equal_with_pytorch_npu():\n    N, M, D = 6, 4, 8\n    Lq, L, P = 10000, 4, 8\n    
shapes = torch.as_tensor([(60, 40), (30, 20), (16, 24), (53, 32)],\n                             dtype=torch.int32)\n    level_start_index = torch.cat((shapes.new_zeros(\n        (1, )), shapes.prod(1).cumsum(0)[:-1]))\n    S = sum((H * W).item() for H, W in shapes)\n\n    torch.manual_seed(3)\n    value = torch.rand(N, S, M, D) * 0.01\n    sampling_locations = torch.rand(N, Lq, M, L, P, 2)\n    attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5\n    attention_weights /= attention_weights.sum(\n        -1, keepdim=True).sum(\n            -2, keepdim=True)\n    im2col_step = 2\n    value.requires_grad = True\n    sampling_locations.requires_grad = True\n    attention_weights.requires_grad = True\n    output_pytorch = multi_scale_deformable_attn_pytorch(\n        value.float(), shapes, sampling_locations.float(),\n        attention_weights.float())\n    grad_output_pytorch = torch.ones_like(output_pytorch)\n    output_pytorch.backward(grad_output_pytorch)\n    grad_value = value.grad.detach().cpu()\n    grad_location = sampling_locations.grad.detach().cpu()\n    grad_attn_weight = attention_weights.grad.detach().cpu()\n\n    value_npu = value.npu()\n    shapes_npu = shapes.npu()\n    level_start_index_npu = level_start_index.npu()\n    sampling_locations_npu = sampling_locations.npu()\n    attention_weights_npu = attention_weights.npu()\n    output_npu = MultiScaleDeformableAttnFunction.apply(\n        value_npu.float(), shapes_npu, level_start_index_npu,\n        sampling_locations_npu.float(), attention_weights_npu.float(),\n        im2col_step)\n    grad_output_npu = torch.ones_like(output_npu)\n    output_npu.backward(grad_output_npu)\n    grad_value_npu = value_npu.grad.detach().cpu()\n    grad_location_npu = sampling_locations_npu.grad.detach().cpu()\n    grad_attn_weight_npu = attention_weights_npu.grad.detach().cpu()\n    assert torch.allclose(grad_value_npu, grad_value)\n    max_abs_err_1 = (grad_value_npu - grad_value).abs().max()\n    max_rel_err_1 = 
((grad_value_npu - grad_value).abs() /\n                     grad_value.abs()).max()\n    assert max_abs_err_1 < 1e-5\n    assert max_rel_err_1 < 1e-4\n    assert torch.allclose(grad_location_npu, grad_location)\n    max_abs_err_2 = (grad_location_npu - grad_location).abs().max()\n    max_rel_err_2 = ((grad_location_npu - grad_location).abs() /\n                     grad_location.abs()).max()\n    assert max_abs_err_2 < 1e-5\n    assert max_rel_err_2 < 1e-4\n    assert torch.allclose(grad_attn_weight_npu, grad_attn_weight)\n    max_abs_err_3 = (grad_attn_weight_npu - grad_attn_weight).abs().max()\n    max_rel_err_3 = ((grad_attn_weight_npu - grad_attn_weight).abs() /\n                     grad_attn_weight.abs()).max()\n    assert max_abs_err_3 < 1e-5\n    assert max_rel_err_3 < 1e-4\n"
  },
  {
    "path": "tests/test_ops/test_nms.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport mmengine\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\n\nclass Testnms:\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n    ])\n    def test_nms_allclose(self, device):\n        from mmcv.ops import nms\n        np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],\n                             [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],\n                            dtype=np.float32)\n        np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)\n        np_inds = np.array([1, 0, 3])\n        np_dets = np.array([[3.0, 6.0, 9.0, 11.0, 0.9],\n                            [6.0, 3.0, 8.0, 7.0, 0.6],\n                            [1.0, 4.0, 13.0, 7.0, 0.2]])\n        boxes = torch.from_numpy(np_boxes)\n        scores = torch.from_numpy(np_scores)\n        dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)\n        assert np.allclose(dets, np_dets)  # test cpu\n        assert np.allclose(inds, np_inds)  # test cpu\n        dets, inds = nms(\n            boxes.to(device), scores.to(device), iou_threshold=0.3, offset=0)\n        assert np.allclose(dets.cpu().numpy(), np_dets)  # test gpu\n        assert np.allclose(inds.cpu().numpy(), np_inds)  # test gpu\n\n    def test_softnms_allclose(self):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import soft_nms\n        np_boxes = np.array([[6.0, 
3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],\n                             [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],\n                            dtype=np.float32)\n        np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)\n\n        np_output = {\n            'linear': {\n                'dets':\n                np.array(\n                    [[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],\n                     [3., 7., 10., 12., 0.29024392], [1., 4., 13., 7., 0.2]],\n                    dtype=np.float32),\n                'inds':\n                np.array([1, 0, 2, 3], dtype=np.int64)\n            },\n            'gaussian': {\n                'dets':\n                np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.59630775],\n                          [3., 7., 10., 12., 0.35275510],\n                          [1., 4., 13., 7., 0.18650459]],\n                         dtype=np.float32),\n                'inds':\n                np.array([1, 0, 2, 3], dtype=np.int64)\n            },\n            'naive': {\n                'dets':\n                np.array([[3., 6., 9., 11., 0.9], [6., 3., 8., 7., 0.6],\n                          [1., 4., 13., 7., 0.2]],\n                         dtype=np.float32),\n                'inds':\n                np.array([1, 0, 3], dtype=np.int64)\n            }\n        }\n\n        boxes = torch.from_numpy(np_boxes)\n        scores = torch.from_numpy(np_scores)\n\n        configs = [[0.3, 0.5, 0.01, 'linear'], [0.3, 0.5, 0.01, 'gaussian'],\n                   [0.3, 0.5, 0.01, 'naive']]\n\n        for iou, sig, mscore, m in configs:\n            dets, inds = soft_nms(\n                boxes,\n                scores,\n                iou_threshold=iou,\n                sigma=sig,\n                min_score=mscore,\n                method=m)\n            assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])\n            assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])\n\n        if torch.__version__ != 
'parrots':\n            boxes = boxes.cuda()\n            scores = scores.cuda()\n            for iou, sig, mscore, m in configs:\n                dets, inds = soft_nms(\n                    boxes,\n                    scores,\n                    iou_threshold=iou,\n                    sigma=sig,\n                    min_score=mscore,\n                    method=m)\n                assert np.allclose(dets.cpu().numpy(), np_output[m]['dets'])\n                assert np.allclose(inds.cpu().numpy(), np_output[m]['inds'])\n\n    def test_nms_match(self):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import nms, nms_match\n        iou_thr = 0.6\n        # empty input\n        empty_dets = np.array([])\n        assert len(nms_match(empty_dets, iou_thr)) == 0\n\n        # non empty ndarray input\n        np_dets = np.array(\n            [[49.1, 32.4, 51.0, 35.9, 0.9], [49.3, 32.9, 51.0, 35.3, 0.9],\n             [35.3, 11.5, 39.9, 14.5, 0.4], [35.2, 11.7, 39.7, 15.7, 0.3]],\n            dtype=np.float32)\n        np_groups = nms_match(np_dets, iou_thr)\n        assert isinstance(np_groups[0], np.ndarray)\n        assert len(np_groups) == 2\n        tensor_dets = torch.from_numpy(np_dets)\n        boxes = tensor_dets[:, :4]\n        scores = tensor_dets[:, 4]\n        nms_keep_inds = nms(boxes.contiguous(), scores.contiguous(),\n                            iou_thr)[1]\n        assert {g[0].item() for g in np_groups} == set(nms_keep_inds.tolist())\n\n        # non empty tensor input\n        tensor_dets = torch.from_numpy(np_dets)\n        tensor_groups = nms_match(tensor_dets, iou_thr)\n        assert isinstance(tensor_groups[0], torch.Tensor)\n        for i in range(len(tensor_groups)):\n            assert np.equal(tensor_groups[i].numpy(), np_groups[i]).all()\n\n        # input of wrong shape\n        wrong_dets = np.zeros((2, 3))\n        with pytest.raises(AssertionError):\n            nms_match(wrong_dets, iou_thr)\n\n    def 
test_batched_nms(self):\n        from mmcv.ops import batched_nms\n        results = mmengine.load('./tests/data/batched_nms_data.pkl')\n\n        nms_max_num = 100\n        nms_cfg = dict(\n            type='nms',\n            iou_threshold=0.7,\n            score_threshold=0.5,\n            max_num=nms_max_num)\n        boxes, keep = batched_nms(\n            torch.from_numpy(results['boxes']),\n            torch.from_numpy(results['scores']),\n            torch.from_numpy(results['idxs']),\n            nms_cfg,\n            class_agnostic=False)\n\n        nms_cfg.update(split_thr=100)\n        seq_boxes, seq_keep = batched_nms(\n            torch.from_numpy(results['boxes']),\n            torch.from_numpy(results['scores']),\n            torch.from_numpy(results['idxs']),\n            nms_cfg,\n            class_agnostic=False)\n\n        assert torch.equal(keep, seq_keep)\n        assert torch.equal(boxes, seq_boxes)\n        assert torch.equal(keep,\n                           torch.from_numpy(results['keep'][:nms_max_num]))\n\n        nms_cfg = dict(type='soft_nms', iou_threshold=0.7)\n        boxes, keep = batched_nms(\n            torch.from_numpy(results['boxes']),\n            torch.from_numpy(results['scores']),\n            torch.from_numpy(results['idxs']),\n            nms_cfg,\n            class_agnostic=False)\n\n        nms_cfg.update(split_thr=100)\n        seq_boxes, seq_keep = batched_nms(\n            torch.from_numpy(results['boxes']),\n            torch.from_numpy(results['scores']),\n            torch.from_numpy(results['idxs']),\n            nms_cfg,\n            class_agnostic=False)\n\n        assert torch.equal(keep, seq_keep)\n        assert torch.equal(boxes, seq_boxes)\n\n        # test skip nms when `nms_cfg` is None\n        seq_boxes, seq_keep = batched_nms(\n            torch.from_numpy(results['boxes']),\n            torch.from_numpy(results['scores']),\n            torch.from_numpy(results['idxs']),\n            None,\n         
   class_agnostic=False)\n        assert len(seq_keep) == len(results['boxes'])\n        # assert score is descending order\n        assert ((seq_boxes[:, -1][1:] - seq_boxes[:, -1][:-1]) < 0).all()\n"
  },
  {
    "path": "tests/test_ops/test_nms_quadri.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\n\nclass TestNMSQuadri:\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_ml_nms_quadri(self, device):\n        from mmcv.ops import nms_quadri\n        np_boxes = np.array([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0, 0.7],\n                             [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0, 0.8],\n                             [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0, 0.5],\n                             [0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.9]],\n                            dtype=np.float32)\n        np_labels = np.array([1, 0, 1, 0], dtype=np.float32)\n\n        np_expect_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],\n                                   [2., 2., 3., 4., 4., 2., 3., 1.],\n                                   [7., 7., 8., 8., 9., 7., 8., 6.]],\n                                  dtype=np.float32)\n        np_expect_keep_inds = np.array([3, 1, 2], dtype=np.int64)\n\n        boxes = torch.from_numpy(np_boxes).to(device)\n        labels = torch.from_numpy(np_labels).to(device)\n\n        dets, keep_inds = nms_quadri(boxes[:, :8], boxes[:, -1], 0.3, labels)\n\n        assert np.allclose(dets.cpu().numpy()[:, :8], np_expect_dets)\n        assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        
pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_nms_quadri(self, device):\n        from mmcv.ops import nms_quadri\n        np_boxes = np.array([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0, 0.7],\n                             [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0, 0.8],\n                             [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0, 0.5],\n                             [0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.9]],\n                            dtype=np.float32)\n\n        np_expect_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],\n                                   [2., 2., 3., 4., 4., 2., 3., 1.],\n                                   [7., 7., 8., 8., 9., 7., 8., 6.]],\n                                  dtype=np.float32)\n        np_expect_keep_inds = np.array([3, 1, 2], dtype=np.int64)\n\n        boxes = torch.from_numpy(np_boxes).to(device)\n\n        dets, keep_inds = nms_quadri(boxes[:, :8], boxes[:, -1], 0.3)\n        assert np.allclose(dets.cpu().numpy()[:, :8], np_expect_dets)\n        assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)\n\n    @pytest.mark.parametrize('device', [\n        'cpu',\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_batched_nms(self, device):\n        # test batched_nms with nms_quadri\n        from mmcv.ops import batched_nms\n\n        np_boxes = np.array([[1.0, 1.0, 3.0, 4.0, 4.0, 4.0, 4.0, 1.0, 0.7],\n                             [2.0, 2.0, 3.0, 4.0, 4.0, 2.0, 3.0, 1.0, 0.8],\n                             [7.0, 7.0, 8.0, 8.0, 9.0, 7.0, 8.0, 6.0, 0.5],\n                             [0.0, 0.0, 0.0, 2.0, 2.0, 
2.0, 2.0, 0.0, 0.9]],\n                            dtype=np.float32)\n        np_labels = np.array([1, 0, 1, 0], dtype=np.float32)\n\n        np_expect_agnostic_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],\n                                            [2., 2., 3., 4., 4., 2., 3., 1.],\n                                            [7., 7., 8., 8., 9., 7., 8., 6.]],\n                                           dtype=np.float32)\n        np_expect_agnostic_keep_inds = np.array([3, 1, 2], dtype=np.int64)\n\n        np_expect_dets = np.array([[0., 0., 0., 2., 2., 2., 2., 0.],\n                                   [2., 2., 3., 4., 4., 2., 3., 1.],\n                                   [1., 1., 3., 4., 4., 4., 4., 1.],\n                                   [7., 7., 8., 8., 9., 7., 8., 6.]],\n                                  dtype=np.float32)\n        np_expect_keep_inds = np.array([3, 1, 0, 2], dtype=np.int64)\n\n        nms_cfg = dict(type='nms_quadri', iou_threshold=0.3)\n\n        # test class_agnostic is True\n        boxes, keep = batched_nms(\n            torch.from_numpy(np_boxes[:, :8]).to(device),\n            torch.from_numpy(np_boxes[:, -1]).to(device),\n            torch.from_numpy(np_labels).to(device),\n            nms_cfg,\n            class_agnostic=True)\n        assert np.allclose(boxes.cpu().numpy()[:, :8], np_expect_agnostic_dets)\n        assert np.allclose(keep.cpu().numpy(), np_expect_agnostic_keep_inds)\n\n        # test class_agnostic is False\n        boxes, keep = batched_nms(\n            torch.from_numpy(np_boxes[:, :8]).to(device),\n            torch.from_numpy(np_boxes[:, -1]).to(device),\n            torch.from_numpy(np_labels).to(device),\n            nms_cfg,\n            class_agnostic=False)\n        assert np.allclose(boxes.cpu().numpy()[:, :8], np_expect_dets)\n        assert np.allclose(keep.cpu().numpy(), np_expect_keep_inds)\n"
  },
  {
    "path": "tests/test_ops/test_nms_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\nclass TestNmsRotated:\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_ml_nms_rotated(self, device):\n        from mmcv.ops import nms_rotated\n        np_boxes = np.array(\n            [[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],\n             [3.0, 7.0, 10.0, 12.0, 0.3, 0.5], [1.0, 4.0, 13.0, 7.0, 0.6, 0.9]\n             ],\n            dtype=np.float32)\n        np_labels = np.array([1, 0, 1, 0], dtype=np.float32)\n\n        np_expect_dets = np.array(\n            [[1.0, 4.0, 13.0, 7.0, 0.6], [3.0, 6.0, 9.0, 11.0, 0.6],\n             [6.0, 3.0, 8.0, 7.0, 0.5]],\n            dtype=np.float32)\n        np_expect_keep_inds = np.array([3, 1, 0], dtype=np.int64)\n\n        boxes = torch.from_numpy(np_boxes).to(device)\n        labels = torch.from_numpy(np_labels).to(device)\n\n        # test cw angle definition\n        dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5, labels)\n\n        assert np.allclose(dets.cpu().numpy()[:, :5], np_expect_dets)\n        assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)\n\n        # test ccw angle definition\n        
boxes[..., -2] *= -1\n        dets, keep_inds = nms_rotated(\n            boxes[:, :5], boxes[:, -1], 0.5, labels, clockwise=False)\n        dets[..., -2] *= -1\n        assert np.allclose(dets.cpu().numpy()[:, :5], np_expect_dets)\n        assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_nms_rotated(self, device):\n        from mmcv.ops import nms_rotated\n        np_boxes = np.array(\n            [[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],\n             [3.0, 7.0, 10.0, 12.0, 0.3, 0.5], [1.0, 4.0, 13.0, 7.0, 0.6, 0.9]\n             ],\n            dtype=np.float32)\n\n        np_expect_dets = np.array(\n            [[1.0, 4.0, 13.0, 7.0, 0.6], [3.0, 6.0, 9.0, 11.0, 0.6],\n             [6.0, 3.0, 8.0, 7.0, 0.5]],\n            dtype=np.float32)\n        np_expect_keep_inds = np.array([3, 1, 0], dtype=np.int64)\n\n        boxes = torch.from_numpy(np_boxes).to(device)\n\n        # test cw angle definition\n        dets, keep_inds = nms_rotated(boxes[:, :5], boxes[:, -1], 0.5)\n        assert np.allclose(dets.cpu().numpy()[:, :5], np_expect_dets)\n        assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)\n\n        # test ccw angle definition\n        boxes[..., -2] *= -1\n        dets, keep_inds = nms_rotated(\n            boxes[:, :5], 
boxes[:, -1], 0.5, clockwise=False)\n        dets[..., -2] *= -1\n        assert np.allclose(dets.cpu().numpy()[:, :5], np_expect_dets)\n        assert np.allclose(keep_inds.cpu().numpy(), np_expect_keep_inds)\n\n    def test_batched_nms(self):\n        # test batched_nms with nms_rotated\n        from mmcv.ops import batched_nms\n\n        np_boxes = np.array(\n            [[6.0, 3.0, 8.0, 7.0, 0.5, 0.7], [3.0, 6.0, 9.0, 11.0, 0.6, 0.8],\n             [3.0, 7.0, 10.0, 12.0, 0.3, 0.5], [1.0, 4.0, 13.0, 7.0, 0.6, 0.9]\n             ],\n            dtype=np.float32)\n        np_labels = np.array([1, 0, 1, 0], dtype=np.float32)\n\n        np_expect_agnostic_dets = np.array(\n            [[1.0, 4.0, 13.0, 7.0, 0.6], [3.0, 6.0, 9.0, 11.0, 0.6],\n             [6.0, 3.0, 8.0, 7.0, 0.5]],\n            dtype=np.float32)\n        np_expect_agnostic_keep_inds = np.array([3, 1, 0], dtype=np.int64)\n\n        np_expect_dets = np.array(\n            [[1.0, 4.0, 13.0, 7.0, 0.6], [3.0, 6.0, 9.0, 11.0, 0.6],\n             [6.0, 3.0, 8.0, 7.0, 0.5], [3.0, 7.0, 10.0, 12.0, 0.3]],\n            dtype=np.float32)\n        np_expect_keep_inds = np.array([3, 1, 0, 2], dtype=np.int64)\n\n        nms_cfg = dict(type='nms_rotated', iou_threshold=0.5)\n\n        # test class_agnostic is True\n        boxes, keep = batched_nms(\n            torch.from_numpy(np_boxes[:, :5]),\n            torch.from_numpy(np_boxes[:, -1]),\n            torch.from_numpy(np_labels),\n            nms_cfg,\n            class_agnostic=True)\n        assert np.allclose(boxes.cpu().numpy()[:, :5], np_expect_agnostic_dets)\n        assert np.allclose(keep.cpu().numpy(), np_expect_agnostic_keep_inds)\n\n        # test class_agnostic is False\n        boxes, keep = batched_nms(\n            torch.from_numpy(np_boxes[:, :5]),\n            torch.from_numpy(np_boxes[:, -1]),\n            torch.from_numpy(np_labels),\n            nms_cfg,\n            class_agnostic=False)\n        assert np.allclose(boxes.cpu().numpy()[:, 
:5], np_expect_dets)\n        assert np.allclose(keep.cpu().numpy(), np_expect_keep_inds)\n"
  },
  {
    "path": "tests/test_ops/test_onnx.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\n\nimport numpy as np\nimport onnx\nimport pytest\nimport torch\nimport torch.nn as nn\n\nonnx_file = 'tmp.onnx'\nif torch.__version__ == 'parrots':\n    pytest.skip('not supported in parrots now', allow_module_level=True)\n\n\n@pytest.fixture(autouse=True)\ndef run_before_and_after_test():\n    # clear onnx_file before test\n    if os.path.exists(onnx_file):\n        os.remove(onnx_file)\n\n    yield\n\n    # clear onnx_file after test\n    if os.path.exists(onnx_file):\n        os.remove(onnx_file)\n\n\nclass WrapFunction(nn.Module):\n\n    def __init__(self, wrapped_function):\n        super().__init__()\n        self.wrapped_function = wrapped_function\n\n    def forward(self, *args, **kwargs):\n        return self.wrapped_function(*args, **kwargs)\n\n\ndef test_roialign():\n    rt = pytest.importorskip('onnxruntime')\n    try:\n        from mmcv.ops import roi_align\n    except (ImportError, ModuleNotFoundError):\n        pytest.skip('roi_align op is not successfully compiled')\n\n    # roi align config\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n    sampling_ratio = 2\n\n    inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),\n              ([[[[1., 2.], [3., 4.]], [[4., 3.],\n                                        [2., 1.]]]], [[0., 0., 0., 1., 1.]]),\n              ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],\n                  [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]\n\n    def warpped_function(torch_input, torch_rois):\n        return roi_align(torch_input, torch_rois, (pool_w, pool_h),\n                         spatial_scale, sampling_ratio, 'avg', True)\n\n    for case in inputs:\n        np_input = np.array(case[0], dtype=np.float32)\n        np_rois = np.array(case[1], dtype=np.float32)\n        input = torch.from_numpy(np_input)\n        rois = torch.from_numpy(np_rois)\n\n        # compute pytorch_output\n        with 
torch.no_grad():\n            pytorch_output = roi_align(input, rois, (pool_w, pool_h),\n                                       spatial_scale, sampling_ratio, 'avg',\n                                       True)\n\n        # export and load onnx model\n        wrapped_model = WrapFunction(warpped_function)\n        with torch.no_grad():\n            torch.onnx.export(\n                wrapped_model, (input, rois),\n                onnx_file,\n                export_params=True,\n                keep_initializers_as_inputs=True,\n                input_names=['input', 'rois'],\n                opset_version=11)\n\n        onnx_model = onnx.load(onnx_file)\n        session_options = rt.SessionOptions()\n\n        # compute onnx_output\n        input_all = [node.name for node in onnx_model.graph.input]\n        input_initializer = [\n            node.name for node in onnx_model.graph.initializer\n        ]\n        net_feed_input = list(set(input_all) - set(input_initializer))\n        assert (len(net_feed_input) == 2)\n        sess = rt.InferenceSession(\n            onnx_file, session_options, providers=['CPUExecutionProvider'])\n        onnx_output = sess.run(None, {\n            'input': input.detach().numpy(),\n            'rois': rois.detach().numpy()\n        })\n        onnx_output = onnx_output[0]\n\n        # allclose\n\n        assert np.allclose(pytorch_output, onnx_output, atol=1e-3)\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_roipool():\n    rt = pytest.importorskip('onnxruntime')\n    from mmcv.ops import roi_pool\n\n    # roi pool config\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n\n    inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),\n              ([[[[1., 2.], [3., 4.]], [[4., 3.],\n                                        [2., 1.]]]], [[0., 0., 0., 1., 1.]]),\n              ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],\n                  [11., 12., 15., 
16.]]]], [[0., 0., 0., 3., 3.]])]\n\n    def warpped_function(torch_input, torch_rois):\n        return roi_pool(torch_input, torch_rois, (pool_w, pool_h),\n                        spatial_scale)\n\n    for case in inputs:\n        np_input = np.array(case[0], dtype=np.float32)\n        np_rois = np.array(case[1], dtype=np.float32)\n        input = torch.from_numpy(np_input).cuda()\n        rois = torch.from_numpy(np_rois).cuda()\n\n        # compute pytorch_output\n        with torch.no_grad():\n            pytorch_output = roi_pool(input, rois, (pool_w, pool_h),\n                                      spatial_scale)\n            pytorch_output = pytorch_output.cpu()\n\n        # export and load onnx model\n        wrapped_model = WrapFunction(warpped_function)\n        with torch.no_grad():\n            torch.onnx.export(\n                wrapped_model, (input, rois),\n                onnx_file,\n                export_params=True,\n                keep_initializers_as_inputs=True,\n                input_names=['input', 'rois'],\n                opset_version=11)\n        onnx_model = onnx.load(onnx_file)\n\n        # compute onnx_output\n        input_all = [node.name for node in onnx_model.graph.input]\n        input_initializer = [\n            node.name for node in onnx_model.graph.initializer\n        ]\n        net_feed_input = list(set(input_all) - set(input_initializer))\n        assert (len(net_feed_input) == 2)\n        sess = rt.InferenceSession(\n            onnx_file, providers=['CPUExecutionProvider'])\n        onnx_output = sess.run(\n            None, {\n                'input': input.detach().cpu().numpy(),\n                'rois': rois.detach().cpu().numpy()\n            })\n        onnx_output = onnx_output[0]\n\n        # allclose\n        assert np.allclose(pytorch_output, onnx_output, atol=1e-3)\n\n\ndef _test_symbolic(model, inputs, symbol_name):\n    with torch.no_grad():\n        torch.onnx.export(model, inputs, onnx_file, 
opset_version=11)\n\n    import onnx\n    model = onnx.load(onnx_file)\n    nodes = model.graph.node\n\n    symbol_exist = False\n    for n in nodes:\n        if n.op_type == symbol_name:\n            symbol_exist = True\n    assert symbol_exist\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_border_align():\n    from mmcv.ops import BorderAlign\n    model = BorderAlign(2)\n    input = torch.rand(1, 8, 2, 2).cuda()\n    boxes = torch.rand(1, 4, 4).cuda()\n    _test_symbolic(model, (input, boxes), 'MMCVBorderAlign')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_carafe():\n    from mmcv.ops import CARAFENaive\n    feat = torch.randn(2, 64, 3, 3, device='cuda').double()\n    mask = torch.randn(2, 100, 6, 6, device='cuda').sigmoid().double()\n    _test_symbolic(CARAFENaive(5, 4, 2), (feat, mask), 'MMCVCARAFENaive')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_deform_conv():\n    from mmcv.ops import DeformConv2dPack\n    x = torch.randn(1, 2, 4, 4, device='cuda')\n    _test_symbolic(\n        DeformConv2dPack(2, 4, 3, 1, 1).cuda(), (x, ), 'MMCVDeformConv2d')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_modulated_deform_conv():\n    from mmcv.ops import ModulatedDeformConv2dPack\n    x = torch.randn(1, 2, 4, 4, device='cuda')\n    _test_symbolic(\n        ModulatedDeformConv2dPack(2, 4, 3, 1, 1).cuda(), x,\n        'MMCVModulatedDeformConv2d')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_deform_roi_pool():\n    from mmcv.ops import DeformRoIPoolPack\n    x = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')\n    rois = torch.tensor([[0., 0., 0., 1., 1.]], device='cuda')\n    output_c = x.size(1)\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n    sampling_ratio = 2\n    model = DeformRoIPoolPack((pool_h, pool_w),\n      
                        output_c,\n                              spatial_scale=spatial_scale,\n                              sampling_ratio=sampling_ratio).cuda()\n\n    _test_symbolic(model, (x, rois), 'MMCVDeformRoIPool')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_masked_conv():\n    from mmcv.ops import MaskedConv2d\n    x = torch.rand(1, 2, 4, 4, device='cuda')\n    mask = torch.rand(1, 4, 4, device='cuda')\n    _test_symbolic(\n        MaskedConv2d(2, 4, 3, 1, 1).cuda(), (x, mask), 'MMCVMaskedConv2d')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_pr_roi_pool():\n    from mmcv.ops import PrRoIPool\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n    x = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')\n    rois = torch.tensor([[0., 0., 0., 1., 1.]], device='cuda')\n    model = PrRoIPool((pool_h, pool_w), spatial_scale).cuda()\n    _test_symbolic(model, (x, rois), 'PrRoIPool')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_psa_mask():\n    from mmcv.ops import PSAMask\n    input = torch.rand(4, 16, 8, 8).cuda()\n    model = PSAMask('collect', (4, 4)).cuda()\n    _test_symbolic(model, input, 'MMCVPSAMask')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_roi_align_rotated():\n    from mmcv.ops import RoIAlignRotated\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n    sampling_ratio = 2\n    x = torch.tensor([[[[1., 2.], [3., 4.]]]], device='cuda')\n    rois = torch.tensor([[0., 0.5, 0.5, 1., 1., 0]], device='cuda')\n    model = RoIAlignRotated((pool_h, pool_w), spatial_scale,\n                            sampling_ratio).cuda()\n    _test_symbolic(model, (x, rois), 'MMCVRoIAlignRotated')\n\n\n@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')\ndef test_roi_feaeture_align():\n    from mmcv.ops import rotated_feature_align\n    
wrapped_model = WrapFunction(rotated_feature_align)\n    feature = torch.rand(1, 1, 2, 2, device='cuda')\n    bbox = torch.rand(1, 2, 2, 5, device='cuda')\n    _test_symbolic(wrapped_model, (feature, bbox), 'MMCVRotatedFeatureAlign')\n"
  },
  {
    "path": "tests/test_ops/test_pixel_group.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport torch\n\n\ndef test_pixel_group():\n    from mmcv.ops import pixel_group\n    np_score = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                         [0, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0],\n                         [0, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0],\n                         [0, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0],\n                         [0, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0],\n                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]).astype(np.float32)\n    np_mask = (np_score > 0.5)\n    np_embedding = np.zeros((10, 10, 8)).astype(np.float32)\n    np_embedding[:, :7] = 0.9\n    np_embedding[:, 7:] = 10.0\n    np_kernel_label = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 1, 1, 1, 0, 0, 0, 2, 0],\n                                [0, 0, 1, 1, 1, 0, 0, 0, 2, 0],\n                                [0, 0, 1, 1, 1, 0, 0, 0, 2, 0],\n                                [0, 0, 1, 1, 1, 0, 0, 0, 2, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                [0, 0, 0, 0, 0, 0, 0, 0, 0,\n                                 0]]).astype(np.int32)\n    np_kernel_contour = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                  [0, 0, 1, 1, 1, 0, 0, 0, 1, 0],\n                                  [0, 0, 1, 
0, 1, 0, 0, 0, 1, 0],\n                                  [0, 0, 1, 0, 1, 0, 0, 0, 1, 0],\n                                  [0, 0, 1, 1, 1, 0, 0, 0, 1, 0],\n                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n                                  [0, 0, 0, 0, 0, 0, 0, 0, 0,\n                                   0]]).astype(np.uint8)\n    kernel_region_num = 3\n    distance_threshold = float(0.8)\n    result = pixel_group(np_score, np_mask, np_embedding, np_kernel_label,\n                         np_kernel_contour, kernel_region_num,\n                         distance_threshold)\n    gt_1 = [\n        0.8999997973442078, 24.0, 1.0, 3.0, 2.0, 3.0, 3.0, 3.0, 4.0, 3.0, 5.0,\n        3.0, 6.0, 3.0, 1.0, 4.0, 2.0, 4.0, 3.0, 4.0, 4.0, 4.0, 5.0, 4.0, 6.0,\n        4.0, 1.0, 5.0, 2.0, 5.0, 3.0, 5.0, 4.0, 5.0, 5.0, 5.0, 6.0, 5.0, 1.0,\n        6.0, 2.0, 6.0, 3.0, 6.0, 4.0, 6.0, 5.0, 6.0, 6.0, 6.0\n    ]\n\n    gt_2 = [\n        0.9000000357627869, 8.0, 7.0, 3.0, 8.0, 3.0, 7.0, 4.0, 8.0, 4.0, 7.0,\n        5.0, 8.0, 5.0, 7.0, 6.0, 8.0, 6.0\n    ]\n\n    assert np.allclose(result[0], [0, 0])\n    assert np.allclose(result[1], gt_1)\n    assert np.allclose(result[2], gt_2)\n\n    # test torch Tensor\n    np_score_t = torch.from_numpy(np_score)\n    np_mask_t = torch.from_numpy(np_mask)\n    np_embedding_t = torch.from_numpy(np_embedding)\n    np_kernel_label_t = torch.from_numpy(np_kernel_label)\n    np_kernel_contour_t = torch.from_numpy(np_kernel_contour)\n\n    result = pixel_group(np_score_t, np_mask_t, np_embedding_t,\n                         np_kernel_label_t, np_kernel_contour_t,\n                         kernel_region_num, distance_threshold)\n\n    assert np.allclose(result[0], [0, 0])\n    assert np.allclose(result[1], gt_1)\n    assert np.allclose(result[2], gt_2)\n"
  },
  {
    "path": "tests/test_ops/test_points_in_polygons.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import points_in_polygons\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_points_in_polygons(device):\n    points = np.array([[300., 300.], [400., 400.], [100., 100], [300, 250],\n                       [100, 0]])\n    polygons = np.array([[200., 200., 400., 400., 500., 200., 400., 100.],\n                         [400., 400., 500., 500., 600., 300., 500., 200.],\n                         [300., 300., 600., 700., 700., 700., 700., 100.]])\n    expected_output = np.array([[0., 0., 0.], [0., 0., 1.], [0., 0., 0.],\n                                [1., 0., 0.], [0., 0., 0.]]).astype(np.float32)\n    points = torch.tensor(points, dtype=torch.float32, device=device)\n    polygons = torch.tensor(polygons, dtype=torch.float32, device=device)\n    assert np.allclose(\n        points_in_polygons(points, polygons).cpu().numpy(), expected_output,\n        1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_prroi_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n\n    _USING_PARROTS = False\n\ninputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,\n                                               1.]]]], [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],\n              [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]\noutputs = [\n    ([[[[1.75, 2.25], [2.75, 3.25]]]], [[[[1., 1.],\n                                          [1., 1.]]]], [[0., 2., 4., 2., 4.]]),\n    ([[[[1.75, 2.25], [2.75, 3.25]],\n       [[3.25, 2.75], [2.25, 1.75]]]], [[[[1., 1.], [1., 1.]],\n                                         [[1., 1.],\n                                          [1., 1.]]]], [[0., 0., 0., 0., 0.]]),\n    ([[[[3.75, 6.91666651],\n        [10.08333302,\n         13.25]]]], [[[[0.11111111, 0.22222224, 0.22222222, 0.11111111],\n                       [0.22222224, 0.444444448, 0.44444448, 0.22222224],\n                       [0.22222224, 0.44444448, 0.44444448, 0.22222224],\n                       [0.11111111, 0.22222224, 0.22222224, 0.11111111]]]],\n     [[0.0, 3.33333302, 6.66666603, 3.33333349, 6.66666698]])\n]\n\n\nclass TestPrRoiPool:\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_roipool_gradcheck(self, device):\n        from mmcv.ops import PrRoIPool\n        pool_h = 2\n        pool_w = 2\n 
       spatial_scale = 1.0\n\n        for case in inputs:\n            np_input = np.array(case[0], dtype=np.float32)\n            np_rois = np.array(case[1], dtype=np.float32)\n\n            x = torch.tensor(np_input, device=device, requires_grad=True)\n            rois = torch.tensor(np_rois, device=device)\n\n            froipool = PrRoIPool((pool_h, pool_w), spatial_scale)\n\n            if _USING_PARROTS:\n                gradcheck(froipool, (x, rois), no_grads=[rois])\n            else:\n                gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)\n\n    def _test_roipool_allclose(self, device, dtype=torch.float):\n        from mmcv.ops import prroi_pool\n        pool_h = 2\n        pool_w = 2\n        spatial_scale = 1.0\n\n        for case, output in zip(inputs, outputs):\n            np_input = np.array(case[0], dtype=np.float32)\n            np_rois = np.array(case[1], dtype=np.float32)\n            np_output = np.array(output[0], dtype=np.float32)\n            np_input_grad = np.array(output[1], dtype=np.float32)\n            np_rois_grad = np.array(output[2], dtype=np.float32)\n\n            x = torch.tensor(\n                np_input, dtype=dtype, device=device, requires_grad=True)\n            rois = torch.tensor(\n                np_rois, dtype=dtype, device=device, requires_grad=True)\n\n            output = prroi_pool(x, rois, (pool_h, pool_w), spatial_scale)\n            output.backward(torch.ones_like(output))\n            assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)\n            assert np.allclose(x.grad.data.cpu().numpy(), np_input_grad, 1e-3)\n            assert np.allclose(rois.grad.data.cpu().numpy(), np_rois_grad,\n                               1e-3)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'musa',\n            
marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_roipool_allclose_float(self, device):\n        self._test_roipool_allclose(device, dtype=torch.float)\n"
  },
  {
    "path": "tests/test_ops/test_psa_mask.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\nclass Loss(nn.Module):\n\n    def __init__(self):\n        super().__init__()\n\n    def forward(self, input, target):\n        input = input.view(-1)\n        target = target.view(-1)\n        return torch.mean(input - target)\n\n\nclass TestPSAMask:\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_psa_mask_collect(self, device):\n        from mmcv.ops import PSAMask\n        test_loss = Loss()\n\n        input = np.fromfile(\n            'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)\n        output_collect = np.fromfile(\n            'tests/data/for_psa_mask/psa_output_collect.bin', dtype=np.float32)\n\n        input = input.reshape((4, 16, 8, 8))\n        output_collect = output_collect.reshape((4, 64, 8, 8))\n        label = torch.ones((4, 64, 8, 8))\n\n        input = torch.FloatTensor(input)\n        input.requires_grad = True\n\n        psamask_collect = PSAMask('collect', (4, 4))\n\n        # test collect cpu\n        test_output = psamask_collect(input)\n        loss = test_loss(test_output, label)\n        loss.backward()\n        test_output = 
test_output.detach().numpy()\n        assert np.allclose(test_output, output_collect)\n        assert test_output.shape == output_collect.shape\n\n        psamask_collect.to(device)\n        input = input.to(device)\n        label = label.to(device)\n\n        # test collect on device\n        test_output = psamask_collect(input)\n        loss = test_loss(test_output, label)\n        loss.backward()\n        test_output = test_output.detach().cpu().numpy()\n        assert np.allclose(test_output, output_collect)\n        assert test_output.shape == output_collect.shape\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def test_psa_mask_distribute(self, device):\n        from mmcv.ops import PSAMask\n        test_loss = Loss()\n\n        input = np.fromfile(\n            'tests/data/for_psa_mask/psa_input.bin', dtype=np.float32)\n        output_distribute = np.fromfile(\n            'tests/data/for_psa_mask/psa_output_distribute.bin',\n            dtype=np.float32)\n\n        input = input.reshape((4, 16, 8, 8))\n        output_distribute = output_distribute.reshape((4, 64, 8, 8))\n        label = torch.ones((4, 64, 8, 8))\n\n        input = torch.FloatTensor(input)\n        input.requires_grad = True\n\n        psamask_distribute = PSAMask('distribute', (4, 4))\n\n        # test distribute cpu\n        test_output = psamask_distribute(input)\n       
 loss = test_loss(test_output, label)\n        loss.backward()\n        test_output = test_output.detach().numpy()\n        assert np.allclose(test_output, output_distribute)\n        assert test_output.shape == output_distribute.shape\n\n        psamask_distribute.to(device)\n        input = input.to(device)\n        label = label.to(device)\n\n        # test distribute on device\n        test_output = psamask_distribute(input)\n        loss = test_loss(test_output, label)\n        loss.backward()\n        test_output = test_output.detach().cpu().numpy()\n        assert np.allclose(test_output, output_distribute)\n        assert test_output.shape == output_distribute.shape\n"
  },
  {
    "path": "tests/test_ops/test_riroi_align_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import RiRoIAlignRotated\n\nif torch.__version__ == 'parrots':\n    from parrots.autograd import gradcheck\n    _USING_PARROTS = True\nelse:\n    from torch.autograd import gradcheck\n    _USING_PARROTS = False\n\nnp_feature = np.array([[[[1, 2], [3, 4]], [[1, 2], [4, 3]], [[4, 3], [2, 1]],\n                        [[1, 2], [5, 6]], [[3, 4], [7, 8]], [[9, 10], [13,\n                                                                       14]],\n                        [[11, 12], [15, 16]], [[1, 1], [2, 2]]]])\nnp_rois = np.array([[0., 0.5, 0.5, 1., 1., np.pi / 3],\n                    [0., 1., 1., 3., 3., np.pi / 2]])\nexpect_output = np.array([[[[1.8425, 1.3516], [2.3151, 1.8241]],\n                           [[2.4779, 1.7416], [3.2173, 2.5632]],\n                           [[2.7149, 2.2638], [2.6540, 2.3673]],\n                           [[2.9461, 2.8638], [2.8028, 2.7205]],\n                           [[4.1943, 2.7214], [5.6119, 4.1391]],\n                           [[7.5276, 6.0547], [8.9453, 7.4724]],\n                           [[12.1943, 10.7214], [13.6119, 12.1391]],\n                           [[9.5489, 8.4237], [10.5763, 9.4511]]],\n                          [[[7.6562, 12.5625], [4.0000, 6.6250]],\n                           [[1.0000, 1.3125], [0.5000, 0.6562]],\n                           [[1.6562, 1.9375], [1.0000, 1.3125]],\n                           [[1.8438, 2.0547], [0.7500, 1.1562]],\n                           [[0.8438, 3.0625], [0.2500, 1.1875]],\n                           [[2.6562, 2.5625], [1.5000, 1.6250]],\n                           [[3.6562, 4.5625], [2.0000, 2.6250]],\n                           [[6.6562, 10.5625], [3.5000, 5.6250]]]])\n\nexpect_grad = np.array([[[[1.4727, 1.5586], [1.5586, 1.6602]],\n                         [[1.4727, 1.5586], [1.5586, 1.6602]],\n                         [[1.4727, 1.5586], 
[1.5586, 1.6602]],\n                         [[1.4727, 1.5586], [1.5586, 1.6602]],\n                         [[1.4727, 1.5586], [1.5586, 1.6602]],\n                         [[1.4727, 1.5586], [1.5586, 1.6602]],\n                         [[1.4727, 1.5586], [1.5586, 1.6602]],\n                         [[1.4727, 1.5586], [1.5586, 1.6602]]]])\n\npool_h = 2\npool_w = 2\nspatial_scale = 1.0\nnum_samples = 2\nsampling_ratio = 2\nnum_orientations = 8\nclockwise = False\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(), reason='requires CUDA support')\ndef test_roialign_rotated_gradcheck():\n    x = torch.tensor(\n        np_feature, dtype=torch.float, device='cuda', requires_grad=True)\n    rois = torch.tensor(np_rois, dtype=torch.float, device='cuda')\n    froipool = RiRoIAlignRotated((pool_h, pool_w), spatial_scale, num_samples,\n                                 num_orientations, clockwise)\n    if _USING_PARROTS:\n        gradcheck(\n            froipool, (x, rois), no_grads=[rois], delta=1e-3, pt_atol=1e-3)\n    else:\n        gradcheck(froipool, (x, rois), eps=1e-3, atol=1e-3)\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(), reason='requires CUDA support')\ndef test_roialign_rotated_allclose():\n    x = torch.tensor(\n        np_feature, dtype=torch.float, device='cuda', requires_grad=True)\n    rois = torch.tensor(np_rois, dtype=torch.float, device='cuda')\n    froipool = RiRoIAlignRotated((pool_h, pool_w), spatial_scale, num_samples,\n                                 num_orientations, clockwise)\n    output = froipool(x, rois)\n    output.backward(torch.ones_like(output))\n    assert np.allclose(\n        output.data.type(torch.float).cpu().numpy(), expect_output, atol=1e-3)\n    assert np.allclose(\n        x.grad.data.type(torch.float).cpu().numpy(), expect_grad, atol=1e-3)\n"
  },
  {
    "path": "tests/test_ops/test_roi_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n    _USING_PARROTS = False\n\n# yapf:disable\n\ninputs = [([[[[1., 2.], [3., 4.]]]],\n           [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2.], [3., 4.]],\n             [[4., 3.], [2., 1.]]]],\n           [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],\n              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],\n           [[0., 0., 0., 3., 3.]])]\noutputs = [([[[[1.0, 1.25], [1.5, 1.75]]]],\n            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),\n           ([[[[1.0, 1.25], [1.5, 1.75]],\n              [[4.0, 3.75], [3.5, 3.25]]]],\n            [[[[3.0625, 0.4375], [0.4375, 0.0625]],\n              [[3.0625, 0.4375], [0.4375, 0.0625]]]]),\n           ([[[[1.9375, 4.75], [7.5625, 10.375]]]],\n            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],\n               [0.42968750, 0.39062500, 0.39062500, 0.03906250],\n               [0.42968750, 0.39062500, 0.39062500, 0.03906250],\n               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]])]\n# yapf:enable\n\npool_h = 2\npool_w = 2\nspatial_scale = 1.0\nsampling_ratio = 2\n\n\ndef _test_roialign_gradcheck(device, dtype):\n    try:\n        from mmcv.ops import RoIAlign\n    except ModuleNotFoundError:\n        pytest.skip('RoIAlign op is not successfully compiled')\n    if dtype is torch.half:\n        pytest.skip('grad check does not support fp16')\n    for case in inputs:\n        np_input = np.array(case[0])\n        np_rois = np.array(case[1])\n\n        x = torch.tensor(\n            np_input, dtype=dtype, device=device, requires_grad=True)\n        rois = torch.tensor(np_rois, dtype=dtype, 
device=device)\n\n        froipool = RoIAlign((pool_h, pool_w), spatial_scale, sampling_ratio)\n\n        if torch.__version__ == 'parrots':\n            gradcheck(\n                froipool, (x, rois), no_grads=[rois], delta=1e-5, pt_atol=1e-5)\n        else:\n            gradcheck(froipool, (x, rois), eps=1e-5, atol=1e-5)\n\n\ndef _test_roialign_allclose(device, dtype):\n    try:\n        from mmcv.ops import roi_align\n    except ModuleNotFoundError:\n        pytest.skip('test requires compilation')\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n    sampling_ratio = 2\n    for case, output in zip(inputs, outputs):\n        np_input = np.array(case[0])\n        np_rois = np.array(case[1])\n        np_output = np.array(output[0])\n        np_grad = np.array(output[1])\n\n        x = torch.tensor(\n            np_input, dtype=dtype, device=device, requires_grad=True)\n        rois = torch.tensor(np_rois, dtype=dtype, device=device)\n\n        output = roi_align(x, rois, (pool_h, pool_w), spatial_scale,\n                           sampling_ratio, 'avg', True)\n        output.backward(torch.ones_like(output))\n        assert np.allclose(\n            output.data.type(torch.float).cpu().numpy(), np_output, atol=1e-3)\n        assert np.allclose(\n            x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)\n\n\n@pytest.mark.parametrize('dtype', [torch.float, torch.half])\n@pytest.mark.parametrize('device', [\n    'cpu',\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, 
reason='requires MUSA support')),\n])\ndef test_roialign_float(device, dtype):\n    _test_roialign_allclose(device=device, dtype=dtype)\n\n\n@pytest.mark.parametrize('device', [\n    'cpu',\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n])\ndef test_roialign_float64(device):\n    _test_roialign_allclose(device=device, dtype=torch.double)\n    _test_roialign_gradcheck(device=device, dtype=torch.double)\n"
  },
  {
    "path": "tests/test_ops/test_roi_align_rotated.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n    _USING_PARROTS = False\n# yapf:disable\ninputs = [([[[[1., 2.], [3., 4.]]]],\n           [[0., 0.5, 0.5, 1., 1., 0]]),\n          ([[[[1., 2.], [3., 4.]]]],\n           [[0., 0.5, 0.5, 1., 1., np.pi / 2]]),\n          ([[[[1., 2.], [3., 4.]],\n             [[4., 3.], [2., 1.]]]],\n           [[0., 0.5, 0.5, 1., 1., 0]]),\n          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],\n              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],\n           [[0., 1.5, 1.5, 3., 3., 0]]),\n          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],\n              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],\n           [[0., 1.5, 1.5, 3., 3., np.pi / 2]])]\noutputs = [([[[[1.0, 1.25], [1.5, 1.75]]]],\n            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),\n           ([[[[1.5, 1], [1.75, 1.25]]]],\n            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),\n           ([[[[1.0, 1.25], [1.5, 1.75]],\n              [[4.0, 3.75], [3.5, 3.25]]]],\n            [[[[3.0625, 0.4375], [0.4375, 0.0625]],\n              [[3.0625, 0.4375], [0.4375, 0.0625]]]]),\n           ([[[[1.9375, 4.75], [7.5625, 10.375]]]],\n            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],\n               [0.42968750, 0.39062500, 0.39062500, 0.03906250],\n               [0.42968750, 0.39062500, 0.39062500, 0.03906250],\n               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]]),\n           ([[[[7.5625, 1.9375], [10.375, 4.75]]]],\n            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],\n               [0.42968750, 0.39062500, 0.39062500, 0.03906250],\n               [0.42968750, 0.39062500, 0.39062500, 0.03906250],\n               [0.04296875, 0.03906250, 0.03906250, 
0.00390625]]]])]\n# yapf:enable\n\npool_h = 2\npool_w = 2\nspatial_scale = 1.0\nsampling_ratio = 2\n\n\ndef _test_roialign_rotated_gradcheck(device, dtype):\n    try:\n        from mmcv.ops import RoIAlignRotated\n    except ModuleNotFoundError:\n        pytest.skip('RoIAlignRotated op is not successfully compiled')\n    if dtype is torch.half:\n        pytest.skip('grad check does not support fp16')\n    for case in inputs:\n        np_input = np.array(case[0])\n        np_rois = np.array(case[1])\n\n        x = torch.tensor(\n            np_input, dtype=dtype, device=device, requires_grad=True)\n        rois = torch.tensor(np_rois, dtype=dtype, device=device)\n\n        froipool = RoIAlignRotated((pool_h, pool_w), spatial_scale,\n                                   sampling_ratio)\n        if torch.__version__ == 'parrots':\n            gradcheck(\n                froipool, (x, rois), no_grads=[rois], delta=1e-5, pt_atol=1e-5)\n        else:\n            gradcheck(froipool, (x, rois), eps=1e-5, atol=1e-5)\n\n\ndef _test_roialign_rotated_allclose(device, dtype):\n    try:\n        from mmcv.ops import RoIAlignRotated, roi_align_rotated\n    except ModuleNotFoundError:\n        pytest.skip('test requires compilation')\n    pool_h = 2\n    pool_w = 2\n    spatial_scale = 1.0\n    sampling_ratio = 2\n\n    for case, output in zip(inputs, outputs):\n        np_input = np.array(case[0])\n        np_rois = np.array(case[1])\n        np_output = np.array(output[0])\n        np_grad = np.array(output[1])\n\n        x = torch.tensor(\n            np_input, dtype=dtype, device=device, requires_grad=True)\n        rois = torch.tensor(np_rois, dtype=dtype, device=device)\n\n        output = roi_align_rotated(x, rois, (pool_h, pool_w), spatial_scale,\n                                   sampling_ratio, True)\n        output.backward(torch.ones_like(output))\n        assert np.allclose(\n            output.data.type(torch.float).cpu().numpy(), np_output, atol=1e-3)\n        
assert np.allclose(\n            x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)\n\n    # Test deprecated parameters\n    roi_align_rotated_module_deprecated = RoIAlignRotated(\n        out_size=(pool_h, pool_w),\n        spatial_scale=spatial_scale,\n        sample_num=sampling_ratio)\n\n    output_1 = roi_align_rotated_module_deprecated(x, rois)\n\n    roi_align_rotated_module_new = RoIAlignRotated(\n        output_size=(pool_h, pool_w),\n        spatial_scale=spatial_scale,\n        sampling_ratio=sampling_ratio)\n\n    output_2 = roi_align_rotated_module_new(x, rois)\n\n    assert np.allclose(\n        output_1.data.type(torch.float).cpu().numpy(),\n        output_2.data.type(torch.float).cpu().numpy())\n\n\n@pytest.mark.parametrize('device', [\n    'cpu',\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\n@pytest.mark.parametrize('dtype', [\n    torch.float,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,\n            reason='MLU, MUSA does not support for 64-bit floating point')),\n    torch.half\n])\ndef test_roialign_rotated(device, dtype):\n    # check double only\n    if dtype is torch.double:\n        _test_roialign_rotated_gradcheck(device=device, dtype=dtype)\n    _test_roialign_rotated_allclose(device=device, dtype=dtype)\n"
  },
  {
    "path": "tests/test_ops/test_roi_pool.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\n\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n\n    _USING_PARROTS = False\n\ncur_dir = os.path.dirname(os.path.abspath(__file__))\n\ninputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2.], [3., 4.]], [[4., 3.], [2.,\n                                               1.]]]], [[0., 0., 0., 1., 1.]]),\n          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],\n              [11., 12., 15., 16.]]]], [[0., 0., 0., 3., 3.]])]\noutputs = [([[[[1., 2.], [3., 4.]]]], [[[[1., 1.], [1., 1.]]]]),\n           ([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]], [[[[1., 1.],\n                                                               [1., 1.]],\n                                                              [[1., 1.],\n                                                               [1., 1.]]]]),\n           ([[[[4., 8.], [12., 16.]]]], [[[[0., 0., 0., 0.], [0., 1., 0., 1.],\n                                           [0., 0., 0., 0.], [0., 1., 0.,\n                                                              1.]]]])]\n\n\nclass TestRoiPool:\n\n    def test_roipool_gradcheck(self):\n        if not torch.cuda.is_available():\n            return\n        from mmcv.ops import RoIPool\n        pool_h = 2\n        pool_w = 2\n        spatial_scale = 1.0\n\n        for case in inputs:\n            np_input = np.array(case[0])\n            np_rois = np.array(case[1])\n\n            x = torch.tensor(np_input, device='cuda', requires_grad=True)\n            rois = torch.tensor(np_rois, device='cuda')\n\n            froipool = RoIPool((pool_h, pool_w), spatial_scale)\n\n            if _USING_PARROTS:\n          
      pass\n                # gradcheck(froipool, (x, rois), no_grads=[rois])\n            else:\n                gradcheck(froipool, (x, rois), eps=1e-2, atol=1e-2)\n\n    def _test_roipool_allclose(self, device, dtype=torch.float):\n        from mmcv.ops import roi_pool\n        pool_h = 2\n        pool_w = 2\n        spatial_scale = 1.0\n\n        for case, output in zip(inputs, outputs):\n            np_input = np.array(case[0])\n            np_rois = np.array(case[1])\n            np_output = np.array(output[0])\n            np_grad = np.array(output[1])\n\n            x = torch.tensor(\n                np_input, dtype=dtype, device=device, requires_grad=True)\n            rois = torch.tensor(np_rois, dtype=dtype, device=device)\n            output = roi_pool(x, rois, (pool_h, pool_w), spatial_scale)\n            output.backward(torch.ones_like(output))\n            assert np.allclose(output.data.cpu().numpy(), np_output, 1e-3)\n            assert np.allclose(x.grad.data.cpu().numpy(), np_grad, 1e-3)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support')),\n        pytest.param(\n            'npu',\n            marks=pytest.mark.skipif(\n                not IS_NPU_AVAILABLE, reason='requires NPU support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n    ])\n    @pytest.mark.parametrize('dtype', [\n        torch.float,\n        pytest.param(\n            torch.double,\n            marks=pytest.mark.skipif(\n                IS_MLU_AVAILABLE or IS_NPU_AVAILABLE or IS_MUSA_AVAILABLE,\n                reason='MLU, NPU, MUSA '\n                'does not support 
for 64-bit floating point')), torch.half\n    ])\n    def test_roipool_allclose(self, device, dtype):\n        self._test_roipool_allclose(device, dtype)\n"
  },
  {
    "path": "tests/test_ops/test_roiaware_pool3d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import (RoIAwarePool3d, points_in_boxes_all, points_in_boxes_cpu,\n                      points_in_boxes_part)\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\n@pytest.mark.parametrize('dtype', [\n    torch.float, torch.half,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,\n            reason='MLU, MUSA does not support for double'))\n])\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_RoIAwarePool3d(device, dtype):\n    roiaware_pool3d_max = RoIAwarePool3d(\n        out_size=4, max_pts_per_voxel=128, mode='max')\n    roiaware_pool3d_avg = RoIAwarePool3d(\n        out_size=4, max_pts_per_voxel=128, mode='avg')\n    rois = torch.tensor(\n        [[1.0, 2.0, 3.0, 5.0, 4.0, 6.0, -0.3 - np.pi / 2],\n         [-10.0, 23.0, 16.0, 20.0, 10.0, 20.0, -0.5 - np.pi / 2]],\n        dtype=dtype).to(device)\n    # boxes (m, 7) with bottom center in lidar coordinate\n    pts = torch.tensor(\n        [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],\n         [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],\n         [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],\n         [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],\n        dtype=dtype).to(device)  # points (n, 3) in lidar coordinate\n    pts_feature = pts.clone()\n\n    
pooled_features_max = roiaware_pool3d_max(\n        rois=rois, pts=pts, pts_feature=pts_feature)\n    assert pooled_features_max.shape == torch.Size([2, 4, 4, 4, 3])\n    assert torch.allclose(pooled_features_max.sum(),\n                          torch.tensor(51.100, dtype=dtype).to(device), 1e-3)\n\n    pooled_features_avg = roiaware_pool3d_avg(\n        rois=rois, pts=pts, pts_feature=pts_feature)\n    assert pooled_features_avg.shape == torch.Size([2, 4, 4, 4, 3])\n    assert torch.allclose(pooled_features_avg.sum(),\n                          torch.tensor(49.750, dtype=dtype).to(device), 1e-3)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_points_in_boxes_part(device):\n    boxes = torch.tensor(\n        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3]],\n         [[-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],\n        dtype=torch.float32).to(\n            device)  # boxes (b, t, 7) with bottom center in lidar coordinate\n    pts = torch.tensor(\n        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],\n          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],\n          [4.7, 3.5, -12.2]],\n         [[3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9], [-21.3, -52, -5],\n          [0, 0, 0], [6, 7, 8], [-2, -3, -4], [6, 4, 9]]],\n        dtype=torch.float32).to(device)  # points (b, m, 3) in lidar coordinate\n\n    point_indices = points_in_boxes_part(points=pts, boxes=boxes)\n    expected_point_indices = torch.tensor(\n        [[0, 0, 0, 0, 0, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1]],\n        dtype=torch.int32).to(device)\n    
assert point_indices.shape == torch.Size([2, 8])\n    assert (point_indices == expected_point_indices).all()\n\n    boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],\n                         dtype=torch.float32).to(device)  # 30 degrees\n    pts = torch.tensor(\n        [[[4, 6.928, 0], [6.928, 4, 0], [4, -6.928, 0], [6.928, -4, 0],\n          [-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],\n        dtype=torch.float32).to(device)\n    point_indices = points_in_boxes_part(points=pts, boxes=boxes)\n    expected_point_indices = torch.tensor([[-1, -1, 0, -1, 0, -1, -1, -1]],\n                                          dtype=torch.int32).to(device)\n    assert (point_indices == expected_point_indices).all()\n\n\ndef test_points_in_boxes_cpu():\n    boxes = torch.tensor(\n        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],\n          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],\n        dtype=torch.float32\n    )  # boxes (m, 7) with bottom center in lidar coordinate\n    pts = torch.tensor(\n        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],\n          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],\n          [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [\n              -16, -18, 9\n          ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],\n        dtype=torch.float32)  # points (n, 3) in lidar coordinate\n\n    point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)\n    expected_point_indices = torch.tensor(\n        [[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],\n          [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],\n        dtype=torch.int32)\n    assert point_indices.shape == torch.Size([1, 15, 2])\n    assert (point_indices == expected_point_indices).all()\n\n    boxes = torch.tensor([[[0.0, 0.0, 0.0, 1.0, 20.0, 1.0, 0.523598]]],\n                         dtype=torch.float32)  # 30 degrees\n    pts = torch.tensor(\n        [[[4, 6.928, 0], [6.928, 
4, 0], [4, -6.928, 0], [6.928, -4, 0],\n          [-4, 6.928, 0], [-6.928, 4, 0], [-4, -6.928, 0], [-6.928, -4, 0]]],\n        dtype=torch.float32)\n    point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)\n    expected_point_indices = torch.tensor(\n        [[[0], [0], [1], [0], [1], [0], [0], [0]]], dtype=torch.int32)\n    assert (point_indices == expected_point_indices).all()\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(), reason='requires CUDA support')\ndef test_points_in_boxes_all():\n\n    boxes = torch.tensor(\n        [[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],\n          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],\n        dtype=torch.float32).cuda(\n        )  # boxes (m, 7) with bottom center in lidar coordinate\n    pts = torch.tensor(\n        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],\n          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],\n          [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [\n              -16, -18, 9\n          ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],\n        dtype=torch.float32).cuda()  # points (n, 3) in lidar coordinate\n\n    point_indices = points_in_boxes_all(points=pts, boxes=boxes)\n    expected_point_indices = torch.tensor(\n        [[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],\n          [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],\n        dtype=torch.int32).cuda()\n    assert point_indices.shape == torch.Size([1, 15, 2])\n    assert (point_indices == expected_point_indices).all()\n\n    if torch.cuda.device_count() > 1:\n        pts = pts.to('cuda:1')\n        boxes = boxes.to('cuda:1')\n        expected_point_indices = expected_point_indices.to('cuda:1')\n        point_indices = points_in_boxes_all(points=pts, boxes=boxes)\n        assert point_indices.shape == torch.Size([1, 15, 2])\n        assert (point_indices == expected_point_indices).all()\n"
  },
  {
    "path": "tests/test_ops/test_roipoint_pool3d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import RoIPointPool3d\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\n@pytest.mark.parametrize('dtype', [\n    torch.float, torch.half,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MLU_AVAILABLE or IS_NPU_AVAILABLE or IS_MUSA_AVAILABLE,\n            reason='MLU, NPU, MUSA does not support for double'))\n])\ndef test_roipoint(device, dtype):\n    points = torch.tensor(\n        [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],\n         [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],\n         [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],\n         [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],\n        dtype=dtype).unsqueeze(0).to(device)\n    feats = points.clone()\n    rois = torch.tensor([[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],\n                          [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]]],\n                        dtype=dtype).to(device)\n\n    roipoint_pool3d = RoIPointPool3d(num_sampled_points=4)\n    roi_feat, empty_flag = roipoint_pool3d(points, feats, rois)\n    expected_roi_feat = torch.tensor(\n        [[[[1, 2, 3.3, 1, 2, 3.3], [1.2, 2.5, 3, 1.2, 2.5, 3],\n           [0.8, 
2.1, 3.5, 0.8, 2.1, 3.5], [1.6, 2.6, 3.6, 1.6, 2.6, 3.6]],\n          [[-9.2, 21, 18.2, -9.2, 21, 18.2], [-9.2, 21, 18.2, -9.2, 21, 18.2],\n           [-9.2, 21, 18.2, -9.2, 21, 18.2], [-9.2, 21, 18.2, -9.2, 21, 18.2]]]\n         ],\n        dtype=dtype).to(device)\n    expected_empty_flag = torch.tensor([[0, 0]]).int().to(device)\n\n    assert torch.allclose(roi_feat, expected_roi_feat)\n    assert torch.allclose(empty_flag, expected_empty_flag)\n"
  },
  {
    "path": "tests/test_ops/test_rotated_feature_align.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import rotated_feature_align\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\n@pytest.mark.skipif(\n    not torch.cuda.is_available(), reason='requires CUDA support')\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n    pytest.param(\n        'cpu',\n        marks=pytest.mark.skipif(\n            torch.__version__ == 'parrots', reason='requires PyTorch support'))\n])\ndef test_rotated_feature_align(device):\n    feature = torch.tensor([[[[1.2924, -0.2172, -0.5222, 0.1172],\n                              [0.9144, 1.2248, 1.3115, -0.9690],\n                              [-0.8949, -1.1797, -0.9093, -0.3961],\n                              [-0.4586, 0.5062, -0.7947, -0.7397]],\n                             [[-1.0943, -0.7495, 1.3461, -1.1652],\n                              [0.2034, 0.6763, -1.2357, 0.5231],\n                              [-1.0062, 1.2592, 1.4225, -0.3951],\n                              [-0.1242, -1.6240, 0.1932, 2.7181]],\n                             [[-1.6271, -1.0276, 0.0578, -0.2997],\n                              [-0.9684, -1.6946, -1.3188, -1.1938],\n                              [-1.6744, -0.8917, -0.6556,\n                               1.0073], [-0.1205, 0.3671, -0.3731, -0.5347]]],\n                            
[[[0.7035, 0.2089, -0.1774, 3.4670],\n                              [-0.8505, -0.9278, 1.4714, 0.1644],\n                              [0.0898, 0.3531, -0.4007, 0.1927],\n                              [1.2569, -0.2636, -0.5223, 0.0616]],\n                             [[0.1760, -0.7639, -0.4600, -1.3260],\n                              [-0.9921, -0.2970, -0.8955, 1.0508],\n                              [1.3515, -0.1641, 1.9679, 1.1986],\n                              [-0.3616, 0.6287, 0.4933, 0.3360]],\n                             [[-0.5860, 0.2124, -0.8700, 2.4200],\n                              [-0.0551, -1.5103, -1.6779, 0.8399],\n                              [0.8431, 1.2414, -1.1243, -0.3887],\n                              [-2.1254, 0.6047, -0.3515, 0.7254]]]],\n                           device=device,\n                           requires_grad=True)\n\n    bbox = torch.tensor(\n        [[[[1.3080e+01, 1.2688e+01, 1.1214e+01, 9.3944e+01, -9.1905e-01],\n           [3.8104e+01, 1.0134e+01, 1.4659e+02, 9.0306e+01, -9.8211e-01],\n           [-5.3213e+01, 4.9508e+01, 5.1513e+01, 3.2055e+01, -3.1954e-01],\n           [2.6974e+01, 2.5248e+01, 5.4495e+01, 3.1083e+00, -6.2127e-01]],\n          [[-1.5604e+01, -5.1908e+01, 2.3998e+02, 1.5008e+01, -1.2546e+00],\n           [3.1354e+01, -7.3635e+00, 6.7879e+01, 3.5081e+01, -3.3851e-01],\n           [-5.3292e+00, 9.1946e+00, 1.2834e+01, 1.0485e+01, -1.3039e+00],\n           [-2.3925e+01, 3.6623e+01, 3.9875e+01, 7.2009e+01, -6.5934e-01]],\n          [[7.2114e+01, -2.3781e+01, 2.9106e+01, 8.4501e+01, -1.1340e+00],\n           [2.6258e+01, -7.7034e+00, 1.7629e+02, 1.0615e+02, -1.2156e+00],\n           [3.8057e+01, 4.6016e+01, 1.2965e+01, 6.9384e+00, -1.0855e+00],\n           [2.4428e+01, -1.6189e+01, 2.0572e+02, 3.1622e+01, -1.5719e-01]],\n          [[3.8226e+00, 2.9608e+01, 1.4457e+01, 6.8179e+01, -9.1997e-01],\n           [2.5003e+01, -4.2490e+01, 9.6007e+01, 4.9086e+01, -1.4786e+00],\n           [8.5983e+01, 5.4980e+01, 
7.8080e+01, 1.0003e+02, -1.0926e+00],\n           [9.9065e+00, 4.1457e+01, 5.9799e+00, 1.7973e+01, -5.6313e-01]]],\n         [[[-1.8244e+01, 4.6309e+00, 5.3010e+01, 2.4310e+01, -7.0345e-01],\n           [1.9419e+01, 3.6704e+01, 5.2390e+01, 5.4133e+01, -3.7730e-01],\n           [5.6387e+01, 2.3752e+01, 9.0441e+00, 1.7792e+01, -1.5583e+00],\n           [3.6303e+01, 1.6396e+01, 2.0283e+01, 1.9148e+01, -8.3419e-01]],\n          [[3.2169e+01, 3.0521e+01, 2.6283e+01, 1.9680e+02, -3.0454e-01],\n           [2.5788e+01, -3.2189e+01, 8.8882e+01, 1.0207e+02, -1.5328e+00],\n           [8.4676e+00, -1.6668e+01, 2.4657e+01, 1.1275e+02, -4.0388e-01],\n           [-1.0799e+01, 6.0422e+00, 9.5807e+00, 3.3677e+01, -3.5438e-01]],\n          [[6.9363e+01, 1.0850e+01, 2.5968e+01, 2.2311e+01, -1.6408e-01],\n           [2.8140e+00, 4.6843e+00, 3.1289e+00, 2.1480e+01, -6.7583e-01],\n           [2.6661e+01, 4.5290e+01, 6.1679e+00, 3.0005e+01, -8.9806e-01],\n           [5.0871e+00, 1.3234e+01, 9.2087e+01, 4.9622e+01, -2.8020e-01]],\n          [[-1.2643e+01, 2.5176e+01, 5.0488e+01, 5.4246e+01, -4.4840e-01],\n           [-3.4521e+01, 9.8435e-01, 5.2413e+01, 9.7996e+00, -8.4218e-01],\n           [4.9829e+01, -1.0808e+01, 2.9848e+01, 7.3579e+01, -6.2672e-01],\n           [8.0446e+01, 2.8064e+01, 4.5273e+01, 5.3809e+01, -1.2359e+00]]]],\n        device=device,\n        requires_grad=True)\n\n    expected_output = torch.tensor([[[[1.1095, -0.2172, -0.5222, -0.6225],\n                                      [0.9144, 0.7662, 1.0487, -0.9690],\n                                      [-0.8949, -1.6384, -0.9093, -0.3961],\n                                      [-0.8604, 0.5062, -0.7947, -0.7397]],\n                                     [[-0.3961, -0.7495, 1.3461, 1.5528],\n                                      [0.2034, 0.5522, -1.6722, 0.5231],\n                                      [-1.0062, 1.1350, 1.4225, -0.3951],\n                                      [-0.4826, -1.6240, 0.1932, 2.7181]],\n            
                         [[-2.6436, -1.0276, 0.0578, -0.8344],\n                                      [-0.9684, -1.8151, -2.1843, -1.1938],\n                                      [-1.6744, -1.0121, -0.6556, 1.0073],\n                                      [-0.8474, 0.3671, -0.3731, -0.5347]]],\n                                    [[[0.7035, 0.2089, -0.1774, 3.4670],\n                                      [-0.8505, -0.9278, 1.4714, 0.1644],\n                                      [0.0898, 0.3064, -0.4007, 0.5849],\n                                      [1.2569, -0.2636, -0.5223, 0.0616]],\n                                     [[0.1760, -0.7639, -0.4600, -1.3260],\n                                      [-0.9921, -0.2970, -0.8955, 1.0508],\n                                      [1.3515, -0.6125, 1.9679, 0.5550],\n                                      [-0.3616, 0.6287, 0.4933, 0.3360]],\n                                     [[-0.5860, 0.2124, -0.8700, 2.4200],\n                                      [-0.0551, -1.5103, -1.6779, 0.8399],\n                                      [0.8431, 0.8455, -1.1243, -1.5994],\n                                      [-2.1254, 0.6047, -0.3515, 0.7254]]]],\n                                   device=device)\n\n    expected_grad = torch.tensor([\n        [[[1.0000, 1.8507, 1.1493, 1.5222], [1.0000, 1.1511, 1.2139, 1.4778],\n          [1.0000, 1.2629, 1.3721, 1.0000], [3.0000, 1.0000, 1.0000, 2.0000]],\n         [[1.0000, 1.8507, 1.1493, 1.5222], [1.0000, 1.1511, 1.2139, 1.4778],\n          [1.0000, 1.2629, 1.3721, 1.0000], [3.0000, 1.0000, 1.0000, 2.0000]],\n         [[1.0000, 1.8507, 1.1493, 1.5222], [1.0000, 1.1511, 1.2139, 1.4778],\n          [1.0000, 1.2629, 1.3721, 1.0000], [3.0000, 1.0000, 1.0000, 2.0000]]],\n        [[[1.2687, 1.5055, 1.2382, 1.0000], [1.1458, 1.4258, 1.4160, 1.0000],\n          [1.0000, 1.0000, 1.0000, 1.0000], [1.0000, 1.0000, 1.0000, 1.0000]],\n         [[1.2687, 1.5055, 1.2382, 1.0000], [1.1458, 1.4258, 1.4160, 
1.0000],\n          [1.0000, 1.0000, 1.0000, 1.0000], [1.0000, 1.0000, 1.0000, 1.0000]],\n         [[1.2687, 1.5055, 1.2382, 1.0000], [1.1458, 1.4258, 1.4160, 1.0000],\n          [1.0000, 1.0000, 1.0000, 1.0000], [1.0000, 1.0000, 1.0000, 1.0000]]]\n    ],\n                                 device=device)\n\n    output = rotated_feature_align(\n        feature, bbox, spatial_scale=1 / 8, points=1)\n    output.backward(torch.ones_like(output))\n    assert torch.allclose(output, expected_output, 1e-2)\n    assert torch.allclose(feature.grad, expected_grad, 1e-2)\n"
  },
  {
    "path": "tests/test_ops/test_saconv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport torch\nimport torch.nn as nn\n\nfrom mmcv.ops import SAConv2d\n\n\ndef test_sacconv():\n\n    # test with normal cast\n    x = torch.rand(1, 3, 256, 256)\n    saconv = SAConv2d(3, 5, kernel_size=3, padding=1)\n    sac_out = saconv(x)\n    refer_conv = nn.Conv2d(3, 5, kernel_size=3, padding=1)\n    refer_out = refer_conv(x)\n    assert sac_out.shape == refer_out.shape\n\n    # test with dilation >= 2\n    dalited_saconv = SAConv2d(3, 5, kernel_size=3, padding=2, dilation=2)\n    dalited_sac_out = dalited_saconv(x)\n    refer_conv = nn.Conv2d(3, 5, kernel_size=3, padding=2, dilation=2)\n    refer_out = refer_conv(x)\n    assert dalited_sac_out.shape == refer_out.shape\n\n    # test with deform\n    deform_saconv = SAConv2d(3, 5, kernel_size=3, padding=1, use_deform=True)\n    if torch.cuda.is_available():\n        x = torch.rand(1, 3, 256, 256).cuda()\n        deform_saconv = SAConv2d(\n            3, 5, kernel_size=3, padding=1, use_deform=True).cuda()\n        deform_sac_out = deform_saconv(x).cuda()\n        refer_conv = nn.Conv2d(3, 5, kernel_size=3, padding=1).cuda()\n        refer_out = refer_conv(x)\n        assert deform_sac_out.shape == refer_out.shape\n    else:\n        deform_sac_out = deform_saconv(x)\n        refer_conv = nn.Conv2d(3, 5, kernel_size=3, padding=1)\n        refer_out = refer_conv(x)\n        assert deform_sac_out.shape == refer_out.shape\n\n    # test with groups >= 2\n    x = torch.rand(1, 4, 256, 256)\n    group_saconv = SAConv2d(4, 4, kernel_size=3, padding=1, groups=2)\n    group_sac_out = group_saconv(x)\n    refer_conv = nn.Conv2d(4, 4, kernel_size=3, padding=1, groups=2)\n    refer_out = refer_conv(x)\n    assert group_sac_out.shape == refer_out.shape\n"
  },
  {
    "path": "tests/test_ops/test_scatter_points.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nfrom torch.autograd import gradcheck\n\nfrom mmcv.ops import DynamicScatter\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\nif torch.__version__ == 'parrots':\n    pytest.skip('not supported in parrots now', allow_module_level=True)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_dynamic_scatter(device):\n    dsmean = DynamicScatter([0.32, 0.32, 6],\n                            [-74.88, -74.88, -2, 74.88, 74.88, 4], True)\n    dsmax = DynamicScatter([0.32, 0.32, 6],\n                           [-74.88, -74.88, -2, 74.88, 74.88, 4], False)\n\n    # test empty input\n    empty_feats = torch.empty(size=(0, 3), dtype=torch.float32, device=device)\n    empty_coors = torch.empty(size=(0, 3), dtype=torch.int32, device=device)\n\n    empty_feats.requires_grad_()\n    empty_feats_out_mean, empty_coors_out_mean = dsmean(\n        empty_feats, empty_coors)\n    empty_feats_out_mean.sum().backward()\n    empty_feats_out_max, empty_coors_out_max = dsmax(empty_feats, empty_coors)\n    empty_feats_out_max.sum().backward()\n\n    assert empty_feats_out_mean.shape == empty_feats.shape\n    assert empty_feats_out_max.shape == empty_feats.shape\n    assert empty_coors_out_mean.shape == empty_coors.shape\n    assert empty_coors_out_max.shape == empty_coors.shape\n\n    # test empty reduced output\n    empty_o_feats = torch.rand(\n        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50\n    empty_o_coors = torch.randint(\n  
      low=-1, high=0, size=(200000, 3), dtype=torch.int32, device=device)\n\n    empty_o_feats.requires_grad_()\n    empty_o_feats_out_mean, empty_o_coors_out_mean = dsmean(\n        empty_o_feats, empty_o_coors)\n    empty_o_feats_out_mean.sum().backward()\n    assert (empty_o_feats.grad == 0).all()\n\n    empty_o_feats_out_max, empty_o_coors_out_max = dsmax(\n        empty_o_feats, empty_o_coors)\n    empty_o_feats_out_max.sum().backward()\n    assert (empty_o_feats.grad == 0).all()\n\n    # test non-empty input\n    feats = torch.rand(\n        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50\n    coors = torch.randint(\n        low=-1, high=20, size=(200000, 3), dtype=torch.int32, device=device)\n\n    ref_voxel_coors = coors.unique(dim=0, sorted=True)\n    ref_voxel_coors = ref_voxel_coors[ref_voxel_coors.min(dim=-1).values >= 0]\n    ref_voxel_feats_mean = []\n    ref_voxel_feats_max = []\n    for ref_voxel_coor in ref_voxel_coors:\n        voxel_mask = (coors == ref_voxel_coor).all(dim=-1)\n        ref_voxel_feats_mean.append(feats[voxel_mask].mean(dim=0))\n        ref_voxel_feats_max.append(feats[voxel_mask].max(dim=0).values)\n    ref_voxel_feats_mean = torch.stack(ref_voxel_feats_mean)\n    ref_voxel_feats_max = torch.stack(ref_voxel_feats_max)\n\n    feats_out_mean, coors_out_mean = dsmean(feats, coors)\n    seq_mean = (coors_out_mean[:, 0] * 400 + coors_out_mean[:, 1] * 20 +\n                coors_out_mean[:, 2]).argsort()\n    feats_out_mean = feats_out_mean[seq_mean]\n    coors_out_mean = coors_out_mean[seq_mean]\n\n    feats_out_max, coors_out_max = dsmax(feats, coors)\n    seq_max = (coors_out_max[:, 0] * 400 + coors_out_max[:, 1] * 20 +\n               coors_out_max[:, 2]).argsort()\n    feats_out_max = feats_out_max[seq_max]\n    coors_cout_max = coors_out_max[seq_max]\n\n    assert (coors_out_mean == ref_voxel_coors).all()\n    assert torch.allclose(\n        feats_out_mean, ref_voxel_feats_mean, atol=1e-2, rtol=1e-5)\n    assert 
(coors_cout_max == ref_voxel_coors).all()\n    assert torch.allclose(\n        feats_out_max, ref_voxel_feats_max, atol=1e-2, rtol=1e-5)\n\n    # test non-empty input without any point out of bound\n    feats = torch.rand(\n        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50\n    coors = torch.randint(\n        low=0, high=20, size=(200000, 3), dtype=torch.int32, device=device)\n\n    ref_voxel_coors = coors.unique(dim=0, sorted=True)\n    ref_voxel_coors = ref_voxel_coors[ref_voxel_coors.min(dim=-1).values >= 0]\n    ref_voxel_feats_mean = []\n    ref_voxel_feats_max = []\n    for ref_voxel_coor in ref_voxel_coors:\n        voxel_mask = (coors == ref_voxel_coor).all(dim=-1)\n        ref_voxel_feats_mean.append(feats[voxel_mask].mean(dim=0))\n        ref_voxel_feats_max.append(feats[voxel_mask].max(dim=0).values)\n    ref_voxel_feats_mean = torch.stack(ref_voxel_feats_mean)\n    ref_voxel_feats_max = torch.stack(ref_voxel_feats_max)\n\n    feats_out_mean, coors_out_mean = dsmean(feats, coors)\n    seq_mean = (coors_out_mean[:, 0] * 400 + coors_out_mean[:, 1] * 20 +\n                coors_out_mean[:, 2]).argsort()\n    feats_out_mean = feats_out_mean[seq_mean]\n    coors_out_mean = coors_out_mean[seq_mean]\n\n    feats_out_max, coors_out_max = dsmax(feats, coors)\n    seq_max = (coors_out_max[:, 0] * 400 + coors_out_max[:, 1] * 20 +\n               coors_out_max[:, 2]).argsort()\n    feats_out_max = feats_out_max[seq_max]\n    coors_cout_max = coors_out_max[seq_max]\n\n    assert (coors_out_mean == ref_voxel_coors).all()\n    assert torch.allclose(\n        feats_out_mean, ref_voxel_feats_mean, atol=1e-2, rtol=1e-5)\n    assert (coors_cout_max == ref_voxel_coors).all()\n    assert torch.allclose(\n        feats_out_max, ref_voxel_feats_max, atol=1e-2, rtol=1e-5)\n\n    # test grad #\n    feats = torch.rand(\n        size=(100, 4), dtype=torch.float32, device=device) * 100 - 50\n    coors = torch.randint(\n        low=-1, high=3, size=(100, 3), 
dtype=torch.int32, device=device)\n    feats.requires_grad_()\n    # TODO(Cambricon): mlu only support max reduce in current version.\n    if not IS_MLU_AVAILABLE:\n        gradcheck(dsmean, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)\n    gradcheck(dsmax, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)\n"
  },
  {
    "path": "tests/test_ops/test_spconv.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nfrom torch import nn\n\nfrom mmcv.cnn import build_conv_layer, build_norm_layer\nfrom mmcv.ops import (SparseConvTensor, SparseInverseConv3d, SparseSequential,\n                      SubMConv3d)\n\nif torch.__version__ == 'parrots':\n    pytest.skip('not supported in parrots now', allow_module_level=True)\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\n\ndef make_sparse_convmodule(in_channels,\n                           out_channels,\n                           kernel_size,\n                           indice_key,\n                           stride=1,\n                           padding=0,\n                           conv_type='SubMConv3d',\n                           norm_cfg=None,\n                           order=('conv', 'norm', 'act')):\n    \"\"\"Make sparse convolution module.\n\n    Args:\n        in_channels (int): the number of input channels\n        out_channels (int): the number of out channels\n        kernel_size (int|tuple(int)): kernel size of convolution\n        indice_key (str): the indice key used for sparse tensor\n        stride (int|tuple(int)): the stride of convolution\n        padding (int or list[int]): the padding number of input\n        conv_type (str): sparse conv type in spconv\n        norm_cfg (dict[str]): config of normalization layer\n        order (tuple[str]): The order of conv/norm/activation layers. It is a\n            sequence of \"conv\", \"norm\" and \"act\". 
Common examples are\n            (\"conv\", \"norm\", \"act\") and (\"act\", \"conv\", \"norm\").\n\n    Returns:\n        spconv.SparseSequential: sparse convolution module.\n    \"\"\"\n    assert isinstance(order, tuple) and len(order) <= 3\n    assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'}\n\n    conv_cfg = dict(type=conv_type, indice_key=indice_key)\n\n    layers = list()\n    for layer in order:\n        if layer == 'conv':\n            if conv_type not in [\n                    'SparseInverseConv3d', 'SparseInverseConv2d',\n                    'SparseInverseConv1d'\n            ]:\n                layers.append(\n                    build_conv_layer(\n                        conv_cfg,\n                        in_channels,\n                        out_channels,\n                        kernel_size,\n                        stride=stride,\n                        padding=padding,\n                        bias=False))\n            else:\n                layers.append(\n                    build_conv_layer(\n                        conv_cfg,\n                        in_channels,\n                        out_channels,\n                        kernel_size,\n                        bias=False))\n        elif layer == 'norm':\n            layers.append(build_norm_layer(norm_cfg, out_channels)[1])\n        elif layer == 'act':\n            layers.append(nn.ReLU(inplace=True))\n\n    layers = SparseSequential(*layers)\n    return layers\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_make_sparse_convmodule(device):\n    if 
IS_CUDA_AVAILABLE:\n        torch.cuda.empty_cache()\n    elif IS_MUSA_AVAILABLE:\n        torch.musa.empty_cache()\n    voxel_features = torch.tensor([[6.56126, 0.9648336, -1.7339306, 0.315],\n                                   [6.8162713, -2.480431, -1.3616394, 0.36],\n                                   [11.643568, -4.744306, -1.3580885, 0.16],\n                                   [23.482342, 6.5036807, 0.5806964, 0.35]],\n                                  dtype=torch.float32,\n                                  device=device)  # n, point_features\n    coordinates = torch.tensor(\n        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],\n         [1, 35, 930, 469]],\n        dtype=torch.int32,\n        device=device)  # n, 4(batch, ind_x, ind_y, ind_z)\n\n    # test\n    input_sp_tensor = SparseConvTensor(voxel_features, coordinates,\n                                       [41, 1600, 1408], 2)\n\n    sparse_block0 = make_sparse_convmodule(\n        4,\n        16,\n        3,\n        'test0',\n        stride=1,\n        padding=0,\n        conv_type='SubMConv3d',\n        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),\n        order=('conv', 'norm', 'act')).to(device)\n    assert isinstance(sparse_block0[0], SubMConv3d)\n    assert sparse_block0[0].in_channels == 4\n    assert sparse_block0[0].out_channels == 16\n    assert isinstance(sparse_block0[1], torch.nn.BatchNorm1d)\n    assert sparse_block0[1].eps == 0.001\n    assert sparse_block0[1].momentum == 0.01\n    assert isinstance(sparse_block0[2], torch.nn.ReLU)\n\n    # test forward\n    out_features = sparse_block0(input_sp_tensor)\n    assert out_features.features.shape == torch.Size([4, 16])\n\n    # device == mlu: not support inverse==1 yet\n    if device != 'mlu':\n        sparse_block1 = make_sparse_convmodule(\n            4,\n            16,\n            3,\n            'test1',\n            stride=1,\n            padding=0,\n            conv_type='SparseInverseConv3d',\n            
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),\n            order=('norm', 'act', 'conv')).to(device)\n        assert isinstance(sparse_block1[2], SparseInverseConv3d)\n        assert isinstance(sparse_block1[0], torch.nn.BatchNorm1d)\n        assert isinstance(sparse_block1[1], torch.nn.ReLU)\n"
  },
  {
    "path": "tests/test_ops/test_syncbn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nimport platform\n\nimport numpy as np\nimport pytest\nimport torch\nimport torch.distributed as dist\nimport torch.nn as nn\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE\n\nif platform.system() == 'Windows':\n    import regex as re\nelse:\n    import re\n\n\nclass TestSyncBN:\n\n    def dist_init(self):\n        rank = int(os.environ['SLURM_PROCID'])\n        world_size = int(os.environ['SLURM_NTASKS'])\n        local_rank = int(os.environ['SLURM_LOCALID'])\n        node_list = str(os.environ['SLURM_NODELIST'])\n\n        node_parts = re.findall('[0-9]+', node_list)\n        os.environ['MASTER_ADDR'] = (f'{node_parts[1]}.{node_parts[2]}' +\n                                     f'.{node_parts[3]}.{node_parts[4]}')\n        os.environ['MASTER_PORT'] = '12341'\n        os.environ['WORLD_SIZE'] = str(world_size)\n        os.environ['RANK'] = str(rank)\n\n        if IS_CUDA_AVAILABLE:\n            dist.init_process_group('nccl')\n            torch.cuda.set_device(local_rank)\n        elif IS_MUSA_AVAILABLE:\n            dist.init_process_group('mccl')\n            torch.musa.set_device(local_rank)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'musa',\n            marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def _test_syncbn_train(self, size=1, half=False, device='cuda'):\n\n        if 'SLURM_NTASKS' not in os.environ or int(\n                os.environ['SLURM_NTASKS']) != 4:\n            print('must run with slurm has 4 processes!\\n'\n                  'srun -p test --gres=gpu:4 -n4')\n            return\n        else:\n            print('Running syncbn test')\n        from mmcv.ops import SyncBatchNorm\n\n        assert 
size in (1, 2, 4)\n        if not dist.is_initialized():\n            self.dist_init()\n        rank = dist.get_rank()\n\n        torch.manual_seed(9)\n        if IS_CUDA_AVAILABLE:\n            torch.cuda.manual_seed(9)\n        elif IS_MUSA_AVAILABLE:\n            torch.musa.manual_seed(9)\n\n        self.x = torch.rand(16, 3, 2, 3).to(device)\n        self.y_bp = torch.rand(16, 3, 2, 3).to(device)\n\n        if half:\n            self.x = self.x.half()\n            self.y_bp = self.y_bp.half()\n        dist.broadcast(self.x, src=0)\n        dist.broadcast(self.y_bp, src=0)\n\n        if IS_CUDA_AVAILABLE:\n            torch.cuda.synchronize()\n        elif IS_MUSA_AVAILABLE:\n            torch.musa.synchronize()\n        if size == 1:\n            groups = [None, None, None, None]\n            groups[0] = dist.new_group([0])\n            groups[1] = dist.new_group([1])\n            groups[2] = dist.new_group([2])\n            groups[3] = dist.new_group([3])\n            group = groups[rank]\n        elif size == 2:\n            groups = [None, None, None, None]\n            groups[0] = groups[1] = dist.new_group([0, 1])\n            groups[2] = groups[3] = dist.new_group([2, 3])\n            group = groups[rank]\n        elif size == 4:\n            group = dist.group.WORLD\n        syncbn = SyncBatchNorm(3, group=group).to(device)\n        syncbn.weight.data[0] = 0.2\n        syncbn.weight.data[1] = 0.5\n        syncbn.weight.data[2] = 0.7\n        syncbn.train()\n\n        bn = nn.BatchNorm2d(3).to(device)\n        bn.weight.data[0] = 0.2\n        bn.weight.data[1] = 0.5\n        bn.weight.data[2] = 0.7\n        bn.train()\n\n        sx = self.x[rank * 4:rank * 4 + 4]\n        sx.requires_grad_()\n        sy = syncbn(sx)\n        sy.backward(self.y_bp[rank * 4:rank * 4 + 4])\n\n        smean = syncbn.running_mean\n        svar = syncbn.running_var\n        sx_grad = sx.grad\n        sw_grad = syncbn.weight.grad\n        sb_grad = syncbn.bias.grad\n\n        if 
size == 1:\n            x = self.x[rank * 4:rank * 4 + 4]\n            y_bp = self.y_bp[rank * 4:rank * 4 + 4]\n        elif size == 2:\n            x = self.x[rank // 2 * 8:rank // 2 * 8 + 8]\n            y_bp = self.y_bp[rank // 2 * 8:rank // 2 * 8 + 8]\n        elif size == 4:\n            x = self.x\n            y_bp = self.y_bp\n        x.requires_grad_()\n        y = bn(x)\n        y.backward(y_bp)\n\n        if size == 2:\n            y = y[rank % 2 * 4:rank % 2 * 4 + 4]\n        elif size == 4:\n            y = y[rank * 4:rank * 4 + 4]\n\n        mean = bn.running_mean\n        var = bn.running_var\n        if size == 1:\n            x_grad = x.grad\n            w_grad = bn.weight.grad\n            b_grad = bn.bias.grad\n        elif size == 2:\n            x_grad = x.grad[rank % 2 * 4:rank % 2 * 4 + 4]\n            w_grad = bn.weight.grad / 2\n            b_grad = bn.bias.grad / 2\n        elif size == 4:\n            x_grad = x.grad[rank * 4:rank * 4 + 4]\n            w_grad = bn.weight.grad / 4\n            b_grad = bn.bias.grad / 4\n\n        assert np.allclose(mean.data.cpu().numpy(),\n                           smean.data.cpu().numpy(), 1e-3)\n        assert np.allclose(var.data.cpu().numpy(),\n                           svar.data.cpu().numpy(), 1e-3)\n        assert np.allclose(y.data.cpu().numpy(), sy.data.cpu().numpy(), 1e-3)\n        assert np.allclose(w_grad.data.cpu().numpy(),\n                           sw_grad.data.cpu().numpy(), 1e-3)\n        assert np.allclose(b_grad.data.cpu().numpy(),\n                           sb_grad.data.cpu().numpy(), 1e-3)\n        assert np.allclose(x_grad.data.cpu().numpy(),\n                           sx_grad.data.cpu().numpy(), 1e-2)\n\n    @pytest.mark.parametrize('device', [\n        pytest.param(\n            'cuda',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'musa',\n            
marks=pytest.mark.skipif(\n                not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n    ])\n    def _test_syncbn_empty_train(self, size=1, half=False, device='cuda'):\n\n        if 'SLURM_NTASKS' not in os.environ or int(\n                os.environ['SLURM_NTASKS']) != 4:\n            print('must run with slurm has 4 processes!\\n'\n                  'srun -p test --gres=gpu:4 -n4')\n            return\n        else:\n            print('Running syncbn test')\n        from mmcv.ops import SyncBatchNorm\n\n        assert size in (1, 2, 4)\n        if not dist.is_initialized():\n            self.dist_init()\n        rank = dist.get_rank()\n\n        torch.manual_seed(9)\n        if IS_CUDA_AVAILABLE:\n            torch.cuda.manual_seed(9)\n        elif IS_MUSA_AVAILABLE:\n            torch.musa.manual_seed(9)\n\n        self.x = torch.rand(0, 3, 2, 3).to(device)\n        self.y_bp = torch.rand(0, 3, 2, 3).to(device)\n\n        if half:\n            self.x = self.x.half()\n            self.y_bp = self.y_bp.half()\n        dist.broadcast(self.x, src=0)\n        dist.broadcast(self.y_bp, src=0)\n\n        if IS_CUDA_AVAILABLE:\n            torch.cuda.synchronize()\n        elif IS_MUSA_AVAILABLE:\n            torch.musa.synchronize()\n        if size == 1:\n            groups = [None, None, None, None]\n            groups[0] = dist.new_group([0])\n            groups[1] = dist.new_group([1])\n            groups[2] = dist.new_group([2])\n            groups[3] = dist.new_group([3])\n            group = groups[rank]\n        elif size == 2:\n            groups = [None, None, None, None]\n            groups[0] = groups[1] = dist.new_group([0, 1])\n            groups[2] = groups[3] = dist.new_group([2, 3])\n            group = groups[rank]\n        elif size == 4:\n            group = dist.group.WORLD\n\n        syncbn = SyncBatchNorm(3, group=group, stats_mode='N').to(device)\n        syncbn.weight.data[0] = 0.2\n        syncbn.weight.data[1] = 0.5\n        
syncbn.weight.data[2] = 0.7\n        syncbn.train()\n\n        bn = nn.BatchNorm2d(3).to(device)\n        bn.weight.data[0] = 0.2\n        bn.weight.data[1] = 0.5\n        bn.weight.data[2] = 0.7\n        bn.train()\n\n        sx = self.x[rank * 4:rank * 4 + 4]\n        sx.requires_grad_()\n        sy = syncbn(sx)\n        sy.backward(self.y_bp[rank * 4:rank * 4 + 4])\n        smean = syncbn.running_mean\n        svar = syncbn.running_var\n        sx_grad = sx.grad\n        sw_grad = syncbn.weight.grad\n        sb_grad = syncbn.bias.grad\n\n        if size == 1:\n            x = self.x[rank * 4:rank * 4 + 4]\n            y_bp = self.y_bp[rank * 4:rank * 4 + 4]\n        elif size == 2:\n            x = self.x[rank // 2 * 8:rank // 2 * 8 + 8]\n            y_bp = self.y_bp[rank // 2 * 8:rank // 2 * 8 + 8]\n        elif size == 4:\n            x = self.x\n            y_bp = self.y_bp\n        x.requires_grad_()\n        y = bn(x)\n        y.backward(y_bp)\n\n        if size == 2:\n            y = y[rank % 2 * 4:rank % 2 * 4 + 4]\n        elif size == 4:\n            y = y[rank * 4:rank * 4 + 4]\n\n        mean = bn.running_mean\n        var = bn.running_var\n        if size == 1:\n            x_grad = x.grad\n            w_grad = bn.weight.grad\n            b_grad = bn.bias.grad\n        elif size == 2:\n            x_grad = x.grad[rank % 2 * 4:rank % 2 * 4 + 4]\n            w_grad = bn.weight.grad / 2\n            b_grad = bn.bias.grad / 2\n        elif size == 4:\n            x_grad = x.grad[rank * 4:rank * 4 + 4]\n            w_grad = bn.weight.grad / 4\n            b_grad = bn.bias.grad / 4\n\n        assert np.allclose(mean.data.cpu().numpy(),\n                           smean.data.cpu().numpy(), 1e-3)\n        assert np.allclose(var.data.cpu().numpy(),\n                           svar.data.cpu().numpy(), 1e-3)\n        assert np.allclose(y.data.cpu().numpy(), sy.data.cpu().numpy(), 1e-3)\n        assert np.allclose(w_grad.data.cpu().numpy(),\n                     
      sw_grad.data.cpu().numpy(), 1e-3)\n        assert np.allclose(b_grad.data.cpu().numpy(),\n                           sb_grad.data.cpu().numpy(), 1e-3)\n        assert np.allclose(x_grad.data.cpu().numpy(),\n                           sx_grad.data.cpu().numpy(), 1e-2)\n\n        # 'stats_mode' only allows 'default' and 'N'\n        with pytest.raises(AssertionError):\n            SyncBatchNorm(3, group=group, stats_mode='X')\n\n    def test_syncbn_1(self):\n        self._test_syncbn_train(size=1)\n\n    def test_syncbn_2(self):\n        self._test_syncbn_train(size=2)\n\n    def test_syncbn_4(self):\n        self._test_syncbn_train(size=4)\n\n    def test_syncbn_1_half(self):\n        self._test_syncbn_train(size=1, half=True)\n\n    def test_syncbn_2_half(self):\n        self._test_syncbn_train(size=2, half=True)\n\n    def test_syncbn_4_half(self):\n        self._test_syncbn_train(size=4, half=True)\n\n    def test_syncbn_empty_1(self):\n        self._test_syncbn_empty_train(size=1)\n\n    def test_syncbn_empty_2(self):\n        self._test_syncbn_empty_train(size=2)\n\n    def test_syncbn_empty_4(self):\n        self._test_syncbn_empty_train(size=4)\n\n    def test_syncbn_empty_1_half(self):\n        self._test_syncbn_empty_train(size=1, half=True)\n\n    def test_syncbn_empty_2_half(self):\n        self._test_syncbn_empty_train(size=2, half=True)\n\n    def test_syncbn_empty_4_half(self):\n        self._test_syncbn_empty_train(size=4, half=True)\n"
  },
  {
    "path": "tests/test_ops/test_three_interpolate.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import three_interpolate\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MUSA_AVAILABLE, IS_NPU_AVAILABLE\n\n\n@pytest.mark.parametrize('dtype', [\n    torch.half, torch.float,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_NPU_AVAILABLE or IS_MUSA_AVAILABLE,\n            reason='NPU, MUSA does not support for 64-bit floating point'))\n])\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\ndef test_three_interpolate(dtype, device):\n    if IS_MUSA_AVAILABLE:\n        torch.musa.empty_cache()\n    features = torch.tensor(\n        [[[2.4350, 4.7516, 4.4995, 2.4350, 2.4350, 2.4350],\n          [3.1236, 2.6278, 3.0447, 3.1236, 3.1236, 3.1236],\n          [2.6732, 2.8677, 2.6436, 2.6732, 2.6732, 2.6732],\n          [0.0124, 7.0150, 7.0199, 0.0124, 0.0124, 0.0124],\n          [0.3207, 0.0000, 0.3411, 0.3207, 0.3207, 0.3207]],\n         [[0.0000, 0.9544, 2.4532, 0.0000, 0.0000, 0.0000],\n          [0.5346, 1.9176, 1.4715, 0.5346, 0.5346, 0.5346],\n          [0.0000, 0.2744, 2.0842, 0.0000, 0.0000, 0.0000],\n          [0.3414, 1.5063, 1.6209, 0.3414, 0.3414, 0.3414],\n          [0.5814, 0.0103, 0.0000, 0.5814, 0.5814, 0.5814]]],\n        dtype=dtype,\n        device=device)\n\n    idx = torch.tensor(\n        [[[0, 1, 2], [2, 3, 4], [2, 3, 4], [0, 1, 2], [0, 1, 2], [0, 1, 3]],\n         [[0, 2, 3], [1, 3, 4], [2, 1, 4], [0, 2, 4], [0, 2, 4], [0, 1, 2]]],\n        device=device).int()\n\n    weight = 
torch.tensor([[[3.3333e-01, 3.3333e-01, 3.3333e-01],\n                            [1.0000e+00, 5.8155e-08, 2.2373e-08],\n                            [1.0000e+00, 1.7737e-08, 1.7356e-08],\n                            [3.3333e-01, 3.3333e-01, 3.3333e-01],\n                            [3.3333e-01, 3.3333e-01, 3.3333e-01],\n                            [3.3333e-01, 3.3333e-01, 3.3333e-01]],\n                           [[3.3333e-01, 3.3333e-01, 3.3333e-01],\n                            [1.0000e+00, 1.3651e-08, 7.7312e-09],\n                            [1.0000e+00, 1.7148e-08, 1.4070e-08],\n                            [3.3333e-01, 3.3333e-01, 3.3333e-01],\n                            [3.3333e-01, 3.3333e-01, 3.3333e-01],\n                            [3.3333e-01, 3.3333e-01, 3.3333e-01]]],\n                          dtype=dtype,\n                          device=device)\n\n    output = three_interpolate(features, idx, weight)\n    expected_output = torch.tensor([[[\n        3.8953e+00, 4.4995e+00, 4.4995e+00, 3.8953e+00, 3.8953e+00, 3.2072e+00\n    ], [\n        2.9320e+00, 3.0447e+00, 3.0447e+00, 2.9320e+00, 2.9320e+00, 2.9583e+00\n    ], [\n        2.7281e+00, 2.6436e+00, 2.6436e+00, 2.7281e+00, 2.7281e+00, 2.7380e+00\n    ], [\n        4.6824e+00, 7.0199e+00, 7.0199e+00, 4.6824e+00, 4.6824e+00, 2.3466e+00\n    ], [\n        2.2060e-01, 3.4110e-01, 3.4110e-01, 2.2060e-01, 2.2060e-01, 2.1380e-01\n    ]],\n                                    [[\n                                        8.1773e-01, 9.5440e-01, 2.4532e+00,\n                                        8.1773e-01, 8.1773e-01, 1.1359e+00\n                                    ],\n                                     [\n                                         8.4689e-01, 1.9176e+00, 1.4715e+00,\n                                         8.4689e-01, 8.4689e-01, 1.3079e+00\n                                     ],\n                                     [\n                                         6.9473e-01, 2.7440e-01, 
2.0842e+00,\n                                         6.9473e-01, 6.9473e-01, 7.8619e-01\n                                     ],\n                                     [\n                                         7.6789e-01, 1.5063e+00, 1.6209e+00,\n                                         7.6789e-01, 7.6789e-01, 1.1562e+00\n                                     ],\n                                     [\n                                         3.8760e-01, 1.0300e-02, 8.3569e-09,\n                                         3.8760e-01, 3.8760e-01, 1.9723e-01\n                                     ]]],\n                                   dtype=dtype,\n                                   device=device)\n\n    assert torch.allclose(output, expected_output, 1e-3, 1e-4)\n"
  },
  {
    "path": "tests/test_ops/test_three_nn.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\nfrom mmcv.ops import three_nn\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\nknown = [[[-1.8373, 3.5605, -0.7867], [0.7615, 2.9420, 0.2314],\n          [-0.6503, 3.6637, -1.0622], [-1.8373, 3.5605, -0.7867],\n          [-1.8373, 3.5605, -0.7867]],\n         [[-1.3399, 1.9991, -0.3698], [-0.0799, 0.9698, -0.8457],\n          [0.0858, 2.4721, -0.1928], [-1.3399, 1.9991, -0.3698],\n          [-1.3399, 1.9991, -0.3698]]]\n\nunknown = [[[-1.8373, 3.5605, -0.7867], [0.7615, 2.9420, 0.2314],\n            [-0.6503, 3.6637, -1.0622], [-1.5237, 2.3976, -0.8097],\n            [-0.0722, 3.4017, -0.2880], [0.5198, 3.0661, -0.4605],\n            [-2.0185, 3.5019, -0.3236], [0.5098, 3.1020, 0.5799],\n            [-1.6137, 3.8443, -0.5269], [0.7341, 2.9626, -0.3189]],\n           [[-1.3399, 1.9991, -0.3698], [-0.0799, 0.9698, -0.8457],\n            [0.0858, 2.4721, -0.1928], [-0.9022, 1.6560, -1.3090],\n            [0.1156, 1.6901, -0.4366], [-0.6477, 2.3576, -0.1563],\n            [-0.8482, 1.1466, -1.2704], [-0.8753, 2.0845, -0.3460],\n            [-0.5621, 1.4233, -1.2858], [-0.5883, 1.3114, -1.2899]]]\n\nexpected_dist = [[[0.0000, 0.0000, 0.0000], [0.0000, 2.0463, 2.8588],\n                  [0.0000, 1.2229, 1.2229], [1.2047, 1.2047, 1.2047],\n                  [1.0011, 1.0845, 1.8411], [0.7433, 1.4451, 2.4304],\n                  [0.5007, 0.5007, 0.5007], [0.4587, 2.0875, 2.7544],\n                  [0.4450, 0.4450, 0.4450], [0.5514, 1.7206, 2.6811]],\n                 [[0.0000, 0.0000, 0.0000], [0.0000, 1.6464, 1.6952],\n                  [0.0000, 1.5125, 1.5125], [1.0915, 1.0915, 1.0915],\n                  [0.8197, 0.8511, 1.4894], [0.7433, 0.8082, 0.8082],\n                  [0.8955, 1.3340, 1.3340], [0.4730, 0.4730, 0.4730],\n                  [0.7949, 1.3325, 1.3325], [0.7566, 1.3727, 1.3727]]]\n\nexpected_idx = [[[0, 3, 4], [1, 2, 0], [2, 
0, 3], [0, 3, 4], [2, 1, 0],\n                 [1, 2, 0], [0, 3, 4], [1, 2, 0], [0, 3, 4], [1, 2, 0]],\n                [[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4], [2, 1, 0],\n                 [2, 0, 3], [1, 0, 3], [0, 3, 4], [1, 0, 3], [1, 0, 3]]]\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support'))\n])\n@pytest.mark.parametrize('dtype,rtol', [(torch.float, 1e-8),\n                                        (torch.half, 1e-3)])\ndef test_three_nn(device, dtype, rtol):\n    dtype = torch.float\n    known_t = torch.tensor(known, dtype=dtype, device=device)\n    unknown_t = torch.tensor(unknown, dtype=dtype, device=device)\n\n    dist_t, idx_t = three_nn(unknown_t, known_t)\n    expected_dist_t = torch.tensor(expected_dist, dtype=dtype, device=device)\n    expected_idx_t = torch.tensor(expected_idx, device=device)\n\n    assert torch.allclose(dist_t, expected_dist_t, atol=1e-4, rtol=rtol)\n    assert torch.all(idx_t == expected_idx_t)\n"
  },
  {
    "path": "tests/test_ops/test_tin_shift.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\n\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck\n\n    _USING_PARROTS = False\n\ncur_dir = os.path.dirname(os.path.abspath(__file__))\n\ninputs = ([[[[0.88572276, 0.46422583], [0.97408265, 0.59547687],\n             [0.030812204, 0.96236038], [0.75418317, 0.44058233],\n             [0.33279222, 0.00084149837], [0.7069388, 0.23255438],\n             [0.13547045, 0.81549376], [0.40174931, 0.36317211]],\n            [[0.57444429, 0.15905505], [0.39897251, 0.25790238],\n             [0.93282568, 0.18451685], [0.92526674, 0.18283755],\n             [0.31664443, 0.59323865], [0.1957739, 0.42505842],\n             [0.081158757, 0.81340349], [0.43456328, 0.30195212]],\n            [[0.8198145, 0.05990988], [0.98062474, 0.34803438],\n             [0.10412294, 0.37183142], [0.15021622, 0.038857818],\n             [0.40985721, 0.42253625], [0.71150124, 0.59778064],\n             [0.83851069, 0.15194464], [0.097513378, 0.74820143]],\n            [[0.80680406, 0.49327564], [0.17821097, 0.12980539],\n             [0.50657678, 0.14446253], [0.04178369, 0.53071898],\n             [0.84983683, 0.3826949], [0.32193625, 0.91275406],\n             [0.75628334, 0.52934098], [0.27994192, 0.3053292]]],\n           [[[0.082397044, 0.4210068], [0.23563534, 0.7938987],\n             [0.63669145, 0.69397897], [0.8844561, 0.97854084],\n             [0.79027033, 0.60640401], [0.63528901, 0.72172403],\n             [0.0097346902, 0.70800996], [0.87891227, 0.13674974]],\n            [[0.74329448, 0.0243572], [0.82178867, 0.85750699],\n             [0.7568835, 0.73146772], [0.5031184, 0.30479157],\n             [0.28713053, 0.47414285], [0.4682079, 0.067471564],\n             [0.48368263, 0.14590704], 
[0.25397325, 0.19946373]],\n            [[0.4291026, 0.068739474], [0.7159555, 0.79903615],\n             [0.76412082, 0.85348046], [0.081224024, 0.82264912],\n             [0.97173303, 0.24291694], [0.48957139, 0.43488795],\n             [0.67382395, 0.21889746], [0.36712623, 0.67127824]],\n            [[0.12054044, 0.18096751], [0.86675781, 0.54755616],\n             [0.68208277, 0.15164375], [0.79991871, 0.80811197],\n             [0.85256428, 0.68253738], [0.185983, 0.95642138],\n             [0.48102546, 0.28009653], [0.35726011, 0.58168036]]]])\n\nshifts = [([[1, 0, 1, -2], [-2, 1, -1, 1]]), ([[2, 1, 2, -1], [-1, 2, 0, 2]])]\n\noutputs = [([[[[0.0, 0.0], [0.0, 0.0], [0.030812, 0.96236], [0.75418, 0.44058],\n               [0.0, 0.0], [0.0, 0.0], [0.83851, 0.15194], [0.097513, 0.7482]],\n              [[0.88572, 0.46423], [0.97408, 0.59548], [0.93283, 0.18452],\n               [0.92527, 0.18284], [0.33279, 0.0008415], [0.70694, 0.23255],\n               [0.75628, 0.52934], [0.27994, 0.30533]],\n              [[0.57444, 0.15906], [0.39897, 0.2579], [0.10412, 0.37183],\n               [0.15022, 0.038858], [0.31664, 0.59324], [0.19577, 0.42506],\n               [0.0, 0.0], [0.0, 0.0]],\n              [[0.81981, 0.05991], [0.98062, 0.34803], [0.50658, 0.14446],\n               [0.041784, 0.53072], [0.40986, 0.42254], [0.7115, 0.59778],\n               [0.0, 0.0], [0.0, 0.0]]],\n             [[[0.4291, 0.068739], [0.71596, 0.79904], [0.0, 0.0], [0.0, 0.0],\n               [0.28713, 0.47414], [0.46821, 0.067472], [0.0, 0.0], [0.0,\n                                                                     0.0]],\n              [[0.12054, 0.18097], [0.86676, 0.54756], [0.63669, 0.69398],\n               [0.88446, 0.97854], [0.97173, 0.24292], [0.48957, 0.43489],\n               [0.0097347, 0.70801], [0.87891, 0.13675]],\n              [[0.0, 0.0], [0.0, 0.0], [0.75688, 0.73147], [0.50312, 0.30479],\n               [0.85256, 0.68254], [0.18598, 0.95642], [0.48368, 
0.14591],\n               [0.25397, 0.19946]],\n              [[0.0, 0.0], [0.0, 0.0], [0.76412, 0.85348], [0.081224, 0.82265],\n               [0.0, 0.0], [0.0, 0.0], [0.67382, 0.2189], [0.36713,\n                                                           0.67128]]]]),\n           ([[[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0],\n               [0.0, 0.0], [0.081159, 0.8134], [0.43456, 0.30195]],\n              [[0.0, 0.0], [0.0, 0.0], [0.030812, 0.96236], [0.75418, 0.44058],\n               [0.0, 0.0], [0.0, 0.0], [0.83851, 0.15194], [0.097513, 0.7482]],\n              [[0.88572, 0.46423], [0.97408, 0.59548], [0.93283, 0.18452],\n               [0.92527, 0.18284], [0.33279, 0.0008415], [0.70694, 0.23255],\n               [0.75628, 0.52934], [0.27994, 0.30533]],\n              [[0.57444, 0.15906], [0.39897, 0.2579], [0.10412, 0.37183],\n               [0.15022, 0.038858], [0.31664, 0.59324], [0.19577, 0.42506],\n               [0.0, 0.0], [0.0, 0.0]]],\n             [[[0.74329, 0.024357], [0.82179, 0.85751], [0.0, 0.0], [0.0, 0.0],\n               [0.79027, 0.6064], [0.63529, 0.72172], [0.0, 0.0], [0.0, 0.0]],\n              [[0.4291, 0.068739], [0.71596, 0.79904], [0.0, 0.0], [0.0, 0.0],\n               [0.28713, 0.47414], [0.46821, 0.067472], [0.0, 0.0], [0.0,\n                                                                     0.0]],\n              [[0.12054, 0.18097], [0.86676, 0.54756], [0.63669, 0.69398],\n               [0.88446, 0.97854], [0.97173, 0.24292], [0.48957, 0.43489],\n               [0.0097347, 0.70801], [0.87891, 0.13675]],\n              [[0.0, 0.0], [0.0, 0.0], [0.75688, 0.73147], [0.50312, 0.30479],\n               [0.85256, 0.68254], [0.18598, 0.95642], [0.48368, 0.14591],\n               [0.25397, 0.19946]]]])]\n\ngrads = [\n    [[[[0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.],\n       [1., 1.]],\n      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],\n       [1., 1.]],\n      
[[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [0., 0.],\n       [0., 0.]],\n      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [0., 0.],\n       [0., 0.]]],\n     [[[1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.],\n       [0., 0.]],\n      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],\n       [1., 1.]],\n      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],\n       [1., 1.]],\n      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.],\n       [1., 1.]]]],\n    [[[[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.], [1., 1.],\n       [1., 1.]],\n      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.],\n       [1., 1.]],\n      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],\n       [1., 1.]],\n      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [0., 0.],\n       [0., 0.]]],\n     [[[1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.],\n       [0., 0.]],\n      [[1., 1.], [1., 1.], [0., 0.], [0., 0.], [1., 1.], [1., 1.], [0., 0.],\n       [0., 0.]],\n      [[1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],\n       [1., 1.]],\n      [[0., 0.], [0., 0.], [1., 1.], [1., 1.], [1., 1.], [1., 1.], [1., 1.],\n       [1., 1.]]]]\n]\n\n\ndef _test_tinshift_gradcheck(device, dtype):\n    try:\n        from mmcv.ops import tin_shift\n    except ModuleNotFoundError:\n        pytest.skip('TINShift op is not successfully compiled')\n\n    if dtype == torch.half:\n        pytest.skip('\"add_cpu/sub_cpu\" not implemented for Half')\n\n    for shift in shifts:\n        np_input = np.array(inputs)\n        np_shift = np.array(shift)\n\n        x = torch.tensor(\n            np_input, dtype=dtype, device=device, requires_grad=True)\n        shift = torch.tensor(np_shift, device=device).int()\n        if torch.__version__ == 'parrots':\n            gradcheck(tin_shift, 
(x, shift))\n        else:\n            gradcheck(tin_shift, (x, shift), atol=1, rtol=0.1)\n\n\ndef _test_tinshift_allclose(device, dtype):\n    try:\n        from mmcv.ops import tin_shift\n    except ModuleNotFoundError:\n        pytest.skip('TINShift op is not successfully compiled')\n\n    for shift, output, grad in zip(shifts, outputs, grads):\n        np_input = np.array(inputs)\n        np_shift = np.array(shift)\n        np_output = np.array(output)\n        np_grad = np.array(grad)\n\n        x = torch.tensor(\n            np_input, dtype=dtype, device=device, requires_grad=True)\n        shift = torch.tensor(np_shift, device=device).int()\n\n        output = tin_shift(x, shift)\n        output.backward(torch.ones_like(output))\n        assert np.allclose(\n            output.data.type(torch.float).cpu().numpy(), np_output, 1e-3)\n        assert np.allclose(\n            x.grad.data.type(torch.float).cpu().numpy(), np_grad, 1e-3)\n\n\ndef _test_tinshift_assert(device, dtype):\n    try:\n        from mmcv.ops import tin_shift\n    except ModuleNotFoundError:\n        pytest.skip('TINShift op is not successfully compiled')\n\n    inputs = [\n        torch.rand(2, 3, 4, 2),\n        torch.rand(2, 3, 4, 2),\n        torch.rand(1, 3, 4, 2)\n    ]\n    shifts = [torch.rand(2, 3), torch.rand(2, 5)]\n\n    for x, shift in zip(inputs, shifts):\n        x = x.to(device).type(dtype)\n        shift = shift.to(device).type(dtype)\n\n        # A ValueError should be raised if ops get inputs with wrong shapes.\n        with pytest.raises(ValueError):\n            tin_shift(x, shift)\n\n\n@pytest.mark.parametrize('device', [\n    pytest.param(\n        'cuda',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n    pytest.param(\n        'mlu',\n        marks=pytest.mark.skipif(\n            not IS_MLU_AVAILABLE, reason='requires MLU support')),\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n   
         not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\n@pytest.mark.parametrize('dtype', [\n    torch.float,\n    pytest.param(\n        torch.double,\n        marks=pytest.mark.skipif(\n            IS_MLU_AVAILABLE or IS_MUSA_AVAILABLE,\n            reason='MLU, MUSA does not support for 64-bit floating point')),\n    torch.half\n])\ndef test_tinshift(device, dtype):\n    _test_tinshift_allclose(device=device, dtype=dtype)\n    _test_tinshift_gradcheck(device=device, dtype=dtype)\n    _test_tinshift_assert(device=device, dtype=dtype)\n"
  },
  {
    "path": "tests/test_ops/test_upfirdn2d.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\n\n_USING_PARROTS = True\ntry:\n    from parrots.autograd import gradcheck\nexcept ImportError:\n    from torch.autograd import gradcheck, gradgradcheck\n    _USING_PARROTS = False\n\n\nclass TestUpFirDn2d:\n    \"\"\"Unit test for UpFirDn2d.\n\n    Here, we just test the basic case of upsample version. More general tests\n    will be included in other unit test for UpFirDnUpsample and\n    UpFirDnDownSample modules.\n    \"\"\"\n\n    @classmethod\n    def setup_class(cls):\n        kernel_1d = torch.tensor([1., 3., 3., 1.])\n        cls.kernel = kernel_1d[:, None] * kernel_1d[None, :]\n        cls.kernel = cls.kernel / cls.kernel.sum()\n        cls.factor = 2\n        pad = cls.kernel.shape[0] - cls.factor\n        cls.pad = ((pad + 1) // 2 + cls.factor - 1, pad // 2)\n\n        cls.input_tensor = torch.randn((2, 3, 4, 4), requires_grad=True)\n\n    @pytest.mark.skipif(not torch.cuda.is_available(), reason='requires cuda')\n    def test_upfirdn2d(self):\n        from mmcv.ops import upfirdn2d\n        if _USING_PARROTS:\n            gradcheck(\n                upfirdn2d,\n                (self.input_tensor.cuda(),\n                 self.kernel.type_as(\n                     self.input_tensor).cuda(), self.factor, 1, self.pad),\n                delta=1e-4,\n                pt_atol=1e-3)\n        else:\n            gradcheck(\n                upfirdn2d,\n                (self.input_tensor.cuda(),\n                 self.kernel.type_as(\n                     self.input_tensor).cuda(), self.factor, 1, self.pad),\n                eps=1e-4,\n                atol=1e-3)\n\n            gradgradcheck(\n                upfirdn2d,\n                (self.input_tensor.cuda(),\n                 self.kernel.type_as(\n                     self.input_tensor).cuda(), self.factor, 1, self.pad),\n                eps=1e-4,\n                atol=1e-3)\n\n        # test with different up\n        
kernel = torch.randn(3, 3)\n        out = upfirdn2d(\n            self.input_tensor.cuda(), filter=kernel.cuda(), up=2, padding=1)\n        assert out.shape == (2, 3, 8, 8)\n\n        # test with different down\n        input_tensor = torch.randn(2, 3, 8, 8)\n        out = upfirdn2d(\n            input_tensor.cuda(), filter=self.kernel.cuda(), down=2, padding=1)\n        assert out.shape == (2, 3, 4, 4)\n\n        # test with different flip_filter\n        out = upfirdn2d(\n            self.input_tensor.cuda(),\n            filter=self.kernel.cuda(),\n            flip_filter=True)\n        assert out.shape == (2, 3, 1, 1)\n\n        # test with different gain\n        out1 = upfirdn2d(\n            self.input_tensor.cuda(), filter=self.kernel.cuda(), gain=0.2)\n        out2 = upfirdn2d(\n            self.input_tensor.cuda(), filter=self.kernel.cuda(), gain=0.1)\n        assert torch.allclose(out1, out2 * 2)\n"
  },
  {
    "path": "tests/test_ops/test_voxelization.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\nimport torch\n\nfrom mmcv.ops import Voxelization\nfrom mmcv.utils import (IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MUSA_AVAILABLE,\n                        IS_NPU_AVAILABLE)\n\n\ndef _get_voxel_points_indices(points, coors, voxel):\n    result_form = np.equal(coors, voxel)\n    return result_form[:, 0] & result_form[:, 1] & result_form[:, 2]\n\n\n@pytest.mark.parametrize('device_type', [\n    'cpu',\n    pytest.param(\n        'cuda:0',\n        marks=pytest.mark.skipif(\n            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))\n])\ndef test_voxelization(device_type):\n    voxel_size = [0.5, 0.5, 0.5]\n    point_cloud_range = [0, -40, -3, 70.4, 40, 1]\n\n    voxel_dict = np.load(\n        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()\n    expected_coors = voxel_dict['coors']\n    expected_voxels = voxel_dict['voxels']\n    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']\n    points = voxel_dict['points']\n\n    points = torch.tensor(points)\n    max_num_points = -1\n    dynamic_voxelization = Voxelization(voxel_size, point_cloud_range,\n                                        max_num_points)\n    max_num_points = 1000\n    hard_voxelization = Voxelization(voxel_size, point_cloud_range,\n                                     max_num_points)\n\n    device = torch.device(device_type)\n\n    # test hard_voxelization on cpu/gpu\n    points = points.contiguous().to(device)\n    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy()\n    voxels = voxels.cpu().detach().numpy()\n    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()\n    assert np.all(coors == expected_coors)\n    assert np.all(voxels == expected_voxels)\n    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)\n\n    # test dynamic_voxelization on cpu/gpu\n    coors = 
dynamic_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy()\n    points = points.cpu().detach().numpy()\n    for i in range(expected_voxels.shape[0]):\n        indices = _get_voxel_points_indices(points, coors, expected_voxels[i])\n        num_points_current_voxel = points[indices].shape[0]\n        assert num_points_current_voxel > 0\n        assert np.all(\n            points[indices] == expected_coors[i][:num_points_current_voxel])\n        assert num_points_current_voxel == expected_num_points_per_voxel[i]\n\n\n@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')\ndef test_voxelization_nondeterministic():\n    voxel_size = [0.5, 0.5, 0.5]\n    point_cloud_range = [0, -40, -3, 70.4, 40, 1]\n\n    voxel_dict = np.load(\n        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()\n    points = voxel_dict['points']\n\n    points = torch.tensor(points)\n    max_num_points = -1\n    dynamic_voxelization = Voxelization(voxel_size, point_cloud_range,\n                                        max_num_points)\n\n    max_num_points = 10\n    max_voxels = 50\n    hard_voxelization = Voxelization(\n        voxel_size,\n        point_cloud_range,\n        max_num_points,\n        max_voxels,\n        deterministic=False)\n\n    # test hard_voxelization (non-deterministic version) on gpu\n    points = torch.tensor(points).contiguous().to(device='cuda:0')\n    voxels, coors, num_points_per_voxel = hard_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy().tolist()\n    voxels = voxels.cpu().detach().numpy().tolist()\n    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy().tolist()\n\n    coors_all = dynamic_voxelization.forward(points)\n    coors_all = coors_all.cpu().detach().numpy().tolist()\n\n    coors_set = {tuple(c) for c in coors}\n    coors_all_set = {tuple(c) for c in coors_all}\n\n    assert len(coors_set) == len(coors)\n    assert len(coors_set - coors_all_set) == 0\n\n    points = 
points.cpu().detach().numpy().tolist()\n\n    coors_points_dict = {}\n    for c, ps in zip(coors_all, points):\n        if tuple(c) not in coors_points_dict:\n            coors_points_dict[tuple(c)] = set()\n        coors_points_dict[tuple(c)].add(tuple(ps))\n\n    for c, ps, n in zip(coors, voxels, num_points_per_voxel):\n        ideal_voxel_points_set = coors_points_dict[tuple(c)]\n        voxel_points_set = {tuple(p) for p in ps[:n]}\n        assert len(voxel_points_set) == n\n        if n < max_num_points:\n            assert voxel_points_set == ideal_voxel_points_set\n            for p in ps[n:]:\n                assert max(p) == min(p) == 0\n        else:\n            assert len(voxel_points_set - ideal_voxel_points_set) == 0\n\n    # test hard_voxelization (non-deterministic version) on gpu\n    # with all input point in range\n    points = torch.tensor(points).contiguous().to(device='cuda:0')[:max_voxels]\n    coors_all = dynamic_voxelization.forward(points)\n    valid_mask = coors_all.ge(0).all(-1)\n    points = points[valid_mask]\n    coors_all = coors_all[valid_mask]\n    coors_all = coors_all.cpu().detach().numpy().tolist()\n\n    voxels, coors, num_points_per_voxel = hard_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy().tolist()\n\n    coors_set = {tuple(c) for c in coors}\n    coors_all_set = {tuple(c) for c in coors_all}\n\n    assert len(coors_set) == len(coors) == len(coors_all_set)\n\n\n@pytest.mark.parametrize(\n    'device_type',\n    [\n        pytest.param(\n            # this is only used for dipu device testing case.\n            # dipu will mock to cuda automatically on mlu physical device.\n            'cuda:0',\n            marks=pytest.mark.skipif(\n                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),\n        pytest.param(\n            'mlu',\n            marks=pytest.mark.skipif(\n                not IS_MLU_AVAILABLE, reason='requires MLU support'))\n    ])\ndef 
test_voxelization_mlu(device_type):\n    voxel_size = [0.5, 0.5, 0.5]\n    point_cloud_range = [0, -40, -3, 70.4, 40, 1]\n\n    voxel_dict = np.load(\n        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()\n    expected_coors = voxel_dict['coors']\n    expected_voxels = voxel_dict['voxels']\n    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']\n    points = voxel_dict['points']\n\n    points = torch.tensor(points)\n    max_num_points = 1000\n    hard_voxelization = Voxelization(voxel_size, point_cloud_range,\n                                     max_num_points)\n\n    device = torch.device(device_type)\n\n    # test hard_voxelization on mlu\n    points = points.contiguous().to(device)\n    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy()\n    voxels = voxels.cpu().detach().numpy()\n    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()\n    assert np.all(coors == expected_coors)\n    assert np.all(voxels == expected_voxels)\n    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)\n\n\n@pytest.mark.parametrize('device_type', [\n    pytest.param(\n        'npu',\n        marks=pytest.mark.skipif(\n            not IS_NPU_AVAILABLE, reason='requires NPU support'))\n])\ndef test_voxelization_npu(device_type):\n    voxel_size = [0.5, 0.5, 0.5]\n    point_cloud_range = [0, -40, -3, 70.4, 40, 1]\n\n    voxel_dict = np.load(\n        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()\n    expected_coors = voxel_dict['coors']\n    expected_voxels = voxel_dict['voxels']\n    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']\n    points = voxel_dict['points']\n\n    points = torch.tensor(points)\n    max_num_points = 1000\n    hard_voxelization = Voxelization(voxel_size, point_cloud_range,\n                                     max_num_points)\n\n    device = torch.device(device_type)\n\n    # test 
hard_voxelization on npu\n    points = points.contiguous().to(device)\n    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy()\n    voxels = voxels.cpu().detach().numpy()\n    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()\n    assert np.all(coors == expected_coors)\n    assert np.all(voxels == expected_voxels)\n    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)\n\n\n@pytest.mark.parametrize('device_type', [\n    pytest.param(\n        'musa',\n        marks=pytest.mark.skipif(\n            not IS_MUSA_AVAILABLE, reason='requires MUSA support')),\n])\ndef test_voxelization_musa(device_type):\n    voxel_size = [0.5, 0.5, 0.5]\n    point_cloud_range = [0, -40, -3, 70.4, 40, 1]\n\n    voxel_dict = np.load(\n        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()\n    expected_coors = voxel_dict['coors']\n    expected_voxels = voxel_dict['voxels']\n    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']\n    points = voxel_dict['points']\n\n    points = torch.tensor(points)\n    max_num_points = 1000\n    hard_voxelization = Voxelization(voxel_size, point_cloud_range,\n                                     max_num_points)\n\n    device = torch.device(device_type)\n\n    # test hard_voxelization on musa\n    points = points.contiguous().to(device)\n    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)\n    coors = coors.cpu().detach().numpy()\n    voxels = voxels.cpu().detach().numpy()\n    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()\n    assert np.all(coors == expected_coors)\n    assert np.all(voxels == expected_voxels)\n    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)\n"
  },
  {
    "path": "tests/test_transforms/test_transforms_formatting.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\ntry:\n    import torch\nexcept ModuleNotFoundError:\n    torch = None\nelse:\n    from mmcv.transforms import ToTensor, to_tensor, ImageToTensor\n\nimport copy\n\nimport numpy as np\nimport pytest\n\n\n@pytest.mark.skipif(condition=torch is None, reason='No torch in current env')\ndef test_to_tensor():\n\n    # The type of the input object is torch.Tensor\n    data_tensor = torch.tensor([1, 2, 3])\n    tensor_from_tensor = to_tensor(data_tensor)\n    assert isinstance(tensor_from_tensor, torch.Tensor)\n\n    # The type of the input object is numpy.ndarray\n    data_numpy = np.array([1, 2, 3])\n    tensor_from_numpy = to_tensor(data_numpy)\n    assert isinstance(tensor_from_numpy, torch.Tensor)\n\n    # The type of the input object is list\n    data_list = [1, 2, 3]\n    tensor_from_list = to_tensor(data_list)\n    assert isinstance(tensor_from_list, torch.Tensor)\n\n    # The type of the input object is int\n    data_int = 1\n    tensor_from_int = to_tensor(data_int)\n    assert isinstance(tensor_from_int, torch.Tensor)\n\n    # The type of the input object is float\n    data_float = 1.0\n    tensor_from_float = to_tensor(data_float)\n    assert isinstance(tensor_from_float, torch.Tensor)\n\n    # The type of the input object is invalid\n    with pytest.raises(TypeError):\n        data_str = '123'\n        _ = to_tensor(data_str)\n\n\n@pytest.mark.skipif(condition=torch is None, reason='No torch in current env')\nclass TestToTensor:\n\n    def test_init(self):\n        TRANSFORM = ToTensor(keys=['img_label'])\n        assert TRANSFORM.keys == ['img_label']\n\n    def test_transform(self):\n        TRANSFORMS = ToTensor(['instances.bbox', 'img_label'])\n\n        # Test multi-level key and single-level key (multi-level key is\n        # not in results)\n        with pytest.raises(KeyError):\n            results = {'instances': {'label': [1]}, 'img_label': [1]}\n            results_tensor = 
TRANSFORMS.transform(copy.deepcopy(results))\n            assert isinstance(results_tensor['instances']['label'], list)\n            assert isinstance(results_tensor['img_label'], torch.Tensor)\n\n        # Test multi-level key (multi-level key is in results)\n        results = {'instances': {'bbox': [[0, 0, 10, 10]]}, 'img_label': [1]}\n        results_tensor = TRANSFORMS.transform(copy.deepcopy(results))\n        assert isinstance(results_tensor['instances']['bbox'], torch.Tensor)\n\n    def test_repr(self):\n        TRANSFORMS = ToTensor(['instances.bbox', 'img_label'])\n        TRANSFORMS_str = str(TRANSFORMS)\n        isinstance(TRANSFORMS_str, str)\n\n\n@pytest.mark.skipif(condition=torch is None, reason='No torch in current env')\nclass TestImageToTensor:\n\n    def test_init(self):\n        TRANSFORMS = ImageToTensor(['img'])\n        assert TRANSFORMS.keys == ['img']\n\n    def test_transform(self):\n        TRANSFORMS = ImageToTensor(['img'])\n\n        # image only has one channel\n        results = {'img': np.zeros((224, 224))}\n        results = TRANSFORMS.transform(results)\n        assert results['img'].shape == (1, 224, 224)\n\n        # image has three channels\n        results = {'img': np.zeros((224, 224, 3))}\n        results = TRANSFORMS.transform(results)\n        assert results['img'].shape == (3, 224, 224)\n\n    def test_repr(self):\n        TRANSFORMS = ImageToTensor(['img'])\n        TRANSFORMS_str = str(TRANSFORMS)\n        assert isinstance(TRANSFORMS_str, str)\n"
  },
  {
    "path": "tests/test_transforms/test_transforms_loading.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport os.path as osp\n\nimport numpy as np\nimport pytest\n\nfrom mmcv.transforms import LoadAnnotations, LoadImageFromFile\n\n\nclass TestLoadImageFromFile:\n\n    def test_load_img(self):\n        # file_client_args and backend_args can not be both set\n        with pytest.raises(\n                ValueError,\n                match='\"file_client_args\" and \"backend_args\" cannot be set'):\n            LoadImageFromFile(\n                file_client_args={'backend': 'disk'},\n                backend_args={'backend': 'disk'})\n        data_prefix = osp.join(osp.dirname(__file__), '../data')\n\n        results = dict(img_path=osp.join(data_prefix, 'color.jpg'))\n        transform = LoadImageFromFile()\n        results = transform(copy.deepcopy(results))\n        assert results['img_path'] == osp.join(data_prefix, 'color.jpg')\n        assert results['img'].shape == (300, 400, 3)\n        assert results['img'].dtype == np.uint8\n        assert results['img_shape'] == (300, 400)\n        assert results['ori_shape'] == (300, 400)\n        assert repr(transform) == transform.__class__.__name__ + \\\n            \"(ignore_empty=False, to_float32=False, color_type='color', \" + \\\n            \"imdecode_backend='cv2', backend_args=None)\"\n\n        # to_float32\n        transform = LoadImageFromFile(to_float32=True)\n        results = transform(copy.deepcopy(results))\n        assert results['img'].dtype == np.float32\n\n        # gray image\n        results = dict(img_path=osp.join(data_prefix, 'grayscale.jpg'))\n        transform = LoadImageFromFile()\n        results = transform(copy.deepcopy(results))\n        assert results['img'].shape == (300, 400, 3)\n        assert results['img'].dtype == np.uint8\n\n        transform = LoadImageFromFile(color_type='unchanged')\n        results = transform(copy.deepcopy(results))\n        assert results['img'].shape == (300, 400)\n        assert 
results['img'].dtype == np.uint8\n\n        # test load empty\n        fake_img_path = osp.join(data_prefix, 'fake.jpg')\n        results['img_path'] = fake_img_path\n        transform = LoadImageFromFile(ignore_empty=False)\n        with pytest.raises(FileNotFoundError):\n            transform(copy.deepcopy(results))\n        transform = LoadImageFromFile(ignore_empty=True)\n        assert transform(copy.deepcopy(results)) is None\n\n\nclass TestLoadAnnotations:\n\n    def setup_class(cls):\n        data_prefix = osp.join(osp.dirname(__file__), '../data')\n        seg_map = osp.join(data_prefix, 'grayscale.jpg')\n        cls.results = {\n            'seg_map_path':\n            seg_map,\n            'instances': [{\n                'bbox': [0, 0, 10, 20],\n                'bbox_label': 1,\n                'keypoints': [1, 2, 3]\n            }, {\n                'bbox': [10, 10, 110, 120],\n                'bbox_label': 2,\n                'keypoints': [4, 5, 6]\n            }]\n        }\n\n    def test_init(self):\n        # file_client_args and backend_args can not be both set\n        with pytest.raises(\n                ValueError,\n                match='\"file_client_args\" and \"backend_args\" cannot be set'):\n            LoadAnnotations(\n                file_client_args={'backend': 'disk'},\n                backend_args={'backend': 'disk'})\n\n    def test_load_bboxes(self):\n        transform = LoadAnnotations(\n            with_bbox=True,\n            with_label=False,\n            with_seg=False,\n            with_keypoints=False,\n        )\n        results = transform(copy.deepcopy(self.results))\n        assert 'gt_bboxes' in results\n        assert (results['gt_bboxes'] == np.array([[0, 0, 10, 20],\n                                                  [10, 10, 110, 120]])).all()\n        assert results['gt_bboxes'].dtype == np.float32\n\n    def test_load_labels(self):\n        transform = LoadAnnotations(\n            with_bbox=False,\n            
with_label=True,\n            with_seg=False,\n            with_keypoints=False,\n        )\n        results = transform(copy.deepcopy(self.results))\n        assert 'gt_bboxes_labels' in results\n        assert (results['gt_bboxes_labels'] == np.array([1, 2])).all()\n        assert results['gt_bboxes_labels'].dtype == np.int64\n\n    def test_load_kps(self):\n        transform = LoadAnnotations(\n            with_bbox=False,\n            with_label=False,\n            with_seg=False,\n            with_keypoints=True,\n        )\n        results = transform(copy.deepcopy(self.results))\n        assert 'gt_keypoints' in results\n        assert (results['gt_keypoints'] == np.array([[[1, 2, 3]],\n                                                     [[4, 5, 6]]])).all()\n        assert results['gt_keypoints'].dtype == np.float32\n\n    def test_load_seg_map(self):\n        transform = LoadAnnotations(\n            with_bbox=False,\n            with_label=False,\n            with_seg=True,\n            with_keypoints=False,\n        )\n        results = transform(copy.deepcopy(self.results))\n        assert 'gt_seg_map' in results\n        assert results['gt_seg_map'].shape[:2] == (300, 400)\n        assert results['gt_seg_map'].dtype == np.uint8\n\n    def test_repr(self):\n        transform = LoadAnnotations(\n            with_bbox=True,\n            with_label=False,\n            with_seg=False,\n            with_keypoints=False,\n        )\n        assert repr(transform) == (\n            'LoadAnnotations(with_bbox=True, '\n            'with_label=False, with_seg=False, '\n            \"with_keypoints=False, imdecode_backend='cv2', \"\n            'backend_args=None)')\n"
  },
  {
    "path": "tests/test_transforms/test_transforms_processing.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport copy\nimport os.path as osp\nfrom unittest.mock import Mock\n\nimport numpy as np\nimport pytest\n\nimport mmcv\nfrom mmcv.transforms import (TRANSFORMS, Normalize, Pad, RandomFlip,\n                             RandomResize, Resize, TestTimeAug)\nfrom mmcv.transforms.base import BaseTransform\n\ntry:\n    import torch\nexcept ModuleNotFoundError:\n    torch = None\nelse:\n    import torchvision\n\nfrom numpy.testing import assert_array_almost_equal, assert_array_equal\nfrom PIL import Image\n\n\nclass TestNormalize:\n\n    def test_normalize(self):\n        img_norm_cfg = dict(\n            mean=[123.675, 116.28, 103.53],\n            std=[58.395, 57.12, 57.375],\n            to_rgb=True)\n        transform = Normalize(**img_norm_cfg)\n        results = dict()\n        img = mmcv.imread(\n            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')\n        original_img = copy.deepcopy(img)\n        results['img'] = img\n        results = transform(results)\n        mean = np.array(img_norm_cfg['mean'])\n        std = np.array(img_norm_cfg['std'])\n        converted_img = (original_img[..., ::-1] - mean) / std\n        assert np.allclose(results['img'], converted_img)\n\n    def test_repr(self):\n        img_norm_cfg = dict(\n            mean=[123.675, 116.28, 103.53],\n            std=[58.395, 57.12, 57.375],\n            to_rgb=True)\n        transform = Normalize(**img_norm_cfg)\n        assert repr(transform) == ('Normalize(mean=[123.675 116.28  103.53 ], '\n                                   'std=[58.395 57.12  57.375], to_rgb=True)')\n\n\nclass TestResize:\n\n    def test_resize(self):\n        data_info = dict(\n            img=np.random.random((1333, 800, 3)),\n            gt_seg_map=np.random.random((1333, 800, 3)),\n            gt_bboxes=np.array([[0, 0, 112, 112]]),\n            gt_keypoints=np.array([[[20, 50, 1]]]))\n\n        with pytest.raises(AssertionError):\n    
        transform = Resize(scale=None, scale_factor=None)\n        with pytest.raises(TypeError):\n            transform = Resize(scale_factor=[])\n        # test scale is int\n        transform = Resize(scale=2000)\n        results = transform(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (2000, 2000)\n        assert results['scale_factor'] == (2000 / 800, 2000 / 1333)\n\n        # test scale is tuple\n        transform = Resize(scale=(2000, 2000))\n        results = transform(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (2000, 2000)\n        assert results['scale_factor'] == (2000 / 800, 2000 / 1333)\n\n        # test scale_factor is float\n        transform = Resize(scale_factor=2.0)\n        results = transform(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (2666, 1600)\n        assert results['scale_factor'] == (2.0, 2.0)\n\n        # test scale_factor is tuple\n        transform = Resize(scale_factor=(1.5, 2))\n        results = transform(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (2666, 1200)\n        assert results['scale_factor'] == (1.5, 2)\n\n        # test keep_ratio is True\n        transform = Resize(scale=(2000, 2000), keep_ratio=True)\n        results = transform(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (2000, 1200)\n        assert results['scale_factor'] == (1200 / 800, 2000 / 1333)\n\n        # test resize_bboxes/seg/kps\n        transform = Resize(scale_factor=(1.5, 2))\n        results = transform(copy.deepcopy(data_info))\n        assert (results['gt_bboxes'] == np.array([[0, 0, 168, 224]])).all()\n        assert (results['gt_keypoints'] == np.array([[[30, 100, 1]]])).all()\n        assert results['gt_seg_map'].shape[:2] == (2666, 1200)\n\n        # test clip_object_border = False\n        data_info = dict(\n            img=np.random.random((300, 400, 3)),\n            gt_bboxes=np.array([[200, 150, 600, 450]]))\n      
  transform = Resize(scale=(200, 150), clip_object_border=False)\n        results = transform(data_info)\n        assert (results['gt_bboxes'] == np.array([100, 75, 300, 225])).all()\n\n    def test_repr(self):\n        transform = Resize(scale=(2000, 2000), keep_ratio=True)\n        assert repr(transform) == ('Resize(scale=(2000, 2000), '\n                                   'scale_factor=None, keep_ratio=True, '\n                                   'clip_object_border=True), backend=cv2), '\n                                   'interpolation=bilinear)')\n\n\nclass TestPad:\n\n    def test_pad(self):\n        # test size and size_divisor are both set\n        with pytest.raises(AssertionError):\n            Pad(size=(10, 10), size_divisor=2)\n\n        # test size and size_divisor are both None\n        with pytest.raises(AssertionError):\n            Pad(size=None, size_divisor=None)\n\n        # test size and pad_to_square are both set\n        with pytest.raises(AssertionError):\n            Pad(size=(10, 10), pad_to_square=True)\n\n        # test pad_val is not int or tuple\n        with pytest.raises(AssertionError):\n            Pad(size=(10, 10), pad_val=[])\n\n        # test padding_mode is not 'constant', 'edge', 'reflect' or 'symmetric'\n        with pytest.raises(AssertionError):\n            Pad(size=(10, 10), padding_mode='edg')\n\n        data_info = dict(\n            img=np.random.random((1333, 800, 3)),\n            gt_seg_map=np.random.random((1333, 800, 3)),\n            gt_bboxes=np.array([[0, 0, 112, 112]]),\n            gt_keypoints=np.array([[[20, 50, 1]]]))\n\n        # test pad img / gt_seg_map with size\n        trans = Pad(size=(1200, 2000))\n        results = trans(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (2000, 1200)\n        assert results['gt_seg_map'].shape[:2] == (2000, 1200)\n\n        # test pad img/gt_seg_map with size_divisor\n        trans = Pad(size_divisor=11)\n        results = 
trans(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (1342, 803)\n        assert results['gt_seg_map'].shape[:2] == (1342, 803)\n\n        # test pad img/gt_seg_map with pad_to_square\n        trans = Pad(pad_to_square=True)\n        results = trans(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (1333, 1333)\n        assert results['gt_seg_map'].shape[:2] == (1333, 1333)\n\n        # test pad img/gt_seg_map with pad_to_square and size_divisor\n        trans = Pad(pad_to_square=True, size_divisor=11)\n        results = trans(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (1342, 1342)\n        assert results['gt_seg_map'].shape[:2] == (1342, 1342)\n\n        # test pad img/gt_seg_map with pad_to_square and size_divisor\n        trans = Pad(pad_to_square=True, size_divisor=11)\n        results = trans(copy.deepcopy(data_info))\n        assert results['img'].shape[:2] == (1342, 1342)\n        assert results['gt_seg_map'].shape[:2] == (1342, 1342)\n\n        # test padding_mode\n        new_img = np.ones((1333, 800, 3))\n        data_info['img'] = new_img\n        trans = Pad(pad_to_square=True, padding_mode='edge')\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'] == np.ones((1333, 1333, 3))).all()\n\n        # test pad_val is dict\n        # test rgb image, size=(2000, 2000)\n        trans = Pad(\n            size=(2000, 2000),\n            pad_val=dict(img=(12, 12, 12), seg=(10, 10, 10)))\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'][1333:2000, 800:2000, :] == 12).all()\n        assert (results['gt_seg_map'][1333:2000, 800:2000, :] == 10).all()\n\n        trans = Pad(size=(2000, 2000), pad_val=dict(img=(12, 12, 12)))\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'][1333:2000, 800:2000, :] == 12).all()\n        assert (results['gt_seg_map'][1333:2000, 800:2000, :] == 255).all()\n\n        # test 
rgb image, pad_to_square=True\n        trans = Pad(\n            pad_to_square=True,\n            pad_val=dict(img=(12, 12, 12), seg=(10, 10, 10)))\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'][:, 800:1333, :] == 12).all()\n        assert (results['gt_seg_map'][:, 800:1333, :] == 10).all()\n\n        trans = Pad(pad_to_square=True, pad_val=dict(img=(12, 12, 12)))\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'][:, 800:1333, :] == 12).all()\n        assert (results['gt_seg_map'][:, 800:1333, :] == 255).all()\n\n        # test pad_val is int\n        # test rgb image\n        trans = Pad(size=(2000, 2000), pad_val=12)\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'][1333:2000, 800:2000, :] == 12).all()\n        assert (results['gt_seg_map'][1333:2000, 800:2000, :] == 255).all()\n        # test gray image\n        new_img = np.random.random((1333, 800))\n        data_info['img'] = new_img\n        new_semantic_seg = np.random.random((1333, 800))\n        data_info['gt_seg_map'] = new_semantic_seg\n        trans = Pad(size=(2000, 2000), pad_val=12)\n        results = trans(copy.deepcopy(data_info))\n        assert (results['img'][1333:2000, 800:2000] == 12).all()\n        assert (results['gt_seg_map'][1333:2000, 800:2000] == 255).all()\n\n    def test_repr(self):\n        trans = Pad(pad_to_square=True, size_divisor=11, padding_mode='edge')\n        assert repr(trans) == (\n            'Pad(size=None, size_divisor=11, pad_to_square=True, '\n            \"pad_val={'img': 0, 'seg': 255}), padding_mode=edge)\")\n\n\nclass TestCenterCrop:\n\n    @classmethod\n    def setup_class(cls):\n        img = mmcv.imread(\n            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')\n        cls.original_img = copy.deepcopy(img)\n        seg = np.random.randint(0, 19, (300, 400)).astype(np.uint8)\n        cls.gt_semantic_map = copy.deepcopy(seg)\n\n    
@staticmethod\n    def reset_results(results, original_img, gt_semantic_map):\n        results['img'] = copy.deepcopy(original_img)\n        results['gt_seg_map'] = copy.deepcopy(gt_semantic_map)\n        results['gt_bboxes'] = np.array([[0, 0, 210, 160],\n                                         [200, 150, 400, 300]])\n        results['gt_keypoints'] = np.array([[[20, 50, 1]], [[200, 150, 1]],\n                                            [[300, 225, 1]]])\n        return results\n\n    @pytest.mark.skipif(\n        condition=torch is None, reason='No torch in current env')\n    def test_error(self):\n        # test assertion if size is smaller than 0\n        with pytest.raises(AssertionError):\n            transform = dict(type='CenterCrop', crop_size=-1)\n            TRANSFORMS.build(transform)\n\n        # test assertion if size is tuple but one value is smaller than 0\n        with pytest.raises(AssertionError):\n            transform = dict(type='CenterCrop', crop_size=(224, -1))\n            TRANSFORMS.build(transform)\n\n        # test assertion if size is tuple and len(size) < 2\n        with pytest.raises(AssertionError):\n            transform = dict(type='CenterCrop', crop_size=(224, ))\n            TRANSFORMS.build(transform)\n\n        # test assertion if size is tuple len(size) > 2\n        with pytest.raises(AssertionError):\n            transform = dict(type='CenterCrop', crop_size=(224, 224, 3))\n            TRANSFORMS.build(transform)\n\n    def test_repr(self):\n        # test repr\n        transform = dict(type='CenterCrop', crop_size=224)\n        center_crop_module = TRANSFORMS.build(transform)\n        assert isinstance(repr(center_crop_module), str)\n\n    def test_transform(self):\n        results = {}\n        self.reset_results(results, self.original_img, self.gt_semantic_map)\n\n        # test CenterCrop when size is int\n        transform = dict(type='CenterCrop', crop_size=224)\n        center_crop_module = 
TRANSFORMS.build(transform)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (224, 224)\n        assert (results['img'] == self.original_img[38:262, 88:312, ...]).all()\n        assert (results['gt_seg_map'] == self.gt_semantic_map[38:262,\n                                                              88:312]).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 122, 122], [112, 112, 224,\n                                                     224]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[0, 12, 0]], [[112, 112, 1]], [[212, 187, 1]]])).all()\n\n        # test CenterCrop when size is tuple\n        transform = dict(type='CenterCrop', crop_size=(224, 224))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (224, 224)\n        assert (results['img'] == self.original_img[38:262, 88:312, ...]).all()\n        assert (results['gt_seg_map'] == self.gt_semantic_map[38:262,\n                                                              88:312]).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 122, 122], [112, 112, 224,\n                                                     224]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[0, 12, 0]], [[112, 112, 1]], [[212, 187, 1]]])).all()\n\n        # test CenterCrop when crop_height != crop_width\n        transform = dict(type='CenterCrop', crop_size=(224, 256))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n    
    assert results['img_shape'] == (256, 224)\n        assert (results['img'] == self.original_img[22:278, 88:312, ...]).all()\n        assert (results['gt_seg_map'] == self.gt_semantic_map[22:278,\n                                                              88:312]).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 122, 138], [112, 128, 224,\n                                                     256]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[0, 28, 0]], [[112, 128, 1]], [[212, 203, 1]]])).all()\n\n        # test CenterCrop when crop_size is equal to img.shape\n        img_height, img_width, _ = self.original_img.shape\n        transform = dict(type='CenterCrop', crop_size=(img_width, img_height))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (300, 400)\n        assert (results['img'] == self.original_img).all()\n        assert (results['gt_seg_map'] == self.gt_semantic_map).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 210, 160], [200, 150, 400,\n                                                     300]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[20, 50, 1]], [[200, 150, 1]], [[300, 225, 1]]])).all()\n\n        # test CenterCrop when crop_size is larger than img.shape\n        transform = dict(\n            type='CenterCrop', crop_size=(img_width * 2, img_height * 2))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (300, 
400)\n        assert (results['img'] == self.original_img).all()\n        assert (results['gt_seg_map'] == self.gt_semantic_map).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 210, 160], [200, 150, 400,\n                                                     300]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[20, 50, 1]], [[200, 150, 1]], [[300, 225, 1]]])).all()\n\n        # test with padding\n        transform = dict(\n            type='CenterCrop',\n            crop_size=(img_width // 2, img_height * 2),\n            auto_pad=True,\n            pad_cfg=dict(type='Pad', padding_mode='constant', pad_val=12))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (600, 200)\n        assert results['img'].shape[:2] == results['gt_seg_map'].shape\n        assert (results['img'][300:600, 100:300, ...] 
== 12).all()\n        assert (results['gt_seg_map'][300:600, 100:300] == 255).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 110, 160], [100, 150, 200,\n                                                     300]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[0, 50, 0]], [[100, 150, 1]], [[200, 225, 0]]])).all()\n\n        transform = dict(\n            type='CenterCrop',\n            crop_size=(img_width // 2, img_height * 2),\n            auto_pad=True,\n            pad_cfg=dict(\n                type='Pad',\n                padding_mode='constant',\n                pad_val=dict(img=13, seg=33)))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (600, 200)\n        assert (results['img'][300:600, 100:300, ...] 
== 13).all()\n        assert (results['gt_seg_map'][300:600, 100:300] == 33).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 110, 160], [100, 150, 200,\n                                                     300]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[0, 50, 0]], [[100, 150, 1]], [[200, 225, 0]]])).all()\n\n        # test CenterCrop when crop_width is smaller than img_width\n        transform = dict(\n            type='CenterCrop', crop_size=(img_width // 2, img_height))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (img_height, img_width // 2)\n        assert (results['img'] == self.original_img[:, 100:300, ...]).all()\n        assert (results['gt_seg_map'] == self.gt_semantic_map[:,\n                                                              100:300]).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 110, 160], [100, 150, 200,\n                                                     300]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[0, 50, 0]], [[100, 150, 1]], [[200, 225, 0]]])).all()\n\n        # test CenterCrop when crop_height is smaller than img_height\n        transform = dict(\n            type='CenterCrop', crop_size=(img_width, img_height // 2))\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        assert results['img_shape'] == (img_height // 2, img_width)\n        assert (results['img'] == self.original_img[75:225, ...]).all()\n        assert 
(results['gt_seg_map'] == self.gt_semantic_map[75:225,\n                                                              ...]).all()\n        assert np.equal(results['gt_bboxes'],\n                        np.array([[0, 0, 210, 85], [200, 75, 400,\n                                                    150]])).all()\n        assert np.equal(\n            results['gt_keypoints'],\n            np.array([[[20, 0, 0]], [[200, 75, 1]], [[300, 150, 0]]])).all()\n\n    @pytest.mark.skipif(\n        condition=torch is None, reason='No torch in current env')\n    def test_torchvision_compare(self):\n        # compare results with torchvision\n        results = {}\n        transform = dict(type='CenterCrop', crop_size=224)\n        center_crop_module = TRANSFORMS.build(transform)\n        results = self.reset_results(results, self.original_img,\n                                     self.gt_semantic_map)\n        results = center_crop_module(results)\n        center_crop_module = torchvision.transforms.CenterCrop(size=224)\n        pil_img = Image.fromarray(self.original_img)\n        pil_seg = Image.fromarray(self.gt_semantic_map)\n        cropped_img = center_crop_module(pil_img)\n        cropped_img = np.array(cropped_img)\n        cropped_seg = center_crop_module(pil_seg)\n        cropped_seg = np.array(cropped_seg)\n        assert np.equal(results['img'], cropped_img).all()\n        assert np.equal(results['gt_seg_map'], cropped_seg).all()\n\n\nclass TestRandomGrayscale:\n\n    @classmethod\n    def setup_class(cls):\n        cls.img = (np.random.rand(10, 10, 3) * 255).astype(np.uint8)\n\n    def test_repr(self):\n        # test repr\n        transform = dict(\n            type='RandomGrayscale',\n            prob=1.,\n            channel_weights=(0.299, 0.587, 0.114),\n            keep_channels=True)\n        random_gray_scale_module = TRANSFORMS.build(transform)\n        assert isinstance(repr(random_gray_scale_module), str)\n\n    def test_error(self):\n        # test 
invalid argument\n        transform = dict(type='RandomGrayscale', prob=2)\n        with pytest.raises(AssertionError):\n            TRANSFORMS.build(transform)\n\n    def test_transform(self):\n        results = dict()\n        # test rgb2gray, return the grayscale image with prob = 1.\n        transform = dict(\n            type='RandomGrayscale',\n            prob=1.,\n            channel_weights=(0.299, 0.587, 0.114),\n            keep_channels=True)\n\n        random_gray_scale_module = TRANSFORMS.build(transform)\n        results['img'] = copy.deepcopy(self.img)\n        img = random_gray_scale_module(results)['img']\n        computed_gray = (self.img[:, :, 0] * 0.299 +\n                         self.img[:, :, 1] * 0.587 +\n                         self.img[:, :, 2] * 0.114).astype(np.uint8)\n        for i in range(img.shape[2]):\n            assert_array_almost_equal(img[:, :, i], computed_gray, decimal=4)\n        assert img.shape == (10, 10, 3)\n\n        # test rgb2gray, return the original image with p=0.\n        transform = dict(type='RandomGrayscale', prob=0.)\n        random_gray_scale_module = TRANSFORMS.build(transform)\n        results['img'] = copy.deepcopy(self.img)\n        img = random_gray_scale_module(results)['img']\n        assert_array_equal(img, self.img)\n        assert img.shape == (10, 10, 3)\n\n        # test image with one channel\n        transform = dict(type='RandomGrayscale', prob=1.)\n        results['img'] = self.img[:, :, 0:1]\n        random_gray_scale_module = TRANSFORMS.build(transform)\n        img = random_gray_scale_module(results)['img']\n        assert_array_equal(img, self.img[:, :, 0:1])\n        assert img.shape == (10, 10, 1)\n\n\n@TRANSFORMS.register_module()\nclass MockPackTaskInputs(BaseTransform):\n\n    def __init__(self) -> None:\n        super().__init__()\n\n    def transform(self, results):\n        packed_results = dict(inputs=results['img'], data_sample=Mock())\n        return packed_results\n\n\nclass 
TestMultiScaleFlipAug:\n\n    @classmethod\n    def setup_class(cls):\n        cls.img = mmcv.imread(\n            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')\n        cls.original_img = copy.deepcopy(cls.img)\n\n    def test_error(self):\n        # test assertion if scales is not tuple or list of tuple\n        with pytest.raises(AssertionError):\n            transform = dict(\n                type='MultiScaleFlipAug', scales=[1333, 800], transforms=[])\n            TRANSFORMS.build(transform)\n\n        # test assertion if flip_direction is not str or list of str\n        with pytest.raises(AssertionError):\n            transform = dict(\n                type='MultiScaleFlipAug',\n                scales=[(1333, 800)],\n                flip_direction=1,\n                transforms=[])\n            TRANSFORMS.build(transform)\n\n    @pytest.mark.skipif(\n        condition=torch is None, reason='No torch in current env')\n    def test_multi_scale_flip_aug(self):\n        # test with empty transforms\n        transform = dict(\n            type='MultiScaleFlipAug',\n            transforms=[dict(type='MockPackTaskInputs')],\n            scales=[(1333, 800), (800, 600), (640, 480)],\n            allow_flip=True,\n            flip_direction=['horizontal', 'vertical', 'diagonal'])\n        multi_scale_flip_aug_module = TRANSFORMS.build(transform)\n        results = dict()\n        results['img'] = copy.deepcopy(self.original_img)\n        packed_results = multi_scale_flip_aug_module(results)\n        assert len(packed_results['inputs']) == 12\n\n        # test with allow_flip=False\n        transform = dict(\n            type='MultiScaleFlipAug',\n            transforms=[dict(type='MockPackTaskInputs')],\n            scales=[(1333, 800), (800, 600), (640, 480)],\n            allow_flip=False,\n            flip_direction=['horizontal', 'vertical', 'diagonal'])\n        multi_scale_flip_aug_module = TRANSFORMS.build(transform)\n        results = dict()\n 
       results['img'] = copy.deepcopy(self.original_img)\n        packed_results = multi_scale_flip_aug_module(results)\n        assert len(packed_results['inputs']) == 3\n\n        # test with transforms\n        img_norm_cfg = dict(\n            mean=[123.675, 116.28, 103.53],\n            std=[58.395, 57.12, 57.375],\n            to_rgb=True)\n        transforms_cfg = [\n            dict(type='Normalize', **img_norm_cfg),\n            dict(type='Pad', size_divisor=32),\n            dict(type='ImageToTensor', keys=['img']),\n            dict(type='MockPackTaskInputs')\n        ]\n        transform = dict(\n            type='MultiScaleFlipAug',\n            transforms=transforms_cfg,\n            scales=[(1333, 800), (800, 600), (640, 480)],\n            allow_flip=True,\n            flip_direction=['horizontal', 'vertical', 'diagonal'])\n        multi_scale_flip_aug_module = TRANSFORMS.build(transform)\n        results = dict()\n        results['img'] = copy.deepcopy(self.original_img)\n        packed_results = multi_scale_flip_aug_module(results)\n        assert len(packed_results['inputs']) == 12\n\n        # test with scale_factor\n        img_norm_cfg = dict(\n            mean=[123.675, 116.28, 103.53],\n            std=[58.395, 57.12, 57.375],\n            to_rgb=True)\n        transforms_cfg = [\n            dict(type='Normalize', **img_norm_cfg),\n            dict(type='Pad', size_divisor=32),\n            dict(type='ImageToTensor', keys=['img']),\n            dict(type='MockPackTaskInputs')\n        ]\n        transform = dict(\n            type='MultiScaleFlipAug',\n            transforms=transforms_cfg,\n            scale_factor=[0.5, 1., 2.],\n            allow_flip=True,\n            flip_direction=['horizontal', 'vertical', 'diagonal'])\n        multi_scale_flip_aug_module = TRANSFORMS.build(transform)\n        results = dict()\n        results['img'] = copy.deepcopy(self.original_img)\n        packed_results = multi_scale_flip_aug_module(results)\n  
      assert len(packed_results['inputs']) == 12\n\n        # test no resize\n        img_norm_cfg = dict(\n            mean=[123.675, 116.28, 103.53],\n            std=[58.395, 57.12, 57.375],\n            to_rgb=True)\n        transforms_cfg = [\n            dict(type='Normalize', **img_norm_cfg),\n            dict(type='Pad', size_divisor=32),\n            dict(type='ImageToTensor', keys=['img']),\n            dict(type='MockPackTaskInputs')\n        ]\n        transform = dict(\n            type='MultiScaleFlipAug',\n            transforms=transforms_cfg,\n            allow_flip=True,\n            flip_direction=['horizontal', 'vertical', 'diagonal'])\n        multi_scale_flip_aug_module = TRANSFORMS.build(transform)\n        results = dict()\n        results['img'] = copy.deepcopy(self.original_img)\n        packed_results = multi_scale_flip_aug_module(results)\n        assert len(packed_results['inputs']) == 4\n\n\nclass TestRandomChoiceResize:\n\n    @classmethod\n    def setup_class(cls):\n        cls.img = mmcv.imread(\n            osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color')\n        cls.original_img = copy.deepcopy(cls.img)\n\n    def reset_results(self, results):\n        results['img'] = copy.deepcopy(self.original_img)\n        results['gt_seg_map'] = copy.deepcopy(self.original_img)\n\n    def test_repr(self):\n        # test repr\n        transform = dict(\n            type='RandomChoiceResize', scales=[(1333, 800), (1333, 600)])\n        random_multiscale_resize = TRANSFORMS.build(transform)\n        assert isinstance(repr(random_multiscale_resize), str)\n\n    def test_error(self):\n        # test assertion if size is smaller than 0\n        with pytest.raises(AssertionError):\n            transform = dict(type='RandomChoiceResize', scales=[0.5, 1, 2])\n            TRANSFORMS.build(transform)\n\n    def test_random_multiscale_resize(self):\n        results = dict()\n        # test with one scale\n        transform = 
dict(type='RandomChoiceResize', scales=[(1333, 800)])\n        random_multiscale_resize = TRANSFORMS.build(transform)\n        self.reset_results(results)\n        results = random_multiscale_resize(results)\n        assert results['img'].shape == (800, 1333, 3)\n\n        # test with multi scales\n        _scale_choice = [(1333, 800), (1333, 600)]\n        transform = dict(type='RandomChoiceResize', scales=_scale_choice)\n        random_multiscale_resize = TRANSFORMS.build(transform)\n        self.reset_results(results)\n        results = random_multiscale_resize(results)\n        assert (results['img'].shape[1],\n                results['img'].shape[0]) in _scale_choice\n\n        # test keep_ratio\n        transform = dict(\n            type='RandomChoiceResize',\n            scales=[(900, 600)],\n            resize_type='Resize',\n            keep_ratio=True)\n        random_multiscale_resize = TRANSFORMS.build(transform)\n        self.reset_results(results)\n        _input_ratio = results['img'].shape[0] / results['img'].shape[1]\n        results = random_multiscale_resize(results)\n        _output_ratio = results['img'].shape[0] / results['img'].shape[1]\n        assert_array_almost_equal(_input_ratio, _output_ratio)\n\n        # test clip_object_border\n        gt_bboxes = [[200, 150, 600, 450]]\n        transform = dict(\n            type='RandomChoiceResize',\n            scales=[(200, 150)],\n            resize_type='Resize',\n            clip_object_border=True)\n        random_multiscale_resize = TRANSFORMS.build(transform)\n        self.reset_results(results)\n        results['gt_bboxes'] = np.array(gt_bboxes)\n        results = random_multiscale_resize(results)\n        assert results['img'].shape == (150, 200, 3)\n        assert np.equal(results['gt_bboxes'], np.array([[100, 75, 200,\n                                                         150]])).all()\n\n        transform = dict(\n            type='RandomChoiceResize',\n            scales=[(200, 
150)],\n            resize_type='Resize',\n            clip_object_border=False)\n        random_multiscale_resize = TRANSFORMS.build(transform)\n        self.reset_results(results)\n        results['gt_bboxes'] = np.array(gt_bboxes)\n        results = random_multiscale_resize(results)\n        assert results['img'].shape == (150, 200, 3)\n        assert np.equal(results['gt_bboxes'], np.array([[100, 75, 300,\n                                                         225]])).all()\n\n\nclass TestRandomFlip:\n\n    def test_init(self):\n\n        # prob is float\n        TRANSFORMS = RandomFlip(0.1)\n        assert TRANSFORMS.prob == 0.1\n\n        # prob is None\n        with pytest.raises(ValueError):\n            TRANSFORMS = RandomFlip(None)\n            assert TRANSFORMS.prob is None\n\n        # prob is a list\n        TRANSFORMS = RandomFlip([0.1, 0.2], ['horizontal', 'vertical'])\n        assert len(TRANSFORMS.prob) == 2\n        assert len(TRANSFORMS.direction) == 2\n\n        # direction is an invalid type\n        with pytest.raises(ValueError):\n            TRANSFORMS = RandomFlip(0.1, 1)\n\n        # prob is an invalid type\n        with pytest.raises(ValueError):\n            TRANSFORMS = RandomFlip('0.1')\n\n    def test_transform(self):\n\n        results = {\n            'img': np.random.random((224, 224, 3)),\n            'gt_bboxes': np.array([[0, 1, 100, 101]]),\n            'gt_keypoints': np.array([[[100, 100, 1.0]]]),\n            # seg map flip is irrelative with image, so there is no requirement\n            # that gt_set_map of test data matches image.\n            'gt_seg_map': np.array([[0, 1], [2, 3]])\n        }\n\n        # horizontal flip\n        TRANSFORMS = RandomFlip([1.0], ['horizontal'])\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert (results_update['gt_bboxes'] == np.array([[124, 1, 224,\n                                                          101]])).all()\n        assert 
(results_update['gt_seg_map'] == np.array([[1, 0], [3,\n                                                                   2]])).all()\n\n        # diagonal flip\n        TRANSFORMS = RandomFlip([1.0], ['diagonal'])\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert (results_update['gt_bboxes'] == np.array([[124, 123, 224,\n                                                          223]])).all()\n        assert (results_update['gt_seg_map'] == np.array([[3, 2], [1,\n                                                                   0]])).all()\n\n        # vertical flip\n        TRANSFORMS = RandomFlip([1.0], ['vertical'])\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert (results_update['gt_bboxes'] == np.array([[0, 123, 100,\n                                                          223]])).all()\n        assert (results_update['gt_seg_map'] == np.array([[2, 3], [0,\n                                                                   1]])).all()\n\n        # horizontal flip when direction is None\n        TRANSFORMS = RandomFlip(1.0)\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert (results_update['gt_bboxes'] == np.array([[124, 1, 224,\n                                                          101]])).all()\n        assert (results_update['gt_seg_map'] == np.array([[1, 0], [3,\n                                                                   2]])).all()\n\n        # horizontal flip and swap label pair\n        TRANSFORMS = RandomFlip([1.0], ['horizontal'],\n                                swap_seg_labels=[[0, 1]])\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert (results_update['gt_seg_map'] == np.array([[0, 1], [3,\n                                                                   2]])).all()\n        assert results_update['swap_seg_labels'] == [[0, 1]]\n\n        TRANSFORMS = RandomFlip(0.0)\n        
results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert (results_update['gt_bboxes'] == np.array([[0, 1, 100,\n                                                          101]])).all()\n        assert (results_update['gt_seg_map'] == np.array([[0, 1], [2,\n                                                                   3]])).all()\n\n        # flip direction is invalid in bbox flip\n        with pytest.raises(ValueError):\n            TRANSFORMS = RandomFlip(1.0)\n            results_update = TRANSFORMS._flip_bbox(results['gt_bboxes'],\n                                                   (224, 224), 'invalid')\n\n        # flip direction is invalid in keypoints flip\n        with pytest.raises(ValueError):\n            TRANSFORMS = RandomFlip(1.0)\n            results_update = TRANSFORMS._flip_keypoints(\n                results['gt_keypoints'], (224, 224), 'invalid')\n\n        # swap pair is invalid\n        with pytest.raises(AssertionError):\n            TRANSFORMS = RandomFlip(1.0, swap_seg_labels='invalid')\n            results_update = TRANSFORMS._flip_seg_map(results['gt_seg_map'],\n                                                      'horizontal')\n\n    def test_repr(self):\n        TRANSFORMS = RandomFlip(0.1)\n        TRANSFORMS_str = str(TRANSFORMS)\n        assert isinstance(TRANSFORMS_str, str)\n\n\nclass TestRandomResize:\n\n    def test_init(self):\n        TRANSFORMS = RandomResize(\n            (224, 224),\n            (1.0, 2.0),\n        )\n        assert TRANSFORMS.scale == (224, 224)\n\n    def test_repr(self):\n        TRANSFORMS = RandomResize(\n            (224, 224),\n            (1.0, 2.0),\n        )\n        TRANSFORMS_str = str(TRANSFORMS)\n        assert isinstance(TRANSFORMS_str, str)\n\n    def test_transform(self):\n\n        # choose target scale from init when override is True\n        results = {}\n        TRANSFORMS = RandomResize((224, 224), (1.0, 2.0))\n        results_update = 
TRANSFORMS.transform(copy.deepcopy(results))\n        assert results_update['scale'][0] >= 224 and results_update['scale'][\n            0] <= 448\n        assert results_update['scale'][1] >= 224 and results_update['scale'][\n            1] <= 448\n\n        # keep ratio is True\n        results = {\n            'img': np.random.random((224, 224, 3)),\n            'gt_seg_map': np.random.random((224, 224, 3)),\n            'gt_bboxes': np.array([[0, 0, 112, 112]]),\n            'gt_keypoints': np.array([[[112, 112]]])\n        }\n\n        TRANSFORMS = RandomResize((224, 224), (1.0, 2.0),\n                                  resize_type='Resize',\n                                  keep_ratio=True)\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert 224 <= results_update['img_shape'][0]\n        assert 448 >= results_update['img_shape'][0]\n        assert 224 <= results_update['img_shape'][1]\n        assert 448 >= results_update['img_shape'][1]\n        assert results_update['keep_ratio']\n        assert results['gt_bboxes'][0][2] >= 112\n        assert results['gt_bboxes'][0][2] <= 112\n\n        # keep ratio is False\n        TRANSFORMS = RandomResize((224, 224), (1.0, 2.0),\n                                  resize_type='Resize',\n                                  keep_ratio=False)\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n\n        # choose target scale from init when override is False and scale is a\n        # list of tuples\n        results = {}\n        TRANSFORMS = RandomResize([(224, 448), (112, 224)],\n                                  resize_type='Resize',\n                                  keep_ratio=True)\n        results_update = TRANSFORMS.transform(copy.deepcopy(results))\n        assert results_update['scale'][1] >= 224 and results_update['scale'][\n            1] <= 448\n        assert results_update['scale'][0] >= 112 and results_update['scale'][\n            0] <= 224\n\n        # 
the type of scale is invalid in init\n        with pytest.raises(NotImplementedError):\n            results = {}\n            TRANSFORMS = RandomResize([(224, 448), [112, 224]],\n                                      resize_type='Resize',\n                                      keep_ratio=True)\n            results_update = TRANSFORMS.transform(copy.deepcopy(results))\n\n\nclass TestTestTimeAug:\n\n    def test_init(self):\n        subroutines = [[\n            dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n            dict(type='Resize', scale=(1333, 400), keep_ratio=True)\n        ], [\n            dict(type='RandomFlip', prob=1.),\n            dict(type='RandomFlip', prob=0.)\n        ], [dict(type='Normalize', mean=(0, 0, 0), std=(1, 1, 1))]]\n\n        tta_transform = TestTimeAug(subroutines)\n        subroutines = tta_transform.subroutines\n        assert len(subroutines) == 4\n\n        assert isinstance(subroutines[0].transforms[0], Resize)\n        assert isinstance(subroutines[0].transforms[1], RandomFlip)\n        assert isinstance(subroutines[0].transforms[2], Normalize)\n        assert isinstance(subroutines[1].transforms[0], Resize)\n        assert isinstance(subroutines[1].transforms[1], RandomFlip)\n        assert isinstance(subroutines[1].transforms[2], Normalize)\n\n    def test_transform(self):\n        results = {\n            'img': np.random.random((224, 224, 3)),\n            'gt_bboxes': np.array([[0, 1, 100, 101]]),\n            'gt_keypoints': np.array([[[100, 100, 1.0]]]),\n            'gt_seg_map': np.random.random((224, 224, 3))\n        }\n        input_results = copy.deepcopy(results)\n        transforms = [[\n            dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n            dict(type='Resize', scale=(1333, 400), keep_ratio=True)\n        ], [\n            dict(type='RandomFlip', prob=0.),\n            dict(type='RandomFlip', prob=1.)\n        ], [dict(type='Normalize', mean=(0, 0, 0), std=(1, 1, 1))]]\n\n    
    tta_transform = TestTimeAug(transforms)\n        results = tta_transform.transform(results)\n        assert len(results['img']) == 4\n\n        resize1 = tta_transform.subroutines[0].transforms[0]\n        resize2 = tta_transform.subroutines[2].transforms[0]\n        flip1 = tta_transform.subroutines[0].transforms[1]\n        flip2 = tta_transform.subroutines[1].transforms[1]\n        normalize = tta_transform.subroutines[0].transforms[2]\n        target_results = [\n            normalize.transform(\n                flip1.transform(\n                    resize1.transform(copy.deepcopy(input_results)))),\n            normalize.transform(\n                flip2.transform(\n                    resize1.transform(copy.deepcopy(input_results)))),\n            normalize.transform(\n                flip1.transform(\n                    resize2.transform(copy.deepcopy(input_results)))),\n            normalize.transform(\n                flip2.transform(\n                    resize2.transform(copy.deepcopy(input_results)))),\n        ]\n\n        assert np.allclose(target_results[0]['img'], results['img'][0])\n        assert np.allclose(target_results[1]['img'], results['img'][1])\n        assert np.allclose(target_results[2]['img'], results['img'][2])\n        assert np.allclose(target_results[3]['img'], results['img'][3])\n\n    def test_repr(self):\n        transforms = [[\n            dict(type='Resize', scale=(1333, 800), keep_ratio=True),\n            dict(type='Resize', scale=(1333, 400), keep_ratio=True)\n        ], [\n            dict(type='RandomFlip', prob=0.),\n            dict(type='RandomFlip', prob=1.)\n        ], [dict(type='Normalize', mean=(0, 0, 0), std=(1, 1, 1))]]\n\n        tta_transform = TestTimeAug(transforms)\n        repr_str = repr(tta_transform)\n        repr_str_list = repr_str.split('\\n')\n        assert repr_str_list[0] == 'TestTimeAugtransforms='\n        assert repr_str_list[1] == 'Compose('\n        assert repr_str_list[2].startswith(' 
   Resize(scale=(1333, 800)')\n        assert repr_str_list[3].startswith('    RandomFlip(prob=0.0')\n        assert repr_str_list[4].startswith('    Normalize(mean=[0. 0. 0.]')\n"
  },
  {
    "path": "tests/test_transforms/test_transforms_wrapper.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport warnings\n\nimport numpy as np\nimport pytest\n\nfrom mmcv.transforms.base import BaseTransform\nfrom mmcv.transforms.builder import TRANSFORMS\nfrom mmcv.transforms.utils import (avoid_cache_randomness, cache_random_params,\n                                   cache_randomness)\nfrom mmcv.transforms.wrappers import (Compose, KeyMapper, RandomApply,\n                                      RandomChoice, TransformBroadcaster)\n\n\n@TRANSFORMS.register_module()\nclass AddToValue(BaseTransform):\n    \"\"\"Dummy transform to add a given addend to results['value']\"\"\"\n\n    def __init__(self, addend=0) -> None:\n        super().__init__()\n        self.addend = addend\n\n    def add(self, results, addend):\n        augend = results['value']\n\n        if isinstance(augend, list):\n            warnings.warn('value is a list', UserWarning)\n        if isinstance(augend, dict):\n            warnings.warn('value is a dict', UserWarning)\n\n        def _add_to_value(augend, addend):\n            if isinstance(augend, list):\n                return [_add_to_value(v, addend) for v in augend]\n            if isinstance(augend, dict):\n                return {k: _add_to_value(v, addend) for k, v in augend.items()}\n            return augend + addend\n\n        results['value'] = _add_to_value(results['value'], addend)\n        return results\n\n    def transform(self, results):\n        return self.add(results, self.addend)\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'addend = {self.addend}'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass RandomAddToValue(AddToValue):\n    \"\"\"Dummy transform to add a random addend to results['value']\"\"\"\n\n    def __init__(self, repeat=1) -> None:\n        super().__init__(addend=None)\n        self.repeat = repeat\n\n    @cache_randomness\n    def get_random_addend(self):\n        return 
np.random.rand()\n\n    def transform(self, results):\n        for _ in range(self.repeat):\n            results = self.add(results, addend=self.get_random_addend())\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        repr_str += f'repeat = {self.repeat}'\n        return repr_str\n\n\n@TRANSFORMS.register_module()\nclass SumTwoValues(BaseTransform):\n    \"\"\"Dummy transform to test transform wrappers.\"\"\"\n\n    def transform(self, results):\n        if 'num_1' in results and 'num_2' in results:\n            results['sum'] = results['num_1'] + results['num_2']\n        elif 'num_1' in results:\n            results['sum'] = results['num_1']\n        elif 'num_2' in results:\n            results['sum'] = results['num_2']\n        else:\n            results['sum'] = np.nan\n        return results\n\n    def __repr__(self) -> str:\n        repr_str = self.__class__.__name__\n        return repr_str\n\n\ndef test_compose():\n\n    # Case 1: build from cfg\n    pipeline = [dict(type='AddToValue')]\n    pipeline = Compose(pipeline)\n    _ = str(pipeline)\n\n    # Case 2: build from transform list\n    pipeline = [AddToValue()]\n    pipeline = Compose(pipeline)\n\n    # Case 3: invalid build arguments\n    pipeline = [[dict(type='AddToValue')]]\n    with pytest.raises(TypeError):\n        pipeline = Compose(pipeline)\n\n    # Case 4: contain transform with None output\n    class DummyTransform(BaseTransform):\n\n        def transform(self, results):\n            return None\n\n    pipeline = Compose([DummyTransform()])\n    results = pipeline({})\n    assert results is None\n\n\ndef test_cache_random_parameters():\n\n    transform = RandomAddToValue()\n\n    # Case 1: cache random parameters\n    assert hasattr(RandomAddToValue, '_methods_with_randomness')\n    assert 'get_random_addend' in RandomAddToValue._methods_with_randomness\n\n    with cache_random_params(transform):\n        results_1 = 
transform(dict(value=0))\n        results_2 = transform(dict(value=0))\n        np.testing.assert_equal(results_1['value'], results_2['value'])\n\n    # Case 2: do not cache random parameters\n    results_1 = transform(dict(value=0))\n    results_2 = transform(dict(value=0))\n    with pytest.raises(AssertionError):\n        np.testing.assert_equal(results_1['value'], results_2['value'])\n\n    # Case 3: allow to invoke random method 0 times\n    transform = RandomAddToValue(repeat=0)\n    with cache_random_params(transform):\n        _ = transform(dict(value=0))\n\n    # Case 4: NOT allow to invoke random method >1 times\n    transform = RandomAddToValue(repeat=2)\n    with pytest.raises(RuntimeError):\n        with cache_random_params(transform):\n            _ = transform(dict(value=0))\n\n    # Case 5: apply on nested transforms\n    transform = Compose([RandomAddToValue()])\n    with cache_random_params(transform):\n        results_1 = transform(dict(value=0))\n        results_2 = transform(dict(value=0))\n        np.testing.assert_equal(results_1['value'], results_2['value'])\n\n\ndef test_key_mapper():\n    # Case 0: only remap\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=1)], remapping={'value': 'v_out'})\n\n    results = dict(value=0)\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)  # should be unchanged\n    np.testing.assert_equal(results['v_out'], 1)\n\n    # Case 1: simple remap\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=1)],\n        mapping={'value': 'v_in'},\n        remapping={'value': 'v_out'})\n\n    results = dict(value=0, v_in=1)\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)  # should be unchanged\n    np.testing.assert_equal(results['v_in'], 1)\n    np.testing.assert_equal(results['v_out'], 2)\n\n    # Case 2: collecting list\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=2)],\n        mapping={'value': 
['v_in_1', 'v_in_2']},\n        remapping={'value': ['v_out_1', 'v_out_2']})\n    results = dict(value=0, v_in_1=1, v_in_2=2)\n\n    with pytest.warns(UserWarning, match='value is a list'):\n        results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)  # should be unchanged\n    np.testing.assert_equal(results['v_in_1'], 1)\n    np.testing.assert_equal(results['v_in_2'], 2)\n    np.testing.assert_equal(results['v_out_1'], 3)\n    np.testing.assert_equal(results['v_out_2'], 4)\n\n    # Case 3: collecting dict\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=2)],\n        mapping={'value': {\n            'v1': 'v_in_1',\n            'v2': 'v_in_2'\n        }},\n        remapping={'value': {\n            'v1': 'v_out_1',\n            'v2': 'v_out_2'\n        }})\n    results = dict(value=0, v_in_1=1, v_in_2=2)\n\n    with pytest.warns(UserWarning, match='value is a dict'):\n        results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)  # should be unchanged\n    np.testing.assert_equal(results['v_in_1'], 1)\n    np.testing.assert_equal(results['v_in_2'], 2)\n    np.testing.assert_equal(results['v_out_1'], 3)\n    np.testing.assert_equal(results['v_out_2'], 4)\n\n    # Case 4: collecting list with auto_remap mode\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=2)],\n        mapping=dict(value=['v_in_1', 'v_in_2']),\n        auto_remap=True)\n    results = dict(value=0, v_in_1=1, v_in_2=2)\n\n    with pytest.warns(UserWarning, match='value is a list'):\n        results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)\n    np.testing.assert_equal(results['v_in_1'], 3)\n    np.testing.assert_equal(results['v_in_2'], 4)\n\n    # Case 5: collecting dict with auto_remap mode\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=2)],\n        mapping=dict(value=dict(v1='v_in_1', v2='v_in_2')),\n        auto_remap=True)\n    results = dict(value=0, 
v_in_1=1, v_in_2=2)\n\n    with pytest.warns(UserWarning, match='value is a dict'):\n        results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)\n    np.testing.assert_equal(results['v_in_1'], 3)\n    np.testing.assert_equal(results['v_in_2'], 4)\n\n    # Case 6: nested collection with auto_remap mode\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=2)],\n        mapping=dict(value=['v1', dict(v2=['v21', 'v22'], v3='v3')]),\n        auto_remap=True)\n    results = dict(value=0, v1=1, v21=2, v22=3, v3=4)\n\n    with pytest.warns(UserWarning, match='value is a list'):\n        results = pipeline(results)\n\n    np.testing.assert_equal(results['value'], 0)\n    np.testing.assert_equal(results['v1'], 3)\n    np.testing.assert_equal(results['v21'], 4)\n    np.testing.assert_equal(results['v22'], 5)\n    np.testing.assert_equal(results['v3'], 6)\n\n    # Case 7: remapping must be None if `auto_remap` is set True\n    with pytest.raises(ValueError):\n        pipeline = KeyMapper(\n            transforms=[AddToValue(addend=1)],\n            mapping=dict(value='v_in'),\n            remapping=dict(value='v_out'),\n            auto_remap=True)\n\n    # Case 8: allow_nonexist_keys\n    pipeline = KeyMapper(\n        transforms=[SumTwoValues()],\n        mapping=dict(num_1='a', num_2='b'),\n        auto_remap=False,\n        allow_nonexist_keys=True)\n\n    results = pipeline(dict(a=1, b=2))\n    np.testing.assert_equal(results['sum'], 3)\n\n    results = pipeline(dict(a=1))\n    np.testing.assert_equal(results['sum'], 1)\n\n    # Case 9: use wrapper as a transform\n    transform = KeyMapper(mapping=dict(b='a'), auto_remap=False)\n    results = transform(dict(a=1))\n    # note that the original key 'a' will not be removed\n    assert results == dict(a=1, b=1)\n\n    # Case 10: manually set keys ignored\n    pipeline = KeyMapper(\n        transforms=[SumTwoValues()],\n        mapping=dict(num_1='a', num_2=...),  # num_2 (b) will be 
ignored\n        auto_remap=False,\n        # allow_nonexist_keys will not affect manually ignored keys\n        allow_nonexist_keys=False)\n\n    results = pipeline(dict(a=1, b=2))\n    np.testing.assert_equal(results['sum'], 1)\n\n    # Test basic functions\n    pipeline = KeyMapper(\n        transforms=[AddToValue(addend=1)],\n        mapping=dict(value='v_in'),\n        remapping=dict(value='v_out'))\n\n    # __iter__\n    for _ in pipeline:\n        pass\n\n    # __repr__\n    assert repr(pipeline) == (\n        'KeyMapper(transforms = Compose(\\n    ' + 'AddToValueaddend = 1' +\n        '\\n), mapping = {\\'value\\': \\'v_in\\'}, ' +\n        'remapping = {\\'value\\': \\'v_out\\'}, auto_remap = False, ' +\n        'allow_nonexist_keys = False)')\n\n\ndef test_transform_broadcaster():\n\n    # Case 1: apply to list in results\n    pipeline = TransformBroadcaster(\n        transforms=[AddToValue(addend=1)],\n        mapping=dict(value='values'),\n        auto_remap=True)\n    results = dict(values=[1, 2])\n\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['values'], [2, 3])\n\n    # Case 2: apply to multiple keys\n    pipeline = TransformBroadcaster(\n        transforms=[AddToValue(addend=1)],\n        mapping=dict(value=['v_1', 'v_2']),\n        auto_remap=True)\n    results = dict(v_1=1, v_2=2)\n\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['v_1'], 2)\n    np.testing.assert_equal(results['v_2'], 3)\n\n    # Case 3: apply to multiple groups of keys\n    pipeline = TransformBroadcaster(\n        transforms=[SumTwoValues()],\n        mapping=dict(num_1=['a_1', 'b_1'], num_2=['a_2', 'b_2']),\n        remapping=dict(sum=['a', 'b']),\n        auto_remap=False)\n\n    results = dict(a_1=1, a_2=2, b_1=3, b_2=4)\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['a'], 3)\n    np.testing.assert_equal(results['b'], 7)\n\n    # Case 3: apply to all keys\n    pipeline = TransformBroadcaster(\n        
transforms=[SumTwoValues()], mapping=None, remapping=None)\n    results = dict(num_1=[1, 2, 3], num_2=[4, 5, 6])\n\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['sum'], [5, 7, 9])\n\n    # Case 4: inconsistent sequence length\n    with pytest.raises(ValueError):\n        pipeline = TransformBroadcaster(\n            transforms=[SumTwoValues()],\n            mapping=dict(num_1='list_1', num_2='list_2'),\n            auto_remap=False)\n\n        results = dict(list_1=[1, 2], list_2=[1, 2, 3])\n        _ = pipeline(results)\n\n    # Case 5: share random parameter\n    pipeline = TransformBroadcaster(\n        transforms=[RandomAddToValue()],\n        mapping=dict(value='values'),\n        auto_remap=True,\n        share_random_params=True)\n\n    results = dict(values=[0, 0])\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['values'][0], results['values'][1])\n\n    # Case 6: partial broadcasting\n    pipeline = TransformBroadcaster(\n        transforms=[SumTwoValues()],\n        mapping=dict(num_1=['a_1', 'b_1'], num_2=['a_2', ...]),\n        remapping=dict(sum=['a', 'b']),\n        auto_remap=False)\n\n    results = dict(a_1=1, a_2=2, b_1=3, b_2=4)\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['a'], 3)\n    np.testing.assert_equal(results['b'], 3)\n\n    pipeline = TransformBroadcaster(\n        transforms=[SumTwoValues()],\n        mapping=dict(num_1=['a_1', 'b_1'], num_2=['a_2', 'b_2']),\n        remapping=dict(sum=['a', ...]),\n        auto_remap=False)\n\n    results = dict(a_1=1, a_2=2, b_1=3, b_2=4)\n    results = pipeline(results)\n\n    np.testing.assert_equal(results['a'], 3)\n    assert 'b' not in results\n\n    # Test repr\n    assert repr(pipeline) == (\n        'TransformBroadcaster(transforms = Compose(\\n' + '    SumTwoValues' +\n        '\\n), mapping = {\\'num_1\\': [\\'a_1\\', \\'b_1\\'], ' +\n        '\\'num_2\\': [\\'a_2\\', \\'b_2\\']}, ' +\n        'remapping = 
{\\'sum\\': [\\'a\\', Ellipsis]}, auto_remap = False, ' +\n        'allow_nonexist_keys = False, share_random_params = False)')\n\n\ndef test_random_choice():\n\n    # Case 1: given probability\n    pipeline = RandomChoice(\n        transforms=[[AddToValue(addend=1.0)], [AddToValue(addend=2.0)]],\n        prob=[1.0, 0.0])\n\n    results = pipeline(dict(value=1))\n    np.testing.assert_equal(results['value'], 2.0)\n\n    # Case 2: default probability\n    pipeline = RandomChoice(transforms=[[AddToValue(\n        addend=1.0)], [AddToValue(addend=2.0)]])\n\n    _ = pipeline(dict(value=1))\n\n    # Case 3: nested RandomChoice in TransformBroadcaster\n    pipeline = TransformBroadcaster(\n        transforms=[\n            RandomChoice(\n                transforms=[[AddToValue(addend=1.0)],\n                            [AddToValue(addend=2.0)]], ),\n        ],\n        mapping={'value': 'values'},\n        auto_remap=True,\n        share_random_params=True)\n\n    results = dict(values=[0 for _ in range(10)])\n    results = pipeline(results)\n    # check share_random_params=True works so that all values are same\n    values = results['values']\n    assert all(map(lambda x: x == values[0], values))\n\n    # repr\n    assert repr(pipeline) == (\n        'TransformBroadcaster(transforms = Compose(\\n' +\n        '    RandomChoice(transforms = [Compose(\\n' +\n        '    AddToValueaddend = 1.0' + '\\n), Compose(\\n' +\n        '    AddToValueaddend = 2.0' + '\\n)]prob = None)' +\n        '\\n), mapping = {\\'value\\': \\'values\\'}, ' +\n        'remapping = {\\'value\\': \\'values\\'}, auto_remap = True, ' +\n        'allow_nonexist_keys = False, share_random_params = True)')\n\n\ndef test_random_apply():\n\n    # Case 1: simple use\n    pipeline = RandomApply(transforms=[AddToValue(addend=1.0)], prob=1.0)\n    results = pipeline(dict(value=1))\n    np.testing.assert_equal(results['value'], 2.0)\n\n    pipeline = RandomApply(transforms=[AddToValue(addend=1.0)], 
prob=0.0)\n    results = pipeline(dict(value=1))\n    np.testing.assert_equal(results['value'], 1.0)\n\n    # Case 2: nested RandomApply in TransformBroadcaster\n    pipeline = TransformBroadcaster(\n        transforms=[RandomApply(transforms=[AddToValue(addend=1)], prob=0.5)],\n        mapping={'value': 'values'},\n        auto_remap=True,\n        share_random_params=True)\n\n    results = dict(values=[0 for _ in range(10)])\n    results = pipeline(results)\n    # check share_random_params=True works so that all values are same\n    values = results['values']\n    assert all(map(lambda x: x == values[0], values))\n\n    # __iter__\n    for _ in pipeline:\n        pass\n\n    # repr\n    assert repr(pipeline) == (\n        'TransformBroadcaster(transforms = Compose(\\n' +\n        '    RandomApply(transforms = Compose(\\n' +\n        '    AddToValueaddend = 1' + '\\n), prob = 0.5)' +\n        '\\n), mapping = {\\'value\\': \\'values\\'}, ' +\n        'remapping = {\\'value\\': \\'values\\'}, auto_remap = True, ' +\n        'allow_nonexist_keys = False, share_random_params = True)')\n\n\ndef test_utils():\n    # Test cache_randomness: normal case\n    class DummyTransform(BaseTransform):\n\n        @cache_randomness\n        def func(self):\n            return np.random.rand()\n\n        def transform(self, results):\n            _ = self.func()\n            return results\n\n    transform = DummyTransform()\n    _ = transform({})\n    with cache_random_params(transform):\n        _ = transform({})\n\n    # Test cache_randomness: invalid function type\n    with pytest.raises(TypeError):\n\n        class DummyTransform(BaseTransform):\n\n            @cache_randomness\n            @staticmethod\n            def func():\n                return np.random.rand()\n\n            def transform(self, results):\n                return results\n\n    # Test cache_randomness: invalid function argument list\n    with pytest.raises(TypeError):\n\n        class 
DummyTransform(BaseTransform):\n\n            @cache_randomness\n            def func(cls):\n                return np.random.rand()\n\n            def transform(self, results):\n                return results\n\n    # Test avoid_cache_randomness: invalid mixture with cache_randomness\n    with pytest.raises(RuntimeError):\n\n        @avoid_cache_randomness\n        class DummyTransform(BaseTransform):\n\n            @cache_randomness\n            def func(self):\n                pass\n\n            def transform(self, results):\n                return results\n\n    # Test avoid_cache_randomness: raise error in cache_random_params\n    with pytest.raises(RuntimeError):\n\n        @avoid_cache_randomness\n        class DummyTransform(BaseTransform):\n\n            def transform(self, results):\n                return results\n\n        transform = DummyTransform()\n        with cache_random_params(transform):\n            pass\n\n    # Test avoid_cache_randomness: non-inheritable\n    @avoid_cache_randomness\n    class DummyBaseTransform(BaseTransform):\n\n        def transform(self, results):\n            return results\n\n    class DummyTransform(DummyBaseTransform):\n        pass\n\n    transform = DummyTransform()\n    with cache_random_params(transform):\n        pass\n"
  },
  {
    "path": "tests/test_utils/test_env.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport sys\n\nimport pytest\n\nimport mmcv\n\n\ndef test_collect_env():\n    try:\n        import torch  # noqa: F401\n    except ModuleNotFoundError:\n        pytest.skip('skipping tests that require PyTorch')\n\n    from mmcv.utils import collect_env\n    env_info = collect_env()\n    expected_keys = [\n        'sys.platform', 'Python', 'CUDA available', 'PyTorch',\n        'PyTorch compiling details', 'OpenCV', 'MMCV', 'MMCV Compiler', 'GCC',\n        'MMCV CUDA Compiler'\n    ]\n    for key in expected_keys:\n        assert key in env_info\n\n    if env_info['CUDA available']:\n        for key in ['CUDA_HOME', 'NVCC']:\n            assert key in env_info\n\n    if sys.platform == 'win32':\n        assert 'MSVC' in env_info\n\n    assert env_info['sys.platform'] == sys.platform\n    assert env_info['Python'] == sys.version.replace('\\n', '')\n    assert env_info['MMCV'] == mmcv.__version__\n"
  },
  {
    "path": "tests/test_utils/test_parrots_jit.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport pytest\nimport torch\nfrom mmengine.utils.dl_utils import TORCH_VERSION\n\nimport mmcv\n\npytest.skip('this test not ready now', allow_module_level=True)\nskip_no_parrots = pytest.mark.skipif(\n    TORCH_VERSION != 'parrots', reason='test case under parrots environment')\n\n\nclass TestJit:\n\n    def test_add_dict(self):\n\n        @mmcv.jit\n        def add_dict(oper):\n            rets = oper['x'] + oper['y']\n            return {'result': rets}\n\n        def add_dict_pyfunc(oper):\n            rets = oper['x'] + oper['y']\n            return {'result': rets}\n\n        a = torch.rand((3, 4))\n        b = torch.rand((3, 4))\n        oper = {'x': a, 'y': b}\n\n        rets_t = add_dict(oper)\n        rets = add_dict_pyfunc(oper)\n        assert 'result' in rets\n        assert (rets_t['result'] == rets['result']).all()\n\n    def test_add_list(self):\n\n        @mmcv.jit\n        def add_list(oper, x, y):\n            rets = {}\n            for idx, pair in enumerate(oper):\n                rets[f'k{idx}'] = pair['x'] + pair['y']\n            rets[f'k{len(oper)}'] = x + y\n            return rets\n\n        def add_list_pyfunc(oper, x, y):\n            rets = {}\n            for idx, pair in enumerate(oper):\n                rets[f'k{idx}'] = pair['x'] + pair['y']\n            rets[f'k{len(oper)}'] = x + y\n            return rets\n\n        pair_num = 3\n        oper = []\n        for _ in range(pair_num):\n            oper.append({'x': torch.rand((3, 4)), 'y': torch.rand((3, 4))})\n        a = torch.rand((3, 4))\n        b = torch.rand((3, 4))\n        rets = add_list_pyfunc(oper, x=a, y=b)\n        rets_t = add_list(oper, x=a, y=b)\n        for idx in range(pair_num + 1):\n            assert f'k{idx}' in rets_t\n            assert (rets[f'k{idx}'] == rets_t[f'k{idx}']).all()\n\n    @skip_no_parrots\n    def test_jit_cache(self):\n\n        @mmcv.jit\n        def func(oper):\n            
if oper['const'] > 1:\n                return oper['x'] * 2 + oper['y']\n            else:\n                return oper['x'] * 2 - oper['y']\n\n        def pyfunc(oper):\n            if oper['const'] > 1:\n                return oper['x'] * 2 + oper['y']\n            else:\n                return oper['x'] * 2 - oper['y']\n\n        assert len(func._cache._cache) == 0\n\n        oper = {'const': 2, 'x': torch.rand((3, 4)), 'y': torch.rand((3, 4))}\n        rets_plus = pyfunc(oper)\n        rets_plus_t = func(oper)\n        assert (rets_plus == rets_plus_t).all()\n        assert len(func._cache._cache) == 1\n\n        oper['const'] = 0.5\n        rets_minus = pyfunc(oper)\n        rets_minus_t = func(oper)\n        assert (rets_minus == rets_minus_t).all()\n        assert len(func._cache._cache) == 2\n\n        rets_a = (rets_minus_t + rets_plus_t) / 4\n        assert torch.allclose(oper['x'], rets_a)\n\n    @skip_no_parrots\n    def test_jit_shape(self):\n\n        @mmcv.jit\n        def func(a):\n            return a + 1\n\n        assert len(func._cache._cache) == 0\n\n        a = torch.ones((3, 4))\n        r = func(a)\n        assert r.shape == (3, 4)\n        assert (r == 2).all()\n        assert len(func._cache._cache) == 1\n\n        a = torch.ones((2, 3, 4))\n        r = func(a)\n        assert r.shape == (2, 3, 4)\n        assert (r == 2).all()\n        assert len(func._cache._cache) == 2\n\n    @skip_no_parrots\n    def test_jit_kwargs(self):\n\n        @mmcv.jit\n        def func(a, b):\n            return torch.mean((a - b) * (a - b))\n\n        assert len(func._cache._cache) == 0\n        x = torch.rand((16, 32))\n        y = torch.rand((16, 32))\n        func(x, y)\n        assert len(func._cache._cache) == 1\n        func(x, b=y)\n        assert len(func._cache._cache) == 1\n        func(b=y, a=x)\n        assert len(func._cache._cache) == 1\n\n    def test_jit_derivate(self):\n\n        @mmcv.jit(derivate=True)\n        def func(x, y):\n            
return (x + 2) * (y - 2)\n\n        a = torch.rand((3, 4))\n        b = torch.rand((3, 4))\n        a.requires_grad = True\n\n        c = func(a, b)\n        assert c.requires_grad\n        d = torch.empty_like(c)\n        d.fill_(1.0)\n        c.backward(d)\n        assert torch.allclose(a.grad, (b - 2))\n        assert b.grad is None\n\n        a.grad = None\n        c = func(a, b)\n        assert c.requires_grad\n        d = torch.empty_like(c)\n        d.fill_(2.7)\n        c.backward(d)\n        assert torch.allclose(a.grad, 2.7 * (b - 2))\n        assert b.grad is None\n\n    def test_jit_optimize(self):\n\n        @mmcv.jit(optimize=True)\n        def func(a, b):\n            return torch.mean((a - b) * (a - b))\n\n        def pyfunc(a, b):\n            return torch.mean((a - b) * (a - b))\n\n        a = torch.rand((16, 32))\n        b = torch.rand((16, 32))\n\n        c = func(a, b)\n        d = pyfunc(a, b)\n        assert torch.allclose(c, d)\n\n    @mmcv.skip_no_elena\n    def test_jit_coderize(self):\n        if not torch.cuda.is_available():\n            return\n\n        @mmcv.jit(coderize=True)\n        def func(a, b):\n            return (a + b) * (a - b)\n\n        def pyfunc(a, b):\n            return (a + b) * (a - b)\n\n        a = torch.rand((16, 32), device='cuda')\n        b = torch.rand((16, 32), device='cuda')\n\n        c = func(a, b)\n        d = pyfunc(a, b)\n        assert torch.allclose(c, d)\n\n    def test_jit_value_dependent(self):\n\n        @mmcv.jit\n        def func(a, b):\n            torch.nonzero(a)\n            return torch.mean((a - b) * (a - b))\n\n        def pyfunc(a, b):\n            torch.nonzero(a)\n            return torch.mean((a - b) * (a - b))\n\n        a = torch.rand((16, 32))\n        b = torch.rand((16, 32))\n\n        c = func(a, b)\n        d = pyfunc(a, b)\n        assert torch.allclose(c, d)\n\n    @skip_no_parrots\n    def test_jit_check_input(self):\n\n        def func(x):\n            y = 
torch.rand_like(x)\n            return x + y\n\n        a = torch.ones((3, 4))\n        with pytest.raises(AssertionError):\n            func = mmcv.jit(func, check_input=(a, ))\n\n    @skip_no_parrots\n    def test_jit_partial_shape(self):\n\n        @mmcv.jit(full_shape=False)\n        def func(a, b):\n            return torch.mean((a - b) * (a - b))\n\n        def pyfunc(a, b):\n            return torch.mean((a - b) * (a - b))\n\n        a = torch.rand((3, 4))\n        b = torch.rand((3, 4))\n        assert torch.allclose(func(a, b), pyfunc(a, b))\n        assert len(func._cache._cache) == 1\n\n        a = torch.rand((6, 5))\n        b = torch.rand((6, 5))\n        assert torch.allclose(func(a, b), pyfunc(a, b))\n        assert len(func._cache._cache) == 1\n\n        a = torch.rand((3, 4, 5))\n        b = torch.rand((3, 4, 5))\n        assert torch.allclose(func(a, b), pyfunc(a, b))\n        assert len(func._cache._cache) == 2\n\n        a = torch.rand((1, 9, 8))\n        b = torch.rand((1, 9, 8))\n        assert torch.allclose(func(a, b), pyfunc(a, b))\n        assert len(func._cache._cache) == 2\n\n    def test_instance_method(self):\n\n        class T:\n\n            def __init__(self, shape):\n                self._c = torch.rand(shape)\n\n            @mmcv.jit\n            def test_method(self, x, y):\n                return (x * self._c) + y\n\n        shape = (16, 32)\n        t = T(shape)\n        a = torch.rand(shape)\n        b = torch.rand(shape)\n        res = (a * t._c) + b\n        jit_res = t.test_method(a, b)\n        assert torch.allclose(res, jit_res)\n\n        t = T(shape)\n        res = (a * t._c) + b\n        jit_res = t.test_method(a, b)\n        assert torch.allclose(res, jit_res)\n"
  },
  {
    "path": "tests/test_video/test_optflow.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nimport os.path as osp\nimport tempfile\n\nimport cv2\nimport numpy as np\nimport pytest\nfrom numpy.testing import assert_array_almost_equal, assert_array_equal\n\nimport mmcv\n\n\ndef test_flowread():\n    data_dir = osp.join(osp.dirname(__file__), '../data')\n    flow_shape = (60, 80, 2)\n\n    # read .flo file\n    flow = mmcv.flowread(osp.join(data_dir, 'optflow.flo'))\n    assert flow.shape == flow_shape\n\n    # pseudo read\n    flow_same = mmcv.flowread(flow)\n    assert_array_equal(flow, flow_same)\n\n    # read quantized flow concatenated vertically\n    flow = mmcv.flowread(\n        osp.join(data_dir, 'optflow_concat0.jpg'), quantize=True, denorm=True)\n    assert flow.shape == flow_shape\n\n    # read quantized flow concatenated horizontally\n    flow = mmcv.flowread(\n        osp.join(data_dir, 'optflow_concat1.jpg'),\n        quantize=True,\n        concat_axis=1,\n        denorm=True)\n    assert flow.shape == flow_shape\n\n    # test exceptions\n    notflow_file = osp.join(data_dir, 'color.jpg')\n    with pytest.raises(TypeError):\n        mmcv.flowread(1)\n    with pytest.raises(IOError):\n        mmcv.flowread(notflow_file)\n    with pytest.raises(IOError):\n        mmcv.flowread(notflow_file, quantize=True)\n    with pytest.raises(ValueError):\n        mmcv.flowread(np.zeros((100, 100, 1)))\n\n\ndef test_flowwrite():\n    flow = np.random.rand(100, 100, 2).astype(np.float32)\n\n    # write to a .flo file\n    tmp_filehandler, filename = tempfile.mkstemp()\n    mmcv.flowwrite(flow, filename)\n    flow_from_file = mmcv.flowread(filename)\n    assert_array_equal(flow, flow_from_file)\n    os.close(tmp_filehandler)\n    os.remove(filename)\n\n    # write to two .jpg files\n    tmp_filename = osp.join(tempfile.gettempdir(), 'mmcv_test_flow.jpg')\n    for concat_axis in range(2):\n        mmcv.flowwrite(\n            flow, tmp_filename, quantize=True, concat_axis=concat_axis)\n 
       shape = (200, 100) if concat_axis == 0 else (100, 200)\n        assert osp.isfile(tmp_filename)\n        assert mmcv.imread(tmp_filename, flag='unchanged').shape == shape\n        os.remove(tmp_filename)\n\n    # test exceptions\n    with pytest.raises(AssertionError):\n        mmcv.flowwrite(flow, tmp_filename, quantize=True, concat_axis=2)\n\n\ndef test_quantize_flow():\n    flow = (np.random.rand(10, 8, 2).astype(np.float32) - 0.5) * 15\n    max_val = 5.0\n    dx, dy = mmcv.quantize_flow(flow, max_val=max_val, norm=False)\n    ref = np.zeros_like(flow, dtype=np.uint8)\n    for i in range(ref.shape[0]):\n        for j in range(ref.shape[1]):\n            for k in range(ref.shape[2]):\n                val = flow[i, j, k] + max_val\n                val = min(max(val, 0), 2 * max_val)\n                ref[i, j, k] = min(np.floor(255 * val / (2 * max_val)), 254)\n    assert_array_equal(dx, ref[..., 0])\n    assert_array_equal(dy, ref[..., 1])\n    max_val = 0.5\n    dx, dy = mmcv.quantize_flow(flow, max_val=max_val, norm=True)\n    ref = np.zeros_like(flow, dtype=np.uint8)\n    for i in range(ref.shape[0]):\n        for j in range(ref.shape[1]):\n            for k in range(ref.shape[2]):\n                scale = flow.shape[1] if k == 0 else flow.shape[0]\n                val = flow[i, j, k] / scale + max_val\n                val = min(max(val, 0), 2 * max_val)\n                ref[i, j, k] = min(np.floor(255 * val / (2 * max_val)), 254)\n    assert_array_equal(dx, ref[..., 0])\n    assert_array_equal(dy, ref[..., 1])\n\n\ndef test_dequantize_flow():\n    dx = np.random.randint(256, size=(10, 8), dtype=np.uint8)\n    dy = np.random.randint(256, size=(10, 8), dtype=np.uint8)\n    max_val = 5.0\n    flow = mmcv.dequantize_flow(dx, dy, max_val=max_val, denorm=False)\n    ref = np.zeros_like(flow, dtype=np.float32)\n    for i in range(ref.shape[0]):\n        for j in range(ref.shape[1]):\n            ref[i, j, 0] = float(dx[i, j] + 0.5) * 2 * max_val / 255 - 
max_val\n            ref[i, j, 1] = float(dy[i, j] + 0.5) * 2 * max_val / 255 - max_val\n    assert_array_almost_equal(flow, ref)\n    max_val = 0.5\n    flow = mmcv.dequantize_flow(dx, dy, max_val=max_val, denorm=True)\n    h, w = dx.shape\n    ref = np.zeros_like(flow, dtype=np.float32)\n    for i in range(ref.shape[0]):\n        for j in range(ref.shape[1]):\n            ref[i, j,\n                0] = (float(dx[i, j] + 0.5) * 2 * max_val / 255 - max_val) * w\n            ref[i, j,\n                1] = (float(dy[i, j] + 0.5) * 2 * max_val / 255 - max_val) * h\n    assert_array_almost_equal(flow, ref)\n\n\ndef test_flow2rgb():\n    flow = np.array([[[0, 0], [0.5, 0.5], [1, 1], [2, 1], [3, np.inf]]],\n                    dtype=np.float32)\n    flow_img = mmcv.flow2rgb(flow)\n    # yapf: disable\n    assert_array_almost_equal(\n        flow_img,\n        np.array([[[1., 1., 1.],\n                   [1., 0.826074731, 0.683772236],\n                   [1., 0.652149462, 0.367544472],\n                   [1., 0.265650552, 5.96046448e-08],\n                   [0., 0., 0.]]],\n                 dtype=np.float32))\n    # yapf: enable\n\n\ndef test_flow_warp():\n\n    img = np.zeros((5, 5, 3))\n    img[2, 2, 0] = 1\n    flow = np.ones((5, 5, 2))\n\n    res_nn = mmcv.flow_warp(img, flow, interpolate_mode='nearest')\n    res_bi = mmcv.flow_warp(img, flow, interpolate_mode='bilinear')\n\n    assert_array_almost_equal(res_nn, res_bi, decimal=5)\n\n    img = np.zeros((5, 5, 1))\n    img[2, 2, 0] = 1\n    img[2, 3, 0] = 0.75\n    flow = np.zeros((5, 5, 2))\n    flow[2, 2, :] = [0.5, 0.7]\n\n    res_ = np.copy(img)\n    res_[2, 2] = 0.5 * 0.3 + 0.75 * 0.5 * 0.3\n    res_bi = mmcv.flow_warp(img, flow, interpolate_mode='bilinear')\n    assert_array_almost_equal(res_, res_bi, decimal=5)\n\n    with pytest.raises(NotImplementedError):\n        _ = mmcv.flow_warp(img, flow, interpolate_mode='xxx')\n\n    with pytest.raises(AssertionError):\n        _ = mmcv.flow_warp(img, flow[:, :, 
0], interpolate_mode='xxx')\n\n\ndef test_make_color_wheel():\n    default_color_wheel = mmcv.make_color_wheel()\n    color_wheel = mmcv.make_color_wheel([2, 2, 2, 2, 2, 2])\n    # yapf: disable\n    assert_array_equal(default_color_wheel, np.array(\n        [[1.       , 0.        , 0.        ],  # noqa\n        [1.        , 0.06666667, 0.        ],  # noqa\n        [1.        , 0.13333334, 0.        ],  # noqa\n        [1.        , 0.2       , 0.        ],  # noqa\n        [1.        , 0.26666668, 0.        ],  # noqa\n        [1.        , 0.33333334, 0.        ],  # noqa\n        [1.        , 0.4       , 0.        ],  # noqa\n        [1.        , 0.46666667, 0.        ],  # noqa\n        [1.        , 0.53333336, 0.        ],  # noqa\n        [1.        , 0.6       , 0.        ],  # noqa\n        [1.        , 0.6666667 , 0.        ],  # noqa\n        [1.        , 0.73333335, 0.        ],  # noqa\n        [1.        , 0.8       , 0.        ],  # noqa\n        [1.        , 0.8666667 , 0.        ],  # noqa\n        [1.        , 0.93333334, 0.        ],  # noqa\n        [1.        , 1.        , 0.        ],  # noqa\n        [0.8333333 , 1.        , 0.        ],  # noqa\n        [0.6666667 , 1.        , 0.        ],  # noqa\n        [0.5       , 1.        , 0.        ],  # noqa\n        [0.33333334, 1.        , 0.        ],  # noqa\n        [0.16666667, 1.        , 0.        ],  # noqa\n        [0.        , 1.        , 0.        ],  # noqa\n        [0.        , 1.        , 0.25      ],  # noqa\n        [0.        , 1.        , 0.5       ],  # noqa\n        [0.        , 1.        , 0.75      ],  # noqa\n        [0.        , 1.        , 1.        ],  # noqa\n        [0.        , 0.90909094, 1.        ],  # noqa\n        [0.        , 0.8181818 , 1.        ],  # noqa\n        [0.        , 0.72727275, 1.        ],  # noqa\n        [0.        , 0.6363636 , 1.        ],  # noqa\n        [0.        , 0.54545456, 1.        ],  # noqa\n        [0.        , 0.45454547, 1.        
],  # noqa\n        [0.        , 0.36363637, 1.        ],  # noqa\n        [0.        , 0.27272728, 1.        ],  # noqa\n        [0.        , 0.18181819, 1.        ],  # noqa\n        [0.        , 0.09090909, 1.        ],  # noqa\n        [0.        , 0.        , 1.        ],  # noqa\n        [0.07692308, 0.        , 1.        ],  # noqa\n        [0.15384616, 0.        , 1.        ],  # noqa\n        [0.23076923, 0.        , 1.        ],  # noqa\n        [0.30769232, 0.        , 1.        ],  # noqa\n        [0.3846154 , 0.        , 1.        ],  # noqa\n        [0.46153846, 0.        , 1.        ],  # noqa\n        [0.53846157, 0.        , 1.        ],  # noqa\n        [0.61538464, 0.        , 1.        ],  # noqa\n        [0.6923077 , 0.        , 1.        ],  # noqa\n        [0.7692308 , 0.        , 1.        ],  # noqa\n        [0.84615386, 0.        , 1.        ],  # noqa\n        [0.9230769 , 0.        , 1.        ],  # noqa\n        [1.        , 0.        , 1.        ],  # noqa\n        [1.        , 0.        , 0.8333333 ],  # noqa\n        [1.        , 0.        , 0.6666667 ],  # noqa\n        [1.        , 0.        , 0.5       ],  # noqa\n        [1.        , 0.        , 0.33333334],  # noqa\n        [1.        , 0.        , 0.16666667]], dtype=np.float32))  # noqa\n\n    assert_array_equal(\n        color_wheel,\n        np.array([[1., 0. , 0. ],  # noqa\n                 [1. , 0.5, 0. ],  # noqa\n                 [1. , 1. , 0. ],  # noqa\n                 [0.5, 1. , 0. ],  # noqa\n                 [0. , 1. , 0. ],  # noqa\n                 [0. , 1. , 0.5],  # noqa\n                 [0. , 1. , 1. ],  # noqa\n                 [0. , 0.5, 1. ],  # noqa\n                 [0. , 0. , 1. ],  # noqa\n                 [0.5, 0. , 1. ],  # noqa\n                 [1. , 0. , 1. ],  # noqa\n                 [1. , 0. 
, 0.5]], dtype=np.float32))  # noqa\n    # yapf: enable\n\n\ndef test_flow_from_bytes():\n    data_dir = osp.join(osp.dirname(__file__), '../data')\n    flow_shape = (60, 80, 2)\n    flow_file = osp.join(data_dir, 'optflow.flo')\n\n    # read .flo file\n    flow_fromfile = mmcv.flowread(flow_file)\n\n    with open(flow_file, 'rb') as f:\n        flow_bytes = f.read()\n    flow_frombytes = mmcv.flow_from_bytes(flow_bytes)\n\n    assert flow_frombytes.shape == flow_shape\n    assert np.all(flow_frombytes == flow_fromfile)\n\n\ndef test_sparse_flow_from_bytes():\n    data_dir = osp.join(osp.dirname(__file__), '../data')\n    flow_file = osp.join(data_dir, 'sparse_flow.png')\n\n    with open(flow_file, 'rb') as f:\n        flow_bytes = f.read()\n    # read flow from bytes\n    flow_frombytes, valid_frombytes = mmcv.sparse_flow_from_bytes(flow_bytes)\n\n    # test flow shape is [H, W, 2] and valid shape is [H, W]\n    assert flow_frombytes.shape[:2] == valid_frombytes.shape\n    assert flow_frombytes.shape[2] == 2\n\n    def read_sparse_flow_from_file():\n        flow = cv2.imread(flow_file, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR)\n        flow = flow[:, :, ::-1].astype(np.float32)\n        flow, valid = flow[:, :, :2], flow[:, :, 2]\n        flow = (flow - 2**15) / 64.0\n        return flow, valid\n\n    # read flow from file\n    flow_flowfile, valid_fromfile = read_sparse_flow_from_file()\n\n    assert np.all(flow_frombytes == flow_flowfile)\n    assert np.all(valid_frombytes == valid_fromfile)\n"
  },
  {
    "path": "tests/test_video/test_processing.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nimport os.path as osp\nimport platform\nimport tempfile\n\nimport pytest\n\nimport mmcv\n\n\nclass TestVideoEditor:\n\n    @classmethod\n    def setup_class(cls):\n        cls.video_path = osp.join(osp.dirname(__file__), '../data/test.mp4')\n        cls.num_frames = 168\n\n    @pytest.mark.skipif(platform.system() == 'Windows', reason='skip windows')\n    def test_cut_concat_video(self):\n        part1_file = osp.join(tempfile.gettempdir(), '.mmcv_test1.mp4')\n        part2_file = osp.join(tempfile.gettempdir(), '.mmcv_test2.mp4')\n        mmcv.cut_video(self.video_path, part1_file, end=3, vcodec='h264')\n        mmcv.cut_video(self.video_path, part2_file, start=3, vcodec='h264')\n        v1 = mmcv.VideoReader(part1_file)\n        v2 = mmcv.VideoReader(part2_file)\n        assert len(v1) == 75\n        assert len(v2) == self.num_frames - 75\n\n        out_file = osp.join(tempfile.gettempdir(), '.mmcv_test.mp4')\n        mmcv.concat_video([part1_file, part2_file], out_file)\n        v = mmcv.VideoReader(out_file)\n        assert len(v) == self.num_frames\n        os.remove(part1_file)\n        os.remove(part2_file)\n        os.remove(out_file)\n\n    @pytest.mark.skipif(platform.system() == 'Windows', reason='skip windows')\n    def test_resize_video(self):\n        out_file = osp.join(tempfile.gettempdir(), '.mmcv_test.mp4')\n        mmcv.resize_video(\n            self.video_path, out_file, (200, 100), log_level='panic')\n        v = mmcv.VideoReader(out_file)\n        assert v.resolution == (200, 100)\n        os.remove(out_file)\n        mmcv.resize_video(self.video_path, out_file, ratio=2)\n        v = mmcv.VideoReader(out_file)\n        assert v.resolution == (294 * 2, 240 * 2)\n        os.remove(out_file)\n        mmcv.resize_video(self.video_path, out_file, (1000, 480), keep_ar=True)\n        v = mmcv.VideoReader(out_file)\n        assert v.resolution == (294 * 2, 240 * 2)\n        
os.remove(out_file)\n        mmcv.resize_video(\n            self.video_path, out_file, ratio=(2, 1.5), keep_ar=True)\n        v = mmcv.VideoReader(out_file)\n        assert v.resolution == (294 * 2, 360)\n        os.remove(out_file)\n"
  },
  {
    "path": "tests/test_video/test_reader.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport os\nimport os.path as osp\nimport shutil\nimport tempfile\nfrom collections import OrderedDict\n\nimport pytest\n\nimport mmcv\n\n\nclass TestCache:\n\n    def test_init(self):\n        with pytest.raises(ValueError):\n            mmcv.Cache(0)\n        cache = mmcv.Cache(100)\n        assert cache.capacity == 100\n        assert cache.size == 0\n\n    def test_put(self):\n        cache = mmcv.Cache(3)\n        for i in range(1, 4):\n            cache.put(f'k{i}', i)\n            assert cache.size == i\n        assert cache._cache == OrderedDict([('k1', 1), ('k2', 2), ('k3', 3)])\n        cache.put('k4', 4)\n        assert cache.size == 3\n        assert cache._cache == OrderedDict([('k2', 2), ('k3', 3), ('k4', 4)])\n        cache.put('k2', 2)\n        assert cache._cache == OrderedDict([('k2', 2), ('k3', 3), ('k4', 4)])\n\n    def test_get(self):\n        cache = mmcv.Cache(3)\n        assert cache.get('key_none') is None\n        assert cache.get('key_none', 0) == 0\n        cache.put('k1', 1)\n        assert cache.get('k1') == 1\n\n\nclass TestVideoReader:\n\n    @classmethod\n    def setup_class(cls):\n        cls.video_path = osp.join(osp.dirname(__file__), '../data/test.mp4')\n        cls.num_frames = 168\n        cls.video_url = 'https://download.openmmlab.com/mmcv/test_data/sample-mp4-file.mp4'  # noqa: E501\n\n    def test_load(self):\n        # read from video file\n        v = mmcv.VideoReader(self.video_path)\n        assert v.width == 294\n        assert v.height == 240\n        assert v.fps == 25\n        assert v.frame_cnt == self.num_frames\n        assert len(v) == self.num_frames\n        assert v.opened\n        import cv2\n        assert isinstance(v.vcap, type(cv2.VideoCapture()))\n\n        # read from video url\n        v = mmcv.VideoReader(self.video_url)\n        assert v.width == 320\n        assert v.height == 240\n        assert v.fps == 15\n        assert 
v.frame_cnt == 1889\n        assert len(v) == 1889\n        assert v.opened\n        assert isinstance(v.vcap, type(cv2.VideoCapture()))\n\n    def test_read(self):\n        v = mmcv.VideoReader(self.video_path)\n        img = v.read()\n        assert int(round(img.mean())) == 94\n        img = v.get_frame(63)\n        assert int(round(img.mean())) == 94\n        img = v[64]\n        assert int(round(img.mean())) == 205\n        img = v[-104]\n        assert int(round(img.mean())) == 205\n        img = v[63]\n        assert int(round(img.mean())) == 94\n        img = v[-105]\n        assert int(round(img.mean())) == 94\n        img = v.read()\n        assert int(round(img.mean())) == 205\n        with pytest.raises(IndexError):\n            v.get_frame(self.num_frames + 1)\n        with pytest.raises(IndexError):\n            v[-self.num_frames - 1]\n\n    def test_slice(self):\n        v = mmcv.VideoReader(self.video_path)\n        imgs = v[-105:-103]\n        assert int(round(imgs[0].mean())) == 94\n        assert int(round(imgs[1].mean())) == 205\n        assert len(imgs) == 2\n        imgs = v[63:65]\n        assert int(round(imgs[0].mean())) == 94\n        assert int(round(imgs[1].mean())) == 205\n        assert len(imgs) == 2\n        imgs = v[64:62:-1]\n        assert int(round(imgs[0].mean())) == 205\n        assert int(round(imgs[1].mean())) == 94\n        assert len(imgs) == 2\n        imgs = v[:5]\n        assert len(imgs) == 5\n        for img in imgs:\n            assert int(round(img.mean())) == 94\n        imgs = v[165:]\n        assert len(imgs) == 3\n        for img in imgs:\n            assert int(round(img.mean())) == 0\n        imgs = v[-3:]\n        assert len(imgs) == 3\n        for img in imgs:\n            assert int(round(img.mean())) == 0\n\n    def test_current_frame(self):\n        v = mmcv.VideoReader(self.video_path)\n        assert v.current_frame() is None\n        v.read()\n        img = v.current_frame()\n        assert 
int(round(img.mean())) == 94\n\n    def test_position(self):\n        v = mmcv.VideoReader(self.video_path)\n        assert v.position == 0\n        for _ in range(10):\n            v.read()\n        assert v.position == 10\n        v.get_frame(99)\n        assert v.position == 100\n\n    def test_iterator(self):\n        cnt = 0\n        for img in mmcv.VideoReader(self.video_path):\n            cnt += 1\n            assert img.shape == (240, 294, 3)\n        assert cnt == self.num_frames\n\n    def test_with(self):\n        with mmcv.VideoReader(self.video_path) as v:\n            assert v.opened\n        assert not v.opened\n\n    def test_cvt2frames(self):\n        v = mmcv.VideoReader(self.video_path)\n        frame_dir = tempfile.mkdtemp()\n        v.cvt2frames(frame_dir)\n        assert osp.isdir(frame_dir)\n        for i in range(self.num_frames):\n            filename = f'{frame_dir}/{i:06d}.jpg'\n            assert osp.isfile(filename)\n            os.remove(filename)\n\n        v = mmcv.VideoReader(self.video_path)\n        v.cvt2frames(frame_dir, show_progress=False)\n        assert osp.isdir(frame_dir)\n        for i in range(self.num_frames):\n            filename = f'{frame_dir}/{i:06d}.jpg'\n            assert osp.isfile(filename)\n            os.remove(filename)\n\n        v = mmcv.VideoReader(self.video_path)\n        v.cvt2frames(\n            frame_dir,\n            file_start=100,\n            filename_tmpl='{:03d}.JPEG',\n            start=100,\n            max_num=20)\n        assert osp.isdir(frame_dir)\n        for i in range(100, 120):\n            filename = f'{frame_dir}/{i:03d}.JPEG'\n            assert osp.isfile(filename)\n            os.remove(filename)\n        shutil.rmtree(frame_dir)\n\n    def test_frames2video(self):\n        v = mmcv.VideoReader(self.video_path)\n        frame_dir = tempfile.mkdtemp()\n        v.cvt2frames(frame_dir)\n        assert osp.isdir(frame_dir)\n        for i in range(self.num_frames):\n            
filename = f'{frame_dir}/{i:06d}.jpg'\n            assert osp.isfile(filename)\n\n        out_filename = osp.join(tempfile.gettempdir(), 'mmcv_test.avi')\n        mmcv.frames2video(frame_dir, out_filename)\n        v = mmcv.VideoReader(out_filename)\n        assert v.fps == 30\n        assert len(v) == self.num_frames\n\n        mmcv.frames2video(\n            frame_dir,\n            out_filename,\n            fps=25,\n            start=10,\n            end=50,\n            show_progress=False)\n\n        with mmcv.VideoReader(out_filename) as v:\n            assert v.fps == 25\n            assert len(v) == 40\n\n            for i in range(self.num_frames):\n                filename = f'{frame_dir}/{i:06d}.jpg'\n                os.remove(filename)\n            shutil.rmtree(frame_dir)\n"
  },
  {
    "path": "tests/test_visualization.py",
    "content": "# Copyright (c) OpenMMLab. All rights reserved.\nimport numpy as np\nimport pytest\n\nimport mmcv\n\n\ndef test_color():\n    assert mmcv.color_val(mmcv.Color.blue) == (255, 0, 0)\n    assert mmcv.color_val('green') == (0, 255, 0)\n    assert mmcv.color_val((1, 2, 3)) == (1, 2, 3)\n    assert mmcv.color_val(100) == (100, 100, 100)\n    assert mmcv.color_val(np.zeros(3, dtype=int)) == (0, 0, 0)\n    with pytest.raises(TypeError):\n        mmcv.color_val([255, 255, 255])\n    with pytest.raises(TypeError):\n        mmcv.color_val(1.0)\n    with pytest.raises(AssertionError):\n        mmcv.color_val((0, 0, 500))\n"
  }
]