Repository: hancyran/RepSurf
Branch: main
Commit: 29bacd52fe61
Files: 115
Total size: 2.6 MB
Directory structure:
gitextract_ziqlnw07/
├── .gitignore
├── LICENSE.txt
├── README.md
├── classification/
│ ├── README.md
│ ├── dataset/
│ │ ├── ScanObjectNNDataLoader.py
│ │ └── __init__.py
│ ├── init.sh
│ ├── models/
│ │ ├── __init__.py
│ │ └── repsurf/
│ │ ├── __init__.py
│ │ ├── repsurf_ssg_umb.py
│ │ └── repsurf_ssg_umb_2x.py
│ ├── modules/
│ │ ├── __init__.py
│ │ ├── pointnet2_utils.py
│ │ ├── pointops/
│ │ │ ├── __init__.py
│ │ │ ├── functions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pointops.py
│ │ │ ├── setup.py
│ │ │ └── src/
│ │ │ ├── __init__.py
│ │ │ ├── ballquery/
│ │ │ │ ├── ballquery_cuda.cpp
│ │ │ │ ├── ballquery_cuda_kernel.cu
│ │ │ │ └── ballquery_cuda_kernel.h
│ │ │ ├── cuda_utils.h
│ │ │ ├── grouping/
│ │ │ │ ├── grouping_cuda.cpp
│ │ │ │ ├── grouping_cuda_kernel.cu
│ │ │ │ └── grouping_cuda_kernel.h
│ │ │ ├── grouping_int/
│ │ │ │ ├── grouping_int_cuda.cpp
│ │ │ │ ├── grouping_int_cuda_kernel.cu
│ │ │ │ └── grouping_int_cuda_kernel.h
│ │ │ ├── interpolation/
│ │ │ │ ├── interpolation_cuda.cpp
│ │ │ │ ├── interpolation_cuda_kernel.cu
│ │ │ │ └── interpolation_cuda_kernel.h
│ │ │ ├── knnquery/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── knnquery_cuda.cpp
│ │ │ │ ├── knnquery_cuda_kernel.cu
│ │ │ │ └── knnquery_cuda_kernel.h
│ │ │ ├── knnquery_heap/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── knnquery_heap_cuda.cpp
│ │ │ │ ├── knnquery_heap_cuda_kernel.cu
│ │ │ │ └── knnquery_heap_cuda_kernel.h
│ │ │ ├── pointops_api.cpp
│ │ │ └── sampling/
│ │ │ ├── sampling_cuda.cpp
│ │ │ ├── sampling_cuda_kernel.cu
│ │ │ └── sampling_cuda_kernel.h
│ │ ├── polar_utils.py
│ │ ├── ptaug_utils.py
│ │ ├── recons_utils.py
│ │ └── repsurface_utils.py
│ ├── scripts/
│ │ └── scanobjectnn/
│ │ ├── repsurf_ssg_umb.sh
│ │ └── repsurf_ssg_umb_2x.sh
│ ├── tool/
│ │ └── train_cls_scanobjectnn.py
│ └── util/
│ ├── __init__.py
│ └── utils.py
├── segmentation/
│ ├── README.md
│ ├── dataset/
│ │ ├── S3DISDataLoader.py
│ │ └── __init__.py
│ ├── init.sh
│ ├── models/
│ │ ├── __init__.py
│ │ ├── pointnet2/
│ │ │ ├── __init__.py
│ │ │ └── pointnet2_ssg.py
│ │ ├── pointtransformer/
│ │ │ ├── __init__.py
│ │ │ └── pointtransformer.py
│ │ └── repsurf/
│ │ ├── __init__.py
│ │ └── repsurf_umb_ssg.py
│ ├── modules/
│ │ ├── __init__.py
│ │ ├── aug_utils.py
│ │ ├── pointnet2_utils.py
│ │ ├── pointops/
│ │ │ ├── __init__.py
│ │ │ ├── functions/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pointops.py
│ │ │ ├── setup.py
│ │ │ └── src/
│ │ │ ├── __init__.py
│ │ │ ├── aggregation/
│ │ │ │ ├── aggregation_cuda.cpp
│ │ │ │ ├── aggregation_cuda_kernel.cu
│ │ │ │ └── aggregation_cuda_kernel.h
│ │ │ ├── cuda_utils.h
│ │ │ ├── grouping/
│ │ │ │ ├── grouping_cuda.cpp
│ │ │ │ ├── grouping_cuda_kernel.cu
│ │ │ │ └── grouping_cuda_kernel.h
│ │ │ ├── interpolation/
│ │ │ │ ├── interpolation_cuda.cpp
│ │ │ │ ├── interpolation_cuda_kernel.cu
│ │ │ │ └── interpolation_cuda_kernel.h
│ │ │ ├── knnquery/
│ │ │ │ ├── knnquery_cuda.cpp
│ │ │ │ ├── knnquery_cuda_kernel.cu
│ │ │ │ └── knnquery_cuda_kernel.h
│ │ │ ├── pointops_api.cpp
│ │ │ ├── sampling/
│ │ │ │ ├── sampling_cuda.cpp
│ │ │ │ ├── sampling_cuda_kernel.cu
│ │ │ │ └── sampling_cuda_kernel.h
│ │ │ └── subtraction/
│ │ │ ├── subtraction_cuda.cpp
│ │ │ ├── subtraction_cuda_kernel.cu
│ │ │ └── subtraction_cuda_kernel.h
│ │ ├── pointtransformer_utils.py
│ │ ├── polar_utils.py
│ │ ├── recons_utils.py
│ │ ├── repsurface_utils.py
│ │ └── voxelize_utils.py
│ ├── scripts/
│ │ └── s3dis/
│ │ ├── test_pointnet2.sh
│ │ ├── test_pointtransformer.sh
│ │ ├── test_repsurf_umb.sh
│ │ ├── train_pointnet2.sh
│ │ ├── train_pointtransformer.sh
│ │ └── train_repsurf_umb.sh
│ ├── tool/
│ │ ├── test_s3dis.py
│ │ └── train.py
│ └── util/
│ ├── __init__.py
│ ├── data_util.py
│ └── utils.py
└── visualization/
├── airplane_0001.txt
├── bed_0001.txt
├── cup_0001.txt
├── table_0250.txt
├── triangled_airplane.obj
├── triangled_bed.obj
├── triangled_cup.obj
└── triangled_table.obj
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
================================================
FILE: LICENSE.txt
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2022 Haoxi Ran.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# RepSurf - Surface Representation for Point Clouds
[CVPR 2022 Oral]
By *[Haoxi Ran\*](https://hancyran.github.io/) , Jun Liu, Chengjie Wang* ( * : corresponding contact)
[](https://paperswithcode.com/sota/3d-point-cloud-classification-on-scanobjectnn?p=surface-representation-for-point-clouds)
[](https://paperswithcode.com/sota/3d-object-detection-on-sun-rgbd-val?p=surface-representation-for-point-clouds)
[](https://paperswithcode.com/sota/3d-point-cloud-classification-on-modelnet40?p=surface-representation-for-point-clouds)
[](https://paperswithcode.com/sota/semantic-segmentation-on-s3dis?p=surface-representation-for-point-clouds)
[](https://paperswithcode.com/sota/3d-object-detection-on-scannetv2?p=surface-representation-for-point-clouds)
[](https://paperswithcode.com/sota/semantic-segmentation-on-s3dis-area5?p=surface-representation-for-point-clouds)
### The pytorch official implementation of "[Surface Representation for Point Clouds](http://arxiv.org/abs/2205.05740)"
### [PDF](https://openaccess.thecvf.com/content/CVPR2022/papers/Ran_Surface_Representation_for_Point_Clouds_CVPR_2022_paper.pdf) | [arXiv](http://arxiv.org/abs/2205.05740)
## News:
- (**Sep 10** NEW :fire:) We have uploaded the implementation of RepSurf on S3DIS along with its training log and pretrained weights.
- (**June 24** :fire:) We successfully finished our Oral presentation at CVPR 2022!
- (**May 11**) We have uploaded the implementation of RepSurf on ScanObjectNN along with its training log and pretrained weights.
## Tasks:
### We conduct experiments of different tasks on different codebases:
> Classification: **[3D Object Classification](./classification)**
> Segmentation: **[3D Semantic Segmentation](./segmentation)**
## Visualization
We provide several visualization results in the folder **./visualization** for a closer look at the construction of
RepSurf.
## License
RepSurf is under the Apache-2.0 license. Please contact the primary author **Haoxi Ran (ranhaoxi@gmail.com)** for
commercial use.
================================================
FILE: classification/README.md
================================================
# RepSurf for Classification
By *[Haoxi Ran\*](https://hancyran.github.io/) , Jun Liu, Chengjie Wang* ( * : corresponding contact)
### [PDF](https://openaccess.thecvf.com/content/CVPR2022/papers/Ran_Surface_Representation_for_Point_Clouds_CVPR_2022_paper.pdf) | [arXiv](http://arxiv.org/abs/2205.05740)
## Preparation
### Environment
We tested under the environment:
* python 3.7
* pytorch 1.6.0
* cuda 10.1
* gcc 7.2.0
* h5py
For anaconda user, initialize the conda environment **repsurf-cls** by:
```
sh init.sh
```
## Experiments
### ScanObjectNN (Data & Logs: [Google Drive](https://drive.google.com/drive/folders/1DGWT9W46MKVI0-lu18hJhB-R3BFVWuCs?usp=sharing))
* Performance:
* To download dataset:
```
wget https://download.cs.stanford.edu/orion/scanobjectnn/h5_files.zip
unzip h5_files.zip
ln -s [PATH]/h5_files data/ScanObjectNN
```
**Note**: We conduct all experiments on the hardest variant of ScanObjectNN (**PB_T50_RS**).
* To train **Umbrella RepSurf** on ScanObjectNN:
```
sh scripts/scanobjectnn/repsurf_ssg_umb.sh
```
* To train **Umbrella RepSurf (2x setting)** on ScanObjectNN:
```
sh scripts/scanobjectnn/repsurf_ssg_umb_2x.sh
```
## Acknowledgment
We use part of the library [pointops](https://github.com/hszhao/PointWeb/tree/master/lib/pointops)
from [PointWeb](https://github.com/hszhao/PointWeb).
## License
RepSurf is under the Apache-2.0 license. Please contact the primary author **Haoxi Ran (ranhaoxi@gmail.com)** for
commercial use.
================================================
FILE: classification/dataset/ScanObjectNNDataLoader.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import h5py
import warnings
from torch.utils.data import Dataset
warnings.filterwarnings('ignore')
class ScanObjectNNDataLoader(Dataset):
    """Dataset for the ScanObjectNN h5 files (augmentedrot_scale75 variant).

    The whole split is loaded into memory up front. Each item is
    ``(points, label)`` where ``points`` is the transposed per-object point
    array of shape (3, num_points) as float32 and ``label`` an int64 scalar.
    """

    def __init__(self, root, split='training', bg=True):
        """
        :param root: directory containing the ScanObjectNN h5 folders
        :param split: 'training' or 'test'
        :param bg: if True load the variant that keeps background points
        """
        self.root = root
        assert split in ('training', 'test')

        if bg:
            print('Use data with background points')
            dir_name = 'main_split'
        else:
            print('Use data without background points')
            dir_name = 'main_split_nobg'

        suffix = '_objectdataset_augmentedrot_scale75.h5'
        h5_path = '{}/{}/{}'.format(self.root, dir_name, split + suffix)
        with h5py.File(h5_path, mode="r") as f:
            # materialize the datasets so the file handle can be closed
            self.data = f['data'][:].astype('float32')
            self.label = f['label'][:].astype('int64')
        print('The size of %s data is %d' % (split, self.data.shape[0]))

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        # transpose to channels-first (3, num_points) as the models expect
        return self.data[index].T, self.label[index]
================================================
FILE: classification/dataset/__init__.py
================================================
================================================
FILE: classification/init.sh
================================================
#!/bin/sh
# Environment bootstrap for the RepSurf classification experiments.
# Creates the log/data directories, builds a conda env (repsurf-cls) with
# PyTorch 1.6 / CUDA 10.1, and compiles the custom pointops CUDA extension.

mkdir -p log/PointAnalysis/log/ScanObjectNN
mkdir -p data/

# NOTE(review): `conda activate` inside a plain sh script only works if conda's
# shell hook has been initialised (e.g. `. "$(conda info --base)/etc/profile.d/conda.sh"`);
# confirm this in the target shell before relying on it.
conda create -n repsurf-cls python=3.7 -y
conda activate repsurf-cls

# Pinned versions matching the README (pytorch 1.6.0, cuda 10.1); h5py is
# needed by the ScanObjectNN data loader.
conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 -c pytorch -c conda-forge -y
conda install -c anaconda h5py -y

# Build & install the pointops CUDA ops used by the models.
cd modules/pointops
python3 setup.py install
cd -
================================================
FILE: classification/models/__init__.py
================================================
================================================
FILE: classification/models/repsurf/__init__.py
================================================
================================================
FILE: classification/models/repsurf/repsurf_ssg_umb.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch.nn as nn
import torch.nn.functional as F
from modules.repsurface_utils import SurfaceAbstractionCD, UmbrellaSurfaceConstructor
class Model(nn.Module):
    """Umbrella RepSurf (SSG) classifier.

    Builds a RepSurf umbrella surface constructor followed by three
    SurfaceAbstractionCD set-abstraction stages and an MLP classification
    head; outputs per-class log-probabilities.
    """

    def __init__(self, args):
        super(Model, self).__init__()
        # 0 channels when centers are dropped, otherwise xyz (3) or
        # xyz + polar coordinates (6)
        center_channel = 0 if not args.return_center else (6 if args.return_polar else 3)
        repsurf_channel = 10

        self.init_nsample = args.num_point
        self.return_dist = args.return_dist
        self.surface_constructor = UmbrellaSurfaceConstructor(args.group_size + 1, repsurf_channel,
                                                              return_dist=args.return_dist, aggr_type=args.umb_pool,
                                                              cuda=args.cuda_ops)

        self.sa1 = SurfaceAbstractionCD(npoint=512, radius=0.2, nsample=32, feat_channel=repsurf_channel,
                                        pos_channel=center_channel, mlp=[64, 64, 128], group_all=False,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)
        self.sa2 = SurfaceAbstractionCD(npoint=128, radius=0.4, nsample=64, feat_channel=128 + repsurf_channel,
                                        pos_channel=center_channel, mlp=[128, 128, 256], group_all=False,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)
        self.sa3 = SurfaceAbstractionCD(npoint=None, radius=None, nsample=None, feat_channel=256 + repsurf_channel,
                                        pos_channel=center_channel, mlp=[256, 512, 1024], group_all=True,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)

        # classification head; attribute name 'classfier' (sic) is kept for
        # pretrained state_dict compatibility
        head = [
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Dropout(0.4),
            nn.Linear(256, args.num_class),
        ]
        self.classfier = nn.Sequential(*head)

    def forward(self, points):
        """points: (B, C, N) with xyz in the first 3 channels -> (B, num_class) log-probs."""
        center = points[:, :3, :]
        normal = self.surface_constructor(center)

        feature = None
        for stage in (self.sa1, self.sa2, self.sa3):
            center, normal, feature = stage(center, normal, feature)

        logits = self.classfier(feature.view(-1, 1024))
        return F.log_softmax(logits, -1)
================================================
FILE: classification/models/repsurf/repsurf_ssg_umb_2x.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch.nn as nn
import torch.nn.functional as F
from modules.repsurface_utils import SurfaceAbstractionCD, UmbrellaSurfaceConstructor
class Model(nn.Module):
    """Umbrella RepSurf (SSG, 2x setting) classifier: a deeper/wider variant
    with four set-abstraction stages and a 2048-d global feature."""

    def __init__(self, args):
        super(Model, self).__init__()
        # 0 channels when centers are dropped, otherwise xyz (3) or xyz + polar (6)
        center_channel = 0 if not args.return_center else (6 if args.return_polar else 3)
        # width of the umbrella repsurf descriptor fed to every stage
        repsurf_channel = 10
        self.init_nsample = args.num_point
        self.return_dist = args.return_dist
        # builds the per-point umbrella surface features from raw xyz
        self.surface_constructor = UmbrellaSurfaceConstructor(args.group_size + 1, repsurf_channel,
                                                              return_dist=args.return_dist, aggr_type=args.umb_pool,
                                                              cuda=args.cuda_ops)
        # four SSG set-abstraction stages; each consumes the previous stage's
        # feature width plus the repsurf descriptor channels
        self.sa1 = SurfaceAbstractionCD(npoint=512, radius=0.1, nsample=24, feat_channel=repsurf_channel,
                                        pos_channel=center_channel, mlp=[128, 128, 256], group_all=False,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)
        self.sa2 = SurfaceAbstractionCD(npoint=128, radius=0.2, nsample=24, feat_channel=256 + repsurf_channel,
                                        pos_channel=center_channel, mlp=[256, 256, 512], group_all=False,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)
        self.sa3 = SurfaceAbstractionCD(npoint=32, radius=0.4, nsample=24, feat_channel=512 + repsurf_channel,
                                        pos_channel=center_channel, mlp=[512, 512, 1024], group_all=False,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)
        # final stage pools over all remaining points (group_all=True)
        self.sa4 = SurfaceAbstractionCD(npoint=None, radius=None, nsample=None, feat_channel=1024 + repsurf_channel,
                                        pos_channel=center_channel, mlp=[1024, 1024, 2048], group_all=True,
                                        return_polar=args.return_polar, cuda=args.cuda_ops)
        # classification head (attribute name 'classfier' (sic) kept: renaming
        # would break pretrained state_dict keys)
        self.classfier = nn.Sequential(
            nn.Linear(2048, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Dropout(0.4),
            nn.Linear(256, args.num_class))

    def forward(self, points):
        """points: (B, C, N) with xyz in the first 3 channels -> (B, num_class) log-probs."""
        # init: only xyz is taken from the input; surface features are built on the fly
        center = points[:, :3, :]
        normal = self.surface_constructor(center)
        center, normal, feature = self.sa1(center, normal, None)
        center, normal, feature = self.sa2(center, normal, feature)
        center, normal, feature = self.sa3(center, normal, feature)
        center, normal, feature = self.sa4(center, normal, feature)
        feature = feature.view(-1, 2048)
        feature = self.classfier(feature)
        feature = F.log_softmax(feature, -1)
        return feature
================================================
FILE: classification/modules/__init__.py
================================================
================================================
FILE: classification/modules/pointnet2_utils.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch
try:
from modules.pointops.functions.pointops import furthestsampling, gathering, ballquery, knnquery, \
grouping, interpolation, nearestneighbor
except:
raise Exception('Failed to load pointops')
def square_distance(src, dst):
    """
    Pairwise squared Euclidean distances between two point sets.

    Args:
        src: (B, N, C) tensor of source points
        dst: (B, M, C) tensor of target points
    Returns:
        (B, N, M) tensor where entry [b, i, j] = ||src[b, i] - dst[b, j]||^2
    """
    B, N, _ = src.shape
    M = dst.shape[1]
    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, evaluated batch-wise
    cross = torch.matmul(src, dst.transpose(1, 2))
    src_sq = torch.sum(src ** 2, dim=-1).view(B, N, 1)
    dst_sq = torch.sum(dst ** 2, dim=-1).view(B, 1, M)
    return src_sq - 2 * cross + dst_sq
def index_points(points, idx, cuda=False, is_group=False):
    """
    Gather point features by index.

    Args:
        points: (B, N, C) tensor
        idx: (B, S) or (B, S, K) index tensor into the N dimension
        cuda: dispatch to the pointops CUDA kernels instead of fancy indexing
        is_group: with cuda=True, use the grouping kernel (3-D idx) rather
            than the gathering kernel (2-D idx)
    Returns:
        (B, S, C) or (B, S, K, C) tensor of gathered points
    """
    if cuda:
        if is_group:
            grouped = grouping(points.transpose(1, 2).contiguous(), idx)
            return grouped.permute(0, 2, 3, 1).contiguous()
        gathered = gathering(points.transpose(1, 2).contiguous(), idx)
        return gathered.permute(0, 2, 1).contiguous()

    B = points.shape[0]
    # batch index broadcast to idx's shape so advanced indexing pairs each
    # sample index with its own batch
    lead_shape = [B] + [1] * (idx.dim() - 1)
    batch = torch.arange(B, dtype=torch.long, device=points.device).view(lead_shape).expand_as(idx)
    return points[batch, idx, :]
def farthest_point_sample(xyz, npoint, cuda=False):
"""
Input:
xyz: pointcloud data, [B, N, 3]
npoint: number of samples
Return:
centroids: sampled pointcloud index, [B, npoint]
FLOPs:
S * (3 + 3 + 2)
"""
if cuda:
if not xyz.is_contiguous():
xyz = xyz.contiguous()
return furthestsampling(xyz, npoint)
device = xyz.device
B, N, C = xyz.shape
centroids = torch.zeros(B, npoint, dtype=torch.long).to(device)
distance = torch.ones(B, N).to(device) * 1e10
farthest = torch.randint(0, N, (B,), dtype=torch.long).to(device)
batch_indices = torch.arange(B, dtype=torch.long).to(device)
for i in range(npoint):
centroids[:, i] = farthest
centroid = xyz[batch_indices, farthest, :].view(B, 1, 3)
dist = torch.sum((xyz - centroid) ** 2, -1)
mask = dist < distance
distance[mask] = dist[mask]
farthest = torch.max(distance, -1)[1]
return centroids
def query_ball_point(radius, nsample, xyz, new_xyz, debug=False, cuda=False):
    """
    For each query point, collect up to `nsample` neighbor indices within
    `radius`.

    Args:
        radius: ball radius
        nsample: number of neighbors returned per query point
        xyz: (B, N, 3) source points
        new_xyz: (B, S, 3) query points
        debug: if True, return (num_miss, num_over) diagnostics instead of
            indices — counts of padded slots and of in-radius points that
            did not fit into nsample
        cuda: dispatch to the pointops ballquery kernel
    Returns:
        (B, S, nsample) int64 index tensor; slots with no in-radius point are
        padded with that query's first neighbor index
    """
    if cuda:
        if not xyz.is_contiguous():
            xyz = xyz.contiguous()
        if not new_xyz.is_contiguous():
            new_xyz = new_xyz.contiguous()
        return ballquery(radius, nsample, xyz, new_xyz)

    device = xyz.device
    B, N, _ = xyz.shape
    S = new_xyz.shape[1]
    idx = torch.arange(N, dtype=torch.long, device=device).view(1, 1, N).repeat(B, S, 1)
    sqrdists = square_distance(new_xyz, xyz)
    # mark out-of-ball points with the sentinel N so they sort to the end
    idx[sqrdists > radius ** 2] = N
    idx = idx.sort(dim=-1)[0][:, :, :nsample]
    # pad empty slots with each query's first (closest-sorted) neighbor
    first = idx[:, :, :1].expand(B, S, nsample)
    pad = idx == N
    idx[pad] = first[pad]
    if debug:
        num_miss = torch.sum(pad)
        num_over = torch.sum(torch.clamp(torch.sum(sqrdists < radius ** 2, dim=2) - nsample, min=0))
        return num_miss, num_over
    return idx
def query_knn_point(k, xyz, new_xyz, cuda=False):
    """
    Indices of the k nearest neighbors in `xyz` for each point of `new_xyz`.

    Args:
        k: number of neighbors
        xyz: (B, N, 3) source points
        new_xyz: (B, S, 3) query points
        cuda: dispatch to the pointops knnquery kernel
    Returns:
        (B, S, k) index tensor, ordered by increasing distance
    """
    if cuda:
        if not xyz.is_contiguous():
            xyz = xyz.contiguous()
        if not new_xyz.is_contiguous():
            new_xyz = new_xyz.contiguous()
        return knnquery(k, xyz, new_xyz)

    sqrdists = square_distance(new_xyz, xyz)
    # ascending sort; keep the indices of the k smallest distances
    return sqrdists.sort(dim=-1)[1][:, :, :k]
def sample(nsample, feature, cuda=False):
    """
    FPS-downsample a channels-first feature map.

    Args:
        nsample: number of points to keep
        feature: (B, C, N) tensor whose first 3 channels are xyz
        cuda: forwarded to farthest_point_sample / index_points
    Returns:
        (B, C, nsample) tensor of the sampled columns
    """
    feature = feature.permute(0, 2, 1)
    coords = feature[:, :, :3]
    fps_idx = farthest_point_sample(coords, nsample, cuda=cuda)  # [B, nsample]
    torch.cuda.empty_cache()
    feature = index_points(feature, fps_idx, cuda=cuda, is_group=False)
    torch.cuda.empty_cache()
    return feature.permute(0, 2, 1)
================================================
FILE: classification/modules/pointops/__init__.py
================================================
================================================
FILE: classification/modules/pointops/functions/__init__.py
================================================
from .pointops import *
================================================
FILE: classification/modules/pointops/functions/pointops.py
================================================
from typing import Tuple
import numpy as np
import torch
from torch.autograd import Function
import torch.nn as nn
try:
import pointops_cuda
except ImportError:
import warnings
import os
from torch.utils.cpp_extension import load
warnings.warn("Unable to load pointops_cuda cpp extension.")
pointops_cuda_src = os.path.join(os.path.dirname(__file__), "../src")
pointops_cuda = load('pointops_cuda', [
pointops_cuda_src + '/pointops_api.cpp',
pointops_cuda_src + '/ballquery/ballquery_cuda.cpp',
pointops_cuda_src + '/ballquery/ballquery_cuda_kernel.cu',
pointops_cuda_src + '/knnquery/knnquery_cuda.cpp',
pointops_cuda_src + '/knnquery/knnquery_cuda_kernel.cu',
pointops_cuda_src + '/knnquery_heap/knnquery_heap_cuda.cpp',
pointops_cuda_src + '/knnquery_heap/knnquery_heap_cuda_kernel.cu',
pointops_cuda_src + '/grouping/grouping_cuda.cpp',
pointops_cuda_src + '/grouping/grouping_cuda_kernel.cu',
pointops_cuda_src + '/grouping_int/grouping_int_cuda.cpp',
pointops_cuda_src + '/grouping_int/grouping_int_cuda_kernel.cu',
pointops_cuda_src + '/interpolation/interpolation_cuda.cpp',
pointops_cuda_src + '/interpolation/interpolation_cuda_kernel.cu',
pointops_cuda_src + '/sampling/sampling_cuda.cpp',
pointops_cuda_src + '/sampling/sampling_cuda_kernel.cu'
], build_directory=pointops_cuda_src, verbose=False)
class FurthestSampling(Function):
    """Furthest point sampling backed by the pointops CUDA kernel.

    Not differentiable: backward returns no gradients.
    """

    @staticmethod
    def forward(ctx, xyz, m):
        """
        input: xyz: (b, n, 3) contiguous CUDA tensor with n > m, m: int32
        output: idx: (b, m) int32 indices of the sampled points
        """
        assert xyz.is_contiguous()
        b, n, _ = xyz.size()
        # torch.empty/torch.full on xyz.device replace the deprecated
        # torch.cuda.*Tensor constructors (which always used the *current*
        # device); the kernel writes every entry of idx.
        idx = torch.empty(b, m, dtype=torch.int32, device=xyz.device)
        # running min squared distances, initialised to a large sentinel
        temp = torch.full((b, n), 1e10, dtype=torch.float32, device=xyz.device)
        pointops_cuda.furthestsampling_cuda(b, n, m, xyz, temp, idx)
        return idx

    @staticmethod
    def backward(ctx, grad_idx=None):
        # sampling indices are not differentiable w.r.t. (xyz, m)
        return None, None


furthestsampling = FurthestSampling.apply
class Gathering(Function):
    """Gather feature columns by index: (b, c, n) x (b, m) -> (b, c, m)."""

    @staticmethod
    def forward(ctx, features, idx):
        """
        input: features: (b, c, n), idx : (b, m) tensor
        output: (b, c, m)
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()
        batch, channels, num_points = features.size()
        num_out = idx.size(1)
        gathered = torch.cuda.FloatTensor(batch, channels, num_out)
        pointops_cuda.gathering_forward_cuda(batch, channels, num_points, num_out, features, idx, gathered)
        # Backward needs the gather positions plus the source geometry.
        ctx.for_backwards = (idx, channels, num_points)
        return gathered

    @staticmethod
    def backward(ctx, grad_out):
        idx, c, n = ctx.for_backwards
        b, m = idx.size()
        # Zero-initialised: the kernel scatters gradients back additively.
        grad_features = torch.cuda.FloatTensor(b, c, n).zero_()
        contiguous_grad = grad_out.data.contiguous()
        pointops_cuda.gathering_backward_cuda(b, c, n, m, contiguous_grad, idx, grad_features.data)
        return grad_features, None


gathering = Gathering.apply
class NearestNeighbor(Function):
    """Three-nearest-neighbour search from `unknown` points into `known` points."""

    @staticmethod
    def forward(ctx, unknown: torch.Tensor, known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Find the three nearest neighbors of unknown in known
        input: unknown: (b, n, 3), known: (b, m, 3)
        output: dist2: (b, n, 3) l2 distance to the three nearest neighbors
                idx: (b, n, 3) index of 3 nearest neighbors
        """
        assert unknown.is_contiguous()
        assert known.is_contiguous()
        batch, num_unknown, _ = unknown.size()
        num_known = known.size(1)
        sq_dist = torch.cuda.FloatTensor(batch, num_unknown, 3)
        nn_idx = torch.cuda.IntTensor(batch, num_unknown, 3)
        pointops_cuda.nearestneighbor_cuda(batch, num_unknown, num_known, unknown, known, sq_dist, nn_idx)
        # The kernel writes squared distances; callers receive euclidean ones.
        return torch.sqrt(sq_dist), nn_idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        # Neighbour search is not differentiable.
        return None, None


nearestneighbor = NearestNeighbor.apply
class Interpolation(Function):
    # Weighted 3-NN feature interpolation backed by pointops_cuda kernels.

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
        """
        Performs weight linear interpolation on 3 features
        input: features: (b, c, m) features descriptors to be interpolated from
               idx: (b, n, 3) three nearest neighbors of the target features in features
               weight: (b, n, 3) weights
        output: (b, c, n) tensor of the interpolated features
        """
        features = features.contiguous()
        assert features.is_contiguous()
        assert idx.is_contiguous()
        assert weight.is_contiguous()
        b, c, m = features.size()
        n = idx.size(1)
        # Saved so backward can scatter gradients back to the m source points.
        ctx.interpolation_for_backward = (idx, weight, m)
        output = torch.cuda.FloatTensor(b, c, n)
        pointops_cuda.interpolation_forward_cuda(b, c, m, n, features, idx, weight, output)
        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        input: grad_out: (b, c, n)
        output: grad_features: (b, c, m), None, None
        """
        idx, weight, m = ctx.interpolation_for_backward
        b, c, n = grad_out.size()
        # Must start zeroed: the backward kernel accumulates with atomicAdd.
        grad_features = torch.cuda.FloatTensor(b, c, m).zero_()
        grad_out_data = grad_out.data.contiguous()
        pointops_cuda.interpolation_backward_cuda(b, c, n, m, grad_out_data, idx, weight, grad_features.data)
        return grad_features, None, None


interpolation = Interpolation.apply
class Grouping(Function):
    # Differentiable neighbour grouping backed by pointops_cuda kernels.

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """
        input: features: (b, c, n), idx : (b, m, nsample) containing the indicies of features to group with
        output: (b, c, m, nsample)
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()
        b, c, n = features.size()
        _, m, nsample = idx.size()
        output = torch.cuda.FloatTensor(b, c, m, nsample)
        pointops_cuda.grouping_forward_cuda(b, c, n, m, nsample, features, idx, output)
        # Save the gather indices and the source length for the backward scatter.
        ctx.for_backwards = (idx, n)
        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        input: grad_out: (b, c, m, nsample)
        output: (b, c, n), None
        """
        idx, n = ctx.for_backwards
        b, c, m, nsample = grad_out.size()
        # Zero-init is required: the backward kernel accumulates with atomicAdd.
        grad_features = torch.cuda.FloatTensor(b, c, n).zero_()
        grad_out_data = grad_out.data.contiguous()
        pointops_cuda.grouping_backward_cuda(b, c, n, m, nsample, grad_out_data, idx, grad_features.data)
        return grad_features, None


grouping = Grouping.apply
class GroupingInt(Function):
    """Group integer (long) features by neighbour index; no gradient."""

    @staticmethod
    def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
        """
        input: features: (b, c, n), idx : (b, m, nsample) containing the indicies of features to group with
        output: (b, c, m, nsample)
        """
        assert features.is_contiguous()
        assert idx.is_contiguous()
        batch, channels, num_points = features.size()
        _, num_centers, nsample = idx.size()
        grouped = torch.cuda.LongTensor(batch, channels, num_centers, nsample)
        pointops_cuda.grouping_int_forward_cuda(batch, channels, num_points, num_centers, nsample, features, idx, grouped)
        return grouped

    @staticmethod
    def backward(ctx, a=None):
        # Integer grouping is not differentiable.
        return None, None


grouping_int = GroupingInt.apply
class BallQuery(Function):
    """Fixed-radius neighbourhood query backed by the pointops CUDA kernel."""

    @staticmethod
    def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor:
        """
        input: radius: float, radius of the balls
               nsample: int, maximum number of features in the balls
               xyz: torch.Tensor, (b, n, 3) xyz coordinates of the features
               new_xyz: torch.Tensor, (b, m, 3) centers of the ball query
        output: (b, m, nsample) tensor with the indicies of the features that form the query balls
        """
        assert xyz.is_contiguous()
        assert new_xyz.is_contiguous()
        batch, num_points, _ = xyz.size()
        num_centers = new_xyz.size(1)
        ball_idx = torch.cuda.IntTensor(batch, num_centers, nsample).zero_()
        pointops_cuda.ballquery_cuda(batch, num_points, num_centers, radius, nsample, new_xyz, xyz, ball_idx)
        return ball_idx

    @staticmethod
    def backward(ctx, a=None):
        # Neighbourhood indices carry no gradient.
        return None, None, None, None


ballquery = BallQuery.apply
def pairwise_distances(x, y=None):
    '''
    Input: x is a Nxd matrix
           y is an optional Mxd matirx
    Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:]
            if y is not given then use 'y=x'.
            i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2

    Uses the expansion ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b; the final clamp
    guards against tiny negative values from floating-point cancellation.
    '''
    x_norm = (x ** 2).sum(1).view(-1, 1)
    if y is not None:
        y_t = torch.transpose(y, 0, 1)
        y_norm = (y ** 2).sum(1).view(1, -1)
    else:
        y_t = torch.transpose(x, 0, 1)
        y_norm = x_norm.view(1, -1)
    dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t)
    # Fixed: dropped a redundant function-local `import numpy as np` (numpy is
    # already imported at module level) and clamp with torch alone — no upper
    # bound is needed, only the lower bound at zero.
    return torch.clamp(dist, min=0.0)
class KNNQueryNaive(Function):
    """Brute-force kNN in pure torch (reference implementation)."""

    @staticmethod
    def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor = None) -> Tuple[torch.Tensor]:
        """
        KNN Indexing
        input: nsample: int32, Number of neighbor
               xyz: (b, n, 3) coordinates of the features
               new_xyz: (b, m, 3) centriods
        output: idx: (b, m, nsample)
        """
        # Default: query every point against the full set.
        if new_xyz is None:
            new_xyz = xyz
        b, m, _ = new_xyz.size()
        n = xyz.size(1)
        # Broadcasted pairwise squared distances: (b, m, n).
        delta = new_xyz.unsqueeze(2) - xyz.unsqueeze(1)
        sq_dist = delta.pow(2).sum(dim=3)
        # Full sort along the candidate axis, then keep the nsample closest.
        order = torch.argsort(sq_dist, dim=2)
        return order[:, :, :nsample].int()

    @staticmethod
    def backward(ctx):
        # kNN indices are not differentiable.
        return None, None, None


knnquery_naive = KNNQueryNaive.apply
class KNNQuery(Function):
    # Exact kNN via the brute-force CUDA kernel; returns indices only.

    @staticmethod
    def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor = None) -> Tuple[torch.Tensor]:
        """
        KNN Indexing
        input: nsample: int32, Number of neighbor
               xyz: (b, n, 3) coordinates of the features
               new_xyz: (b, m, 3) centriods
        output: idx: (b, m, nsample)
             ( dist2: (b, m, nsample) )
        """
        # Default: query each point against the full set.
        if new_xyz is None:
            new_xyz = xyz
        xyz = xyz.contiguous()
        new_xyz = new_xyz.contiguous()
        assert xyz.is_contiguous()
        assert new_xyz.is_contiguous()
        b, m, _ = new_xyz.size()
        n = xyz.size(1)
        idx = torch.cuda.IntTensor(b, m, nsample).zero_()
        # dist2 is filled by the kernel but intentionally not returned.
        dist2 = torch.cuda.FloatTensor(b, m, nsample).zero_()
        pointops_cuda.knnquery_cuda(b, n, m, nsample, xyz, new_xyz, idx, dist2)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        # kNN indices are not differentiable.
        return None, None, None


knnquery = KNNQuery.apply
class KNNQuery_Heap(Function):
    """Exact kNN via the heap-based CUDA kernel; returns indices only."""

    @staticmethod
    def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor = None) -> Tuple[torch.Tensor]:
        """
        KNN Indexing
        input: nsample: int32, Number of neighbor
               xyz: (b, n, 3) coordinates of the features
               new_xyz: (b, m, 3) centriods
        output: idx: (b, m, nsample)
             ( dist2: (b, m, nsample) )
        """
        # Default: query each point against the full set.
        if new_xyz is None:
            new_xyz = xyz
        assert xyz.is_contiguous()
        assert new_xyz.is_contiguous()
        batch, num_centers, _ = new_xyz.size()
        num_points = xyz.size(1)
        neighbor_idx = torch.cuda.IntTensor(batch, num_centers, nsample).zero_()
        # Filled by the kernel but not returned to callers.
        neighbor_dist2 = torch.cuda.FloatTensor(batch, num_centers, nsample).zero_()
        pointops_cuda.knnquery_heap_cuda(batch, num_points, num_centers, nsample, xyz, new_xyz, neighbor_idx, neighbor_dist2)
        ctx.mark_non_differentiable(neighbor_idx)
        return neighbor_idx

    @staticmethod
    def backward(ctx, a=None):
        return None, None, None


knnquery_heap = KNNQuery_Heap.apply
class QueryAndGroup(nn.Module):
    """
    Groups with a ball query of radius
    parameters:
        radius: float32, Radius of ball
        nsample: int32, Maximum number of features to gather in the ball
    """

    def __init__(self, radius=None, nsample=32, use_xyz=True, return_idx=False):
        super(QueryAndGroup, self).__init__()
        # radius=None switches forward() from ball query to kNN grouping.
        self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
        self.return_idx = return_idx

    def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor = None, features: torch.Tensor = None,
                idx: torch.Tensor = None) -> torch.Tensor:
        """
        input: xyz: (b, n, 3) coordinates of the features
               new_xyz: (b, m, 3) centriods
               features: (b, c, n)
               idx: idx of neighbors
               # idxs: (b, n)
        output: new_features: (b, c+3, m, nsample)
              # grouped_idxs: (b, m, nsample)
        """
        if new_xyz is None:
            new_xyz = xyz
        # Compute neighbour indices only if the caller did not supply them.
        if idx is None:
            if self.radius is not None:
                # Fixed-radius ball query.
                idx = ballquery(self.radius, self.nsample, xyz, new_xyz)
            else:
                # idx = knnquery_naive(self.nsample, xyz, new_xyz)  # (b, m, nsample)
                # idx = knnquery(self.nsample, xyz, new_xyz)  # (b, m, nsample)
                idx = knnquery_heap(self.nsample, xyz, new_xyz)  # (b, m, nsample)
        xyz_trans = xyz.transpose(1, 2).contiguous()
        grouped_xyz = grouping(xyz_trans, idx)  # (b, 3, m, nsample)
        # grouped_idxs = grouping(idxs.unsqueeze(1).float(), idx).squeeze(1).int()  # (b, m, nsample)
        # Recenter each neighbourhood on its query point.
        grouped_xyz_diff = grouped_xyz - new_xyz.transpose(1, 2).unsqueeze(-1)
        if features is not None:
            grouped_features = grouping(features, idx)
            if self.use_xyz:
                # Prepend relative coordinates to the grouped features.
                new_features = torch.cat([grouped_xyz_diff, grouped_features], dim=1)  # (b, 3+c, m, nsample)
            else:
                new_features = grouped_features
        else:
            assert self.use_xyz, "Cannot have not features and not use xyz as a feature!"
            new_features = grouped_xyz_diff
        if self.return_idx:
            return new_features, grouped_xyz, idx.long()
            # (b,c,m,k), (b,3,m,k), (b,m,k)
        else:
            return new_features, grouped_xyz
================================================
FILE: classification/modules/pointops/setup.py
================================================
# python3 setup.py install
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
import os
from distutils.sysconfig import get_config_vars

# Strip '-Wstrict-prototypes' from the inherited compile flags: it is a C-only
# option and causes noise/errors when these sources are compiled as C++.
(opt,) = get_config_vars('OPT')
os.environ['OPT'] = " ".join(
    flag for flag in opt.split() if flag != '-Wstrict-prototypes'
)

# Build all pointops C++/CUDA sources into a single 'pointops_cuda' module.
setup(
    name='pointops',
    ext_modules=[
        CUDAExtension('pointops_cuda', [
            'src/pointops_api.cpp',
            'src/ballquery/ballquery_cuda.cpp',
            'src/ballquery/ballquery_cuda_kernel.cu',
            'src/knnquery/knnquery_cuda.cpp',
            'src/knnquery/knnquery_cuda_kernel.cu',
            'src/knnquery_heap/knnquery_heap_cuda.cpp',
            'src/knnquery_heap/knnquery_heap_cuda_kernel.cu',
            'src/grouping/grouping_cuda.cpp',
            'src/grouping/grouping_cuda_kernel.cu',
            'src/grouping_int/grouping_int_cuda.cpp',
            'src/grouping_int/grouping_int_cuda_kernel.cu',
            'src/interpolation/interpolation_cuda.cpp',
            'src/interpolation/interpolation_cuda_kernel.cu',
            'src/sampling/sampling_cuda.cpp',
            'src/sampling/sampling_cuda_kernel.cu',
        ],
                      extra_compile_args={'cxx': ['-g'],
                                          'nvcc': ['-O2']})
    ],
    cmdclass={'build_ext': BuildExtension})
================================================
FILE: classification/modules/pointops/src/__init__.py
================================================
================================================
FILE: classification/modules/pointops/src/ballquery/ballquery_cuda.cpp
================================================
#include
#include
#include
#include
#include "ballquery_cuda_kernel.h"
extern THCState *state;
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor)
{
const float *new_xyz = new_xyz_tensor.data_ptr();
const float *xyz = xyz_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
ballquery_cuda_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx);
}
void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor)
{
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data_ptr();
const float *xyz = xyz_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
ballquery_cuda_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream);
}
================================================
FILE: classification/modules/pointops/src/ballquery/ballquery_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "ballquery_cuda_kernel.h"
// input: new_xyz(b, m, 3) xyz(b, n, 3)
// output: idx(b, m, nsample)
// input: new_xyz(b, m, 3) xyz(b, n, 3)
// output: idx(b, m, nsample)
// Launch assumption: one block per batch element; a 1-D block strides over the m centers.
__global__ void ballquery_cuda_kernel(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx)
{
    // Shift pointers to this batch element's slice.
    int batch_index = blockIdx.x;
    xyz += batch_index * n * 3;
    new_xyz += batch_index * m * 3;
    idx += m * nsample * batch_index;
    int index = threadIdx.x;
    int stride = blockDim.x;
    // Compare squared distances to avoid a sqrt per candidate.
    float radius2 = radius * radius;
    for (int j = index; j < m; j += stride)
    {
        float new_x = new_xyz[j * 3 + 0];
        float new_y = new_xyz[j * 3 + 1];
        float new_z = new_xyz[j * 3 + 2];
        // Scan candidates in order until nsample neighbours are collected.
        for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k)
        {
            float x = xyz[k * 3 + 0];
            float y = xyz[k * 3 + 1];
            float z = xyz[k * 3 + 2];
            float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
            if (d2 < radius2)
            {
                if (cnt == 0)
                {
                    // First hit pre-fills every slot so that balls with fewer
                    // than nsample points still contain valid indices.
                    for (int l = 0; l < nsample; ++l)
                        idx[j * nsample + l] = k;
                }
                idx[j * nsample + cnt] = k;
                ++cnt;
            }
        }
    }
}
// Host launcher: one block per batch element; a 1-D block strides over the m centers.
void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx)
{
    // Fixed: the <<<...>>> execution configuration was garbled in extraction
    // ("<<>>", a compile error). Reconstructed to match the kernel's
    // one-block-per-batch, thread-strided-over-m layout.
    // NOTE(review): confirm against upstream pointops sources.
    ballquery_cuda_kernel<<<b, opt_n_threads(m)>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
}
// input: new_xyz(b, m, 3) xyz(b, n, 3)
// output: idx(b, m, nsample)
// Launch assumption: grid (ceil(m / blockDim.x), b); one thread per query center.
__global__ void ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail.
    if (bs_idx >= b || pt_idx >= m) return;
    // Shift pointers to this (batch, center) slice.
    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;
    // Compare squared distances to avoid a sqrt per candidate.
    float radius2 = radius * radius;
    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];
    int cnt = 0;
    for (int k = 0; k < n; ++k) {
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        if (d2 < radius2){
            if (cnt == 0){
                // First hit pre-fills all slots so short neighbourhoods stay valid.
                for (int l = 0; l < nsample; ++l) {
                    idx[l] = k;
                }
            }
            idx[cnt] = k;
            ++cnt;
            // Stop once the ball is full.
            if (cnt >= nsample){
                break;
            }
        }
    }
}
// Host launcher for the fast kernel: 2-D grid (centers x batch) on the caller's stream.
void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) {
    // param new_xyz: (B, m, 3)
    // param xyz: (B, n, 3)
    // param idx: (B, m, nsample)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);
    // Fixed: the <<<...>>> launch config was garbled in extraction; restored
    // with no dynamic shared memory on the caller-provided stream.
    // NOTE(review): confirm against upstream pointops sources.
    ballquery_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
    // cudaDeviceSynchronize(); // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
================================================
FILE: classification/modules/pointops/src/ballquery/ballquery_cuda_kernel.h
================================================
#ifndef _BALLQUERY_CUDA_KERNEL
#define _BALLQUERY_CUDA_KERNEL
#include
#include
#include
void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *xyz, const float *new_xyz, int *idx);
void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/pointops/src/cuda_utils.h
================================================
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H
#include
#define TOTAL_THREADS 1024
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
inline int opt_n_threads(int work_size) {
const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0);
return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
inline dim3 opt_block_config(int x, int y) {
const int x_threads = opt_n_threads(x);
const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
dim3 block_config(x_threads, y_threads, 1);
return block_config;
}
#endif
================================================
FILE: classification/modules/pointops/src/grouping/grouping_cuda.cpp
================================================
#include
#include
#include
#include
#include "grouping_cuda_kernel.h"
extern THCState *state;
void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
const float *points = points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
float *out = out_tensor.data_ptr();
grouping_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
}
void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
{
float *grad_points = grad_points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
const float *grad_out = grad_out_tensor.data_ptr();
grouping_backward_cuda_launcher(b, c, n, m, nsample, grad_out, idx, grad_points);
}
void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
float *out = out_tensor.data_ptr();
grouping_forward_cuda_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
}
================================================
FILE: classification/modules/pointops/src/grouping/grouping_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "grouping_cuda_kernel.h"
// input: points(b, c, n) idx(b, m, nsample)
// output: out(b, c, m, nsample)
// input: points(b, c, n) idx(b, m, nsample)
// output: out(b, c, m, nsample)
// Launch assumption: one block per batch element; a 2-D block strides over c*m work items.
__global__ void grouping_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out)
{
    // Shift pointers to this batch element's slice.
    int batch_index = blockIdx.x;
    points += batch_index * n * c;
    idx += batch_index * m * nsample;
    out += batch_index * m * nsample * c;
    // Flatten the 2-D block into one strided index over (channel, center) pairs.
    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * m; i += stride)
    {
        const int l = i / m;  // channel
        const int j = i % m;  // center
        for (int k = 0; k < nsample; ++k)
        {
            int ii = idx[j * nsample + k];
            out[(l * m + j) * nsample + k] = points[l * n + ii];
        }
    }
}
// input: grad_out(b, c, m, nsample), idx(b, m, nsample)
// output: grad_points(b, c, n)
// input: grad_out(b, c, m, nsample), idx(b, m, nsample)
// output: grad_points(b, c, n)
// Launch assumption: one block per batch element; a 2-D block strides over c*m work items.
__global__ void grouping_backward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points)
{
    // Shift pointers to this batch element's slice.
    int batch_index = blockIdx.x;
    grad_out += batch_index * m * nsample * c;
    idx += batch_index * m * nsample;
    grad_points += batch_index * n * c;
    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * m; i += stride)
    {
        const int l = i / m;  // channel
        const int j = i % m;  // center
        for (int k = 0; k < nsample; ++k)
        {
            int ii = idx[j * nsample + k];
            // atomicAdd: several neighbourhoods may reference the same source
            // point, so accumulation must be race-free. grad_points must be
            // zero-initialised by the caller.
            atomicAdd(grad_points + l * n + ii, grad_out[(l * m + j) * nsample + k]);
        }
    }
}
// Host launcher: one block per batch element, 2-D block covering (m centers, c channels).
void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out)
{
    // Fixed: the <<<...>>> launch config was garbled in extraction;
    // reconstructed to match the kernel's threadIdx.y/threadIdx.x layout.
    // NOTE(review): confirm against upstream pointops sources.
    grouping_forward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, points, idx, out);
}
// Host launcher: mirrors the forward launch layout (one block per batch, 2-D block).
void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points)
{
    // Fixed: the <<<...>>> launch config was garbled in extraction;
    // reconstructed to match the kernel's threadIdx.y/threadIdx.x layout.
    // NOTE(review): confirm against upstream pointops sources.
    grouping_backward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, grad_out, idx, grad_points);
}
// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
// Launch assumption: grid (ceil(npoints*nsample / blockDim.x), c, b);
// one thread copies exactly one output element.
__global__ void grouping_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    int bs_idx = blockIdx.z;  // batch
    int c_idx = blockIdx.y;   // channel
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int pt_idx = index / nsample;
    // Guard the grid tail.
    if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
    int sample_idx = index % nsample;
    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    out[out_idx] = points[in_idx];
}
// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out) {
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
grouping_forward_cuda_kernel_fast<<>>(b, c, n, npoints, nsample, points, idx, out);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
================================================
FILE: classification/modules/pointops/src/grouping/grouping_cuda_kernel.h
================================================
#ifndef _GROUPING_CUDA_KERNEL
#define _GROUPING_CUDA_KERNEL
#include
#include
#include
void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out);
void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points);
void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/pointops/src/grouping_int/grouping_int_cuda.cpp
================================================
#include
#include
#include
#include
#include "grouping_int_cuda_kernel.h"
extern THCState *state;
void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
const long int *points = points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
long int *out = out_tensor.data_ptr();
grouping_int_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
}
void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
const long int *points = points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
long int *out = out_tensor.data_ptr();
grouping_int_forward_cuda_launcher_fast(b, c, n, m, nsample, points, idx, out);
}
================================================
FILE: classification/modules/pointops/src/grouping_int/grouping_int_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "grouping_int_cuda_kernel.h"
// input: points(b, c, n) idx(b, m, nsample)
// output: out(b, c, m, nsample)
// input: points(b, c, n) idx(b, m, nsample)
// output: out(b, c, m, nsample)
// Long-integer twin of grouping_forward_cuda_kernel: one block per batch
// element, 2-D block striding over c*m (channel, center) pairs.
__global__ void grouping_int_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out)
{
    // Shift pointers to this batch element's slice.
    int batch_index = blockIdx.x;
    points += batch_index * n * c;
    idx += batch_index * m * nsample;
    out += batch_index * m * nsample * c;
    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * m; i += stride)
    {
        const int l = i / m;  // channel
        const int j = i % m;  // center
        for (int k = 0; k < nsample; ++k)
        {
            int ii = idx[j * nsample + k];
            out[(l * m + j) * nsample + k] = points[l * n + ii];
        }
    }
}
// Host launcher: one block per batch element, 2-D block covering (m centers, c channels).
void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out)
{
    // Fixed: the <<<...>>> launch config was garbled in extraction;
    // reconstructed to match the kernel's threadIdx.y/threadIdx.x layout.
    // NOTE(review): confirm against upstream pointops sources.
    grouping_int_forward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, points, idx, out);
}
// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
// Launch assumption: grid (ceil(npoints*nsample / blockDim.x), c, b);
// one thread copies exactly one output element.
__global__ void grouping_int_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const long int *__restrict__ points, const int *__restrict__ idx, long int *__restrict__ out)
{
    int bs_idx = blockIdx.z;  // batch
    int c_idx = blockIdx.y;   // channel
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int pt_idx = index / nsample;
    // Guard the grid tail.
    if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
    int sample_idx = index % nsample;
    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    out[out_idx] = points[in_idx];
}
// Host launcher: 3-D grid (elements, channels, batch) on the default stream.
void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out)
{
    cudaError_t err;
    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);
    // Fixed: the <<<...>>> launch config was garbled in extraction.
    // NOTE(review): confirm against upstream pointops sources.
    grouping_int_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
    // cudaDeviceSynchronize(); // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
================================================
FILE: classification/modules/pointops/src/grouping_int/grouping_int_cuda_kernel.h
================================================
#ifndef _GROUPING_INT_CUDA_KERNEL
#define _GROUPING_INT_CUDA_KERNEL
#include
#include
#include
void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out);
void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/pointops/src/interpolation/interpolation_cuda.cpp
================================================
#include
#include
#include
#include
#include "interpolation_cuda_kernel.h"
extern THCState *state;
void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor)
{
const float *unknown = unknown_tensor.data_ptr();
const float *known = known_tensor.data_ptr();
float *dist2 = dist2_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
nearestneighbor_cuda_launcher(b, n, m, unknown, known, dist2, idx);
}
void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor)
{
const float *points = points_tensor.data_ptr();
const float *weight = weight_tensor.data_ptr();
float *out = out_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
interpolation_forward_cuda_launcher(b, c, m, n, points, idx, weight, out);
}
void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor)
{
const float *grad_out = grad_out_tensor.data_ptr();
const float *weight = weight_tensor.data_ptr();
float *grad_points = grad_points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
interpolation_backward_cuda_launcher(b, c, n, m, grad_out, idx, weight, grad_points);
}
void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
const float *unknown = unknown_tensor.data_ptr();
const float *known = known_tensor.data_ptr();
float *dist2 = dist2_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
nearestneighbor_cuda_launcher_fast(b, n, m, unknown, known, dist2, idx);
}
void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr();
const float *weight = weight_tensor.data_ptr();
float *out = out_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
interpolation_forward_cuda_launcher_fast(b, c, m, n, points, idx, weight, out);
}
================================================
FILE: classification/modules/pointops/src/interpolation/interpolation_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "interpolation_cuda_kernel.h"
// input: unknown(b, n, 3) known(b, m, 3)
// output: dist2(b, n, 3), idx(b, n, 3)
// input: unknown(b, n, 3) known(b, m, 3)
// output: dist2(b, n, 3), idx(b, n, 3)
// Launch assumption: one block per batch element; threads stride over the n queries.
__global__ void nearestneighbor_cuda_kernel(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx)
{
    // Shift pointers to this batch element's slice.
    int batch_index = blockIdx.x;
    unknown += batch_index * n * 3;
    known += batch_index * m * 3;
    dist2 += batch_index * n * 3;
    idx += batch_index * n * 3;
    int index = threadIdx.x;
    int stride = blockDim.x;
    for (int j = index; j < n; j += stride)
    {
        float ux = unknown[j * 3 + 0];
        float uy = unknown[j * 3 + 1];
        float uz = unknown[j * 3 + 2];
        // Track the three smallest squared distances (best1 <= best2 <= best3),
        // seeded far beyond any real distance.
        double best1 = 1e40, best2 = 1e40, best3 = 1e40;
        int besti1 = 0, besti2 = 0, besti3 = 0;
        for (int k = 0; k < m; ++k)
        {
            float x = known[k * 3 + 0];
            float y = known[k * 3 + 1];
            float z = known[k * 3 + 2];
            float d =
                (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
            if (d < best1)
            {
                // New overall best: shift the previous top two down.
                best3 = best2;
                besti3 = besti2;
                best2 = best1;
                besti2 = besti1;
                best1 = d;
                besti1 = k;
            }
            else if (d < best2)
            {
                // New second best: shift the old second into third.
                best3 = best2;
                besti3 = besti2;
                best2 = d;
                besti2 = k;
            }
            else if (d < best3)
            {
                best3 = d;
                besti3 = k;
            }
        }
        // dist2 holds *squared* distances; the Python wrapper applies sqrt.
        dist2[j * 3 + 0] = best1;
        dist2[j * 3 + 1] = best2;
        dist2[j * 3 + 2] = best3;
        idx[j * 3 + 0] = besti1;
        idx[j * 3 + 1] = besti2;
        idx[j * 3 + 2] = besti3;
    }
}
// Three-point weighted feature interpolation, one block per batch element.
// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
// output: out(b, c, n)
__global__ void interpolation_forward_cuda_kernel(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out)
{
    int batch_index = blockIdx.x;
    // Shift pointers to this batch element's slice.
    points += batch_index * m * c;
    idx += batch_index * n * 3;
    weight += batch_index * n * 3;
    out += batch_index * n * c;

    // Flatten (channel, point) into one index; the 2D thread block strides it.
    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * n; i += stride)
    {
        const int l = i / n;  // channel
        const int j = i % n;  // output point
        float w1 = weight[j * 3 + 0];
        float w2 = weight[j * 3 + 1];
        float w3 = weight[j * 3 + 2];
        int i1 = idx[j * 3 + 0];
        int i2 = idx[j * 3 + 1];
        int i3 = idx[j * 3 + 2];
        // Weighted sum of the three neighbour features for channel l.
        out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + points[l * m + i3] * w3;
    }
}
// Backward pass of three-point interpolation: scatter-add each output
// gradient to its three source points, scaled by the same weights.
// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3)
// output: grad_points(b, c, m)
__global__ void interpolation_backward_cuda_kernel( int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points)
{
    int batch_index = blockIdx.x;
    // Shift pointers to this batch element's slice.
    grad_out += batch_index * n * c;
    idx += batch_index * n * 3;
    weight += batch_index * n * 3;
    grad_points += batch_index * m * c;

    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * n; i += stride)
    {
        const int l = i / n;  // channel
        const int j = i % n;  // output point
        float w1 = weight[j * 3 + 0];
        float w2 = weight[j * 3 + 1];
        float w3 = weight[j * 3 + 2];
        int i1 = idx[j * 3 + 0];
        int i2 = idx[j * 3 + 1];
        int i3 = idx[j * 3 + 2];
        // Multiple outputs may reference the same source point, so the
        // accumulation into grad_points must be atomic.
        atomicAdd(grad_points + l * m + i1, grad_out[i] * w1);
        atomicAdd(grad_points + l * m + i2, grad_out[i] * w2);
        atomicAdd(grad_points + l * m + i3, grad_out[i] * w3);
    }
}
// Launch the slow-path 3-NN kernel: one block per batch element, thread
// count derived from n (opt_n_threads from ../cuda_utils.h).
// FIX: the execution configuration between <<< >>> was missing, which does
// not compile; restored.
void nearestneighbor_cuda_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx)
{
    nearestneighbor_cuda_kernel<<<b, opt_n_threads(n)>>>(b, n, m, unknown, known, dist2, idx);
}
// Launch the slow-path interpolation forward kernel: one block per batch
// element with a 2D block shaped for (n, c) work (opt_block_config from
// ../cuda_utils.h).
// FIX: the execution configuration between <<< >>> was missing; restored.
void interpolation_forward_cuda_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out)
{
    interpolation_forward_cuda_kernel<<<b, opt_block_config(n, c)>>>(b, c, m, n, points, idx, weight, out);
}
// Launch the slow-path interpolation backward kernel.
// FIX 1: the execution configuration between <<< >>> was missing; restored.
// FIX 2: the parameter names were ordered (b, n, c, m) while the header
// (interpolation_cuda_kernel.h) declares (b, c, n, m); the two swaps
// cancelled out at the kernel call, but the names were misleading.
// Normalised to the header's order — positional callers are unaffected.
void interpolation_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points)
{
    interpolation_backward_cuda_kernel<<<b, opt_block_config(n, c)>>>(b, c, n, m, grad_out, idx, weight, grad_points);
}
// Fast 3-NN search: one thread per query point, 2D grid
// (blockIdx.x: point tiles, blockIdx.y: batch).
// input: unknown(b, n, 3) known(b, m, 3)
// output: dist2(b, n, 3), idx(b, n, 3) — ascending squared distances and
// the matching indices into `known`.
__global__ void nearestneighbor_cuda_kernel_fast(int b, int n, int m, const float *__restrict__ unknown, const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= n) return;  // guard the grid tail

    // Shift pointers to this (batch, query point) slice.
    unknown += bs_idx * n * 3 + pt_idx * 3;
    known += bs_idx * m * 3;
    dist2 += bs_idx * n * 3 + pt_idx * 3;
    idx += bs_idx * n * 3 + pt_idx * 3;

    float ux = unknown[0];
    float uy = unknown[1];
    float uz = unknown[2];

    // Running top-3 (best1 <= best2 <= best3), seeded with a huge sentinel.
    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
    int besti1 = 0, besti2 = 0, besti3 = 0;
    for (int k = 0; k < m; ++k) {
        float x = known[k * 3 + 0];
        float y = known[k * 3 + 1];
        float z = known[k * 3 + 2];
        float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
        // Insert into the sorted top-3, shifting worse entries down.
        if (d < best1) {
            best3 = best2; besti3 = besti2;
            best2 = best1; besti2 = besti1;
            best1 = d; besti1 = k;
        }
        else if (d < best2) {
            best3 = best2; besti3 = besti2;
            best2 = d; besti2 = k;
        }
        else if (d < best3) {
            best3 = d; besti3 = k;
        }
    }
    dist2[0] = best1;
    dist2[1] = best2;
    dist2[2] = best3;
    idx[0] = besti1;
    idx[1] = besti2;
    idx[2] = besti3;
}
// Fast interpolation forward: one thread per output element, 3D grid
// (blockIdx.x: point tiles, blockIdx.y: channel, blockIdx.z: batch).
// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
// output: out(b, c, n)
__global__ void interpolation_forward_cuda_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points, const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;  // guard grid tail

    // Shift pointers to this (batch, channel, point) slice.
    weight += bs_idx * n * 3 + pt_idx * 3;
    points += bs_idx * c * m + c_idx * m;
    idx += bs_idx * n * 3 + pt_idx * 3;
    out += bs_idx * c * n + c_idx * n;

    // Weighted sum of the three neighbour features.
    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
}
// Launch the fast 3-NN kernel: n split across blockIdx.x, batch on
// blockIdx.y, matching the kernel's indexing.
// FIX: the execution configuration between <<< >>> was missing; restored.
void nearestneighbor_cuda_launcher_fast(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx)
{
    cudaError_t err;
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    nearestneighbor_cuda_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
    // Surface launch-configuration errors immediately.
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
// Launch the fast interpolation forward kernel: 3D grid (points, channels,
// batch) matching the kernel's blockIdx usage.
// FIX: the execution configuration between <<< >>> was missing; restored.
void interpolation_forward_cuda_launcher_fast(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out) {
    cudaError_t err;
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    interpolation_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
    // Surface launch-configuration errors immediately.
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n",
                cudaGetErrorString(err));
        exit(-1);
    }
}
================================================
FILE: classification/modules/pointops/src/interpolation/interpolation_cuda_kernel.h
================================================
#ifndef _INTERPOLATION_CUDA_KERNEL
#define _INTERPOLATION_CUDA_KERNEL
#include
#include
#include
void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void nearestneighbor_cuda_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
void interpolation_forward_cuda_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
void interpolation_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points);
void nearestneighbor_cuda_launcher_fast(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
void interpolation_forward_cuda_launcher_fast(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/pointops/src/knnquery/__init__.py
================================================
================================================
FILE: classification/modules/pointops/src/knnquery/knnquery_cuda.cpp
================================================
#include
#include
#include
#include
#include "knnquery_cuda_kernel.h"
extern THCState *state;
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data_ptr();
const float *xyz = xyz_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
float *dist2 = dist2_tensor.data_ptr();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
knnquery_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
}
================================================
FILE: classification/modules/pointops/src/knnquery/knnquery_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "knnquery_cuda_kernel.h"
// Brute-force kNN: each thread owns one query point and keeps its nsample
// nearest neighbours in a sorted insertion list.
// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample), ascending by distance
// NOTE: the fixed local buffers cap nsample at 200.
__global__ void knnquery_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;  // guard grid tail

    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;
    // BUGFIX: dist2 was never offset, so every thread wrote the first
    // nsample entries of the whole tensor (a data race leaving the rest of
    // dist2 untouched). Offset it per (batch, query) exactly like idx.
    dist2 += bs_idx * m * nsample + pt_idx * nsample;

    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    // Sorted best-list (ascending), seeded with a huge sentinel distance.
    double best[200];
    int besti[200];
    for(int i = 0; i < nsample; i++){
        best[i] = 1e40;
        besti[i] = 0;
    }
    for(int k = 0; k < n; k++){
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        // Insertion sort step: find the slot, shift worse entries down.
        for(int j = 0; j < nsample; j++){
            if(d2 < best[j]){
                for(int i = nsample - 1; i > j; i--){
                    best[i] = best[i - 1];
                    besti[i] = besti[i - 1];
                }
                best[j] = d2;
                besti[j] = k;
                break;
            }
        }
    }
    for(int i = 0; i < nsample; i++){
        idx[i] = besti[i];
        dist2[i] = best[i];
    }
}
// Launch the kNN kernel on the given stream.
// param new_xyz: (B, m, 3)
// param xyz: (B, n, 3)
// param idx: (B, m, nsample)
// FIX 1: the execution configuration between <<< >>> was missing; restored
// (including the stream argument this launcher receives).
// FIX 2: `err` was declared but the error check was commented out; enabled
// so launch failures are reported instead of silently ignored.
void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) {
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    knnquery_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
================================================
FILE: classification/modules/pointops/src/knnquery/knnquery_cuda_kernel.h
================================================
#ifndef _KNNQUERY_CUDA_KERNEL
#define _KNNQUERY_CUDA_KERNEL
#include
#include
#include
void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/pointops/src/knnquery_heap/__init__.py
================================================
================================================
FILE: classification/modules/pointops/src/knnquery_heap/knnquery_heap_cuda.cpp
================================================
#include
#include
#include
#include
#include "knnquery_heap_cuda_kernel.h"
extern THCState *state;
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
void knnquery_heap_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data_ptr();
const float *xyz = xyz_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
float *dist2 = dist2_tensor.data_ptr();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
knnquery_heap_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
}
================================================
FILE: classification/modules/pointops/src/knnquery_heap/knnquery_heap_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "knnquery_heap_cuda_kernel.h"
// Exchange the two float values pointed to by x and y.
__device__ void swap_float(float *x, float *y)
{
    const float held = *x;
    *x = *y;
    *y = held;
}
// Exchange the two int values pointed to by x and y.
__device__ void swap_int(int *x, int *y)
{
    const int held = *x;
    *x = *y;
    *y = held;
}
// Sift the root of a max-heap over dist[0..k) down until the heap property
// holds again; idx is permuted in lockstep so each distance keeps its
// point index.
__device__ void reheap(float *dist, int *idx, int k)
{
    int root = 0;
    int child = root * 2 + 1;
    while (child < k)
    {
        // Pick the larger of the two children.
        if(child + 1 < k && dist[child+1] > dist[child])
            child++;
        // Stop once the parent is strictly larger than its larger child.
        if(dist[root] > dist[child])
            return;
        swap_float(&dist[root], &dist[child]);
        swap_int(&idx[root], &idx[child]);
        root = child;
        child = root * 2 + 1;
    }
}
// Heap-sort tail: given a max-heap in dist[0..k) (idx carried along),
// repeatedly move the root to the end of the shrinking range, leaving
// dist/idx sorted ascending.
__device__ void heap_sort(float *dist, int *idx, int k)
{
    int i;
    for (i = k - 1; i > 0; i--)
    {
        swap_float(&dist[0], &dist[i]);
        swap_int(&idx[0], &idx[i]);
        reheap(dist, idx, i);  // restore the heap over the remaining prefix
    }
}
// kNN via a per-thread fixed-size max-heap: keeps the nsample smallest
// squared distances seen so far, then heap-sorts them ascending.
// One thread per query point; 2D grid (blockIdx.x: points, blockIdx.y: batch).
// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
// NOTE: the fixed local buffers cap nsample at 100.
__global__ void knnquery_heap_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;  // guard grid tail

    // Shift pointers to this (batch, query point) slice.
    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;
    dist2 += bs_idx * m * nsample + pt_idx * nsample;

    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    // Max-heap of the current nsample best squared distances; the root is
    // the worst of the kept candidates.
    float best_dist[100];
    int best_idx[100];
    for(int i = 0; i < nsample; i++){
        best_dist[i] = 1e10;
        best_idx[i] = 0;
    }
    for(int i = 0; i < n; i++){
        float x = xyz[i * 3 + 0];
        float y = xyz[i * 3 + 1];
        float z = xyz[i * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        // Replace the current worst (heap root) when a closer point appears.
        if (d2 < best_dist[0]){
            best_dist[0] = d2;
            best_idx[0] = i;
            reheap(best_dist, best_idx, nsample);
        }
    }
    // Sort ascending before writing out.
    heap_sort(best_dist, best_idx, nsample);
    for(int i = 0; i < nsample; i++){
        idx[i] = best_idx[i];
        dist2[i] = best_dist[i];
    }
}
// Launch the heap-based kNN kernel on the given stream.
// param new_xyz: (B, m, 3)
// param xyz: (B, n, 3)
// param idx: (B, m, nsample)
// FIX: the execution configuration between <<< >>> was missing; restored
// (including the stream argument this launcher receives).
void knnquery_heap_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) {
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    knnquery_heap_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2);

    // Surface launch-configuration errors immediately.
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
================================================
FILE: classification/modules/pointops/src/knnquery_heap/knnquery_heap_cuda_kernel.h
================================================
#ifndef _KNNQUERY_HEAP_CUDA_KERNEL
#define _KNNQUERY_HEAP_CUDA_KERNEL
#include
#include
#include
void knnquery_heap_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void knnquery_heap_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/pointops/src/pointops_api.cpp
================================================
#include
#include
#include "ballquery/ballquery_cuda_kernel.h"
#include "grouping/grouping_cuda_kernel.h"
#include "grouping_int/grouping_int_cuda_kernel.h"
#include "sampling/sampling_cuda_kernel.h"
#include "interpolation/interpolation_cuda_kernel.h"
#include "knnquery/knnquery_cuda_kernel.h"
#include "knnquery_heap/knnquery_heap_cuda_kernel.h"
// Python bindings for the pointops extension. Note several exports keep the
// original Python name while binding the `_fast` C++ variant.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("ballquery_cuda", &ballquery_cuda_fast, "ballquery_cuda_fast"); // name in python, cpp function address, docs
    m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda");
    m.def("knnquery_heap_cuda", &knnquery_heap_cuda, "knnquery_heap_cuda");
    m.def("grouping_forward_cuda", &grouping_forward_cuda_fast, "grouping_forward_cuda_fast");
    m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda");
    m.def("grouping_int_forward_cuda", &grouping_int_forward_cuda_fast, "grouping_int_forward_cuda_fast");
    m.def("gathering_forward_cuda", &gathering_forward_cuda, "gathering_forward_cuda");
    m.def("gathering_backward_cuda", &gathering_backward_cuda, "gathering_backward_cuda");
    m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda");
    m.def("nearestneighbor_cuda", &nearestneighbor_cuda_fast, "nearestneighbor_cuda_fast");
    m.def("interpolation_forward_cuda", &interpolation_forward_cuda_fast, "interpolation_forward_cuda_fast");
    m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda");
}
================================================
FILE: classification/modules/pointops/src/sampling/sampling_cuda.cpp
================================================
#include
#include
#include
#include
#include "sampling_cuda_kernel.h"
extern THCState *state;
void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
const float *points = points_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
float *out = out_tensor.data_ptr();
gathering_forward_cuda_launcher(b, c, n, m, points, idx, out);
}
void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
{
const float *grad_out = grad_out_tensor.data_ptr();
const int *idx = idx_tensor.data_ptr();
float *grad_points = grad_points_tensor.data_ptr();
gathering_backward_cuda_launcher(b, c, n, m, grad_out, idx, grad_points);
}
void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor)
{
const float *points = points_tensor.data_ptr();
float *temp = temp_tensor.data_ptr();
int *idx = idx_tensor.data_ptr();
furthestsampling_cuda_launcher(b, n, m, points, temp, idx);
}
================================================
FILE: classification/modules/pointops/src/sampling/sampling_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "sampling_cuda_kernel.h"
// input: points(b, c, n) idx(b, m)
// output: out(b, c, m)
// For every (batch, channel, sample): out[i][l][j] = points[i][l][idx[i][j]].
__global__ void gathering_forward_cuda_kernel(int b, int c, int n, int m, const float *points, const int *idx, float *out)
{
    // Grid-stride over batches (blockIdx.x) and channels (blockIdx.y);
    // threads stride over the m gathered samples.
    for (int batch = blockIdx.x; batch < b; batch += gridDim.x)
    {
        for (int chan = blockIdx.y; chan < c; chan += gridDim.y)
        {
            const float *src_row = points + (batch * c + chan) * n;
            float *dst_row = out + (batch * c + chan) * m;
            for (int col = threadIdx.x; col < m; col += blockDim.x)
            {
                dst_row[col] = src_row[idx[batch * m + col]];
            }
        }
    }
}
// input: grad_out(b, c, m) idx(b, m)
// output: grad_points(b, c, n)
// Scatter-add of gathered-sample gradients back to their source points.
__global__ void gathering_backward_cuda_kernel(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points)
{
    for (int batch = blockIdx.x; batch < b; batch += gridDim.x)
    {
        for (int chan = blockIdx.y; chan < c; chan += gridDim.y)
        {
            const float *grad_row = grad_out + (batch * c + chan) * m;
            float *dst_row = grad_points + (batch * c + chan) * n;
            for (int col = threadIdx.x; col < m; col += blockDim.x)
            {
                // Several samples may reference the same source point, so
                // the accumulation must be atomic.
                atomicAdd(dst_row + idx[batch * m + col], grad_row[col]);
            }
        }
    }
}
// Launch gather forward on a (b, c) grid with threads over the m samples.
// FIX: the execution configuration between <<< >>> was missing; restored.
void gathering_forward_cuda_launcher(int b, int c, int n, int m, const float *points, const int *idx, float *out)
{
    gathering_forward_cuda_kernel<<<dim3(b, c, 1), opt_n_threads(m)>>>(b, c, n, m, points, idx, out);
}
// Launch gather backward on a (b, c) grid with threads over the m samples.
// FIX: the execution configuration between <<< >>> was missing; restored.
void gathering_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points)
{
    gathering_backward_cuda_kernel<<<dim3(b, c, 1), opt_n_threads(m)>>>(b, c, n, m, grad_out, idx, grad_points);
}
// One arg-max reduction step on the shared dist/index arrays: keep at idx1
// the larger of the two distances and the point index that produced it.
__device__ void __update(float *dists, int *dists_i,
                         int idx1, int idx2) {
    const float lhs = dists[idx1];
    const float rhs = dists[idx2];
    const int lhs_i = dists_i[idx1];
    const int rhs_i = dists_i[idx2];
    dists[idx1] = max(lhs, rhs);
    dists_i[idx1] = rhs > lhs ? rhs_i : lhs_i;
}
// Iterative farthest point sampling, one CUDA block per batch element.
// Input dataset: (b, n, 3), tmp: (b, n)
// Ouput idxs (b, m)
// temp caches each point's running min squared distance to the selected
// set (temp[k] = min(temp[k], d)); the caller is expected to pre-fill it
// with a large value — confirm against the Python-side wrapper.
// block_size must be a power of two equal to blockDim.x: the unrolled
// shared-memory reduction below relies on it.
// FIX: the template parameter list was missing from the `template` line
// (the body uses `block_size` as a compile-time constant and the launcher
// instantiates furthestsampling_cuda_kernel<N>); restored.
template <unsigned int block_size>
__global__ void furthestsampling_cuda_kernel(int b, int n, int m, const float *dataset, float *temp, int *idxs)
{
    if (m <= 0)
        return;
    // Shared scratch for the block-wide arg-max reduction.
    __shared__ float dists[block_size];
    __shared__ int dists_i[block_size];

    int batch_index = blockIdx.x;
    dataset += batch_index * n * 3;
    temp += batch_index * n;
    idxs += batch_index * m;

    int tid = threadIdx.x;
    const int stride = block_size;

    int old = 0;  // index of the most recently selected point
    if (threadIdx.x == 0)
        idxs[0] = old;  // sampling is always seeded with point 0

    __syncthreads();
    for (int j = 1; j < m; j++)
    {
        int besti = 0;
        float best = -1;
        float x1 = dataset[old * 3 + 0];
        float y1 = dataset[old * 3 + 1];
        float z1 = dataset[old * 3 + 2];
        // Each thread scans a strided subset of points, folding the distance
        // to the newly selected point into temp and tracking its local
        // farthest candidate.
        for (int k = tid; k < n; k += stride)
        {
            float x2, y2, z2;
            x2 = dataset[k * 3 + 0];
            y2 = dataset[k * 3 + 1];
            z2 = dataset[k * 3 + 2];
            float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
            float d2 = min(d, temp[k]);
            temp[k] = d2;
            besti = d2 > best ? k : besti;
            best = d2 > best ? d2 : best;
        }
        dists[tid] = best;
        dists_i[tid] = besti;
        __syncthreads();

        // Unrolled power-of-two tree reduction to the global arg-max.
        if (block_size >= 1024) {
            if (tid < 512) {
                __update(dists, dists_i, tid, tid + 512);
            }
            __syncthreads();
        }
        if (block_size >= 512) {
            if (tid < 256) {
                __update(dists, dists_i, tid, tid + 256);
            }
            __syncthreads();
        }
        if (block_size >= 256) {
            if (tid < 128) {
                __update(dists, dists_i, tid, tid + 128);
            }
            __syncthreads();
        }
        if (block_size >= 128) {
            if (tid < 64) {
                __update(dists, dists_i, tid, tid + 64);
            }
            __syncthreads();
        }
        if (block_size >= 64) {
            if (tid < 32) {
                __update(dists, dists_i, tid, tid + 32);
            }
            __syncthreads();
        }
        if (block_size >= 32) {
            if (tid < 16) {
                __update(dists, dists_i, tid, tid + 16);
            }
            __syncthreads();
        }
        if (block_size >= 16) {
            if (tid < 8) {
                __update(dists, dists_i, tid, tid + 8);
            }
            __syncthreads();
        }
        if (block_size >= 8) {
            if (tid < 4) {
                __update(dists, dists_i, tid, tid + 4);
            }
            __syncthreads();
        }
        if (block_size >= 4) {
            if (tid < 2) {
                __update(dists, dists_i, tid, tid + 2);
            }
            __syncthreads();
        }
        if (block_size >= 2) {
            if (tid < 1) {
                __update(dists, dists_i, tid, tid + 1);
            }
            __syncthreads();
        }

        old = dists_i[0];  // global farthest point becomes the next sample
        if (tid == 0)
            idxs[j] = old;
    }
}
// Launch farthest point sampling: one block per batch element; the block
// size (from opt_n_threads in ../cuda_utils.h) selects the matching
// template instantiation so the in-kernel reduction is fully unrolled.
// FIX: the execution configuration between <<< >>> was missing on every
// instantiation; restored.
void furthestsampling_cuda_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs)
{
    unsigned int n_threads = opt_n_threads(n);
    switch (n_threads) {
        case 1024:
            furthestsampling_cuda_kernel<1024><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 512:
            furthestsampling_cuda_kernel<512><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 256:
            furthestsampling_cuda_kernel<256><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 128:
            furthestsampling_cuda_kernel<128><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 64:
            furthestsampling_cuda_kernel<64><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 32:
            furthestsampling_cuda_kernel<32><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 16:
            furthestsampling_cuda_kernel<16><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 8:
            furthestsampling_cuda_kernel<8><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 4:
            furthestsampling_cuda_kernel<4><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 2:
            furthestsampling_cuda_kernel<2><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        case 1:
            furthestsampling_cuda_kernel<1><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
            break;
        default:
            furthestsampling_cuda_kernel<512><<<b, n_threads>>>(b, n, m, dataset, temp, idxs);
    }
}
================================================
FILE: classification/modules/pointops/src/sampling/sampling_cuda_kernel.h
================================================
#ifndef _SAMPLING_CUDA_KERNEL
#define _SAMPLING_CUDA_KERNEL
#include
#include
#include
void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void gathering_forward_cuda_launcher(int b, int c, int n, int m, const float *points, const int *idx, float *out);
void gathering_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points);
void furthestsampling_cuda_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: classification/modules/polar_utils.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch
import numpy as np
def xyz2sphere(xyz, normalize=True):
    """
    Convert XYZ to Spherical Coordinate
    reference: https://en.wikipedia.org/wiki/Spherical_coordinate_system

    :param xyz: [B, N, 3] / [B, N, G, 3]
    :param normalize: map theta to [0, 1] and phi to [0, 1]
    :return: (rho, theta, phi) [B, N, 3] / [B, N, G, 3]
    """
    rho = torch.sum(xyz ** 2, dim=-1, keepdim=True).sqrt()
    rho = rho.clamp(min=0)  # range: [0, inf]
    theta = torch.acos(xyz[..., 2, None] / rho)  # range: [0, pi]
    phi = torch.atan2(xyz[..., 1, None], xyz[..., 0, None])  # range: [-pi, pi]
    # points at the origin divide by zero above and yield NaN theta; reset to 0
    theta[rho == 0] = 0
    if normalize:
        theta = theta / np.pi  # [0, 1]
        phi = phi / (2 * np.pi) + .5  # [0, 1]
    return torch.cat([rho, theta, phi], dim=-1)
def xyz2cylind(xyz, normalize=True):
    """
    Convert XYZ to Cylindrical Coordinate
    reference: https://en.wikipedia.org/wiki/Cylindrical_coordinate_system

    :param normalize: Normalize phi & z
    :param xyz: [B, N, 3] / [B, N, G, 3]
    :return: (rho, phi, z) [B, N, 3]
    """
    rho = torch.sum(xyz[..., :2] ** 2, dim=-1, keepdim=True).sqrt()
    rho = torch.clamp(rho, 0, 1)  # range: [0, 1]
    phi = torch.atan2(xyz[..., 1, None], xyz[..., 0, None])  # range: [-pi, pi]
    z = torch.clamp(xyz[..., 2, None], -1, 1)  # range: [-1, 1]
    if normalize:
        phi = phi / (2 * np.pi) + .5
        z = (z + 1.) / 2.
    return torch.cat([rho, phi, z], dim=-1)
================================================
FILE: classification/modules/ptaug_utils.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch
#################
# MAIN
#################
def get_aug_args(args):
    """Return the per-dataset augmentation hyperparameters for `args.dataset`.

    :param args: namespace with a `dataset` attribute
    :return: dict with 'scale_factor' and 'shift_factor'
    :raises Exception: for datasets without configured augmentations
    """
    if args.dataset == 'ScanObjectNN':
        return {'scale_factor': 0.5, 'shift_factor': 0.3}
    raise Exception('No such dataset')
def transform_point_cloud(batch, args, aug_args, train=True, label=None):
    """batch: B x 3/6 x N — apply the augmentations toggled on `args` to the
    xyz channels in place; pass `label` through unchanged when given.
    (`train` is currently unused; kept for interface compatibility.)
    """
    if args.aug_scale:
        batch[:, 0:3] = scale_point_cloud(batch[:, 0:3], aug_args['scale_factor'])
    if args.aug_shift:
        batch[:, 0:3] = shift_point_cloud(batch[:, 0:3], shift_range=aug_args['shift_factor'])
    return (batch, label) if label is not None else batch
#################
# Shift
#################
def shift_point_cloud(batch_data, shift_range=0.2):
    """ Randomly shift point cloud. Shift is per point cloud (in place).
        Input:
          B x C x N array, original batch of point clouds
        Return:
          B x C x N array, shifted batch of point clouds
    """
    num_clouds = batch_data.shape[0]
    # one uniform offset in [-shift_range, shift_range]^3 per cloud
    offsets = (torch.rand(num_clouds, 3, 1, device=batch_data.device) * 2. - 1.) * shift_range
    batch_data += offsets
    return batch_data
#################
# Scale
#################
def scale_point_cloud(batch_data, scale_range=0.2):
    """ Randomly scale the point cloud. Scale is per point cloud (in place).
        Input:
          B x C x N array, original batch of point clouds
        Return:
          B x C x N array, scaled batch of point clouds
    """
    num_clouds = batch_data.shape[0]
    # one uniform per-axis factor in [1 - scale_range, 1 + scale_range] per cloud
    factors = (torch.rand(num_clouds, 3, 1, device=batch_data.device) * 2. - 1.) * scale_range + 1.
    batch_data *= factors
    return batch_data
================================================
FILE: classification/modules/recons_utils.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch
from torch import nn
from modules.pointnet2_utils import query_knn_point, index_points
def _recons_factory(type):
if type == 'knn':
return knn_recons
else:
raise Exception('Not Implemented Reconstruction Type')
def knn_recons(k, center, context, cuda=False):
    """Group each center point with its k nearest context points.

    :param k: number of neighbours per center
    :param center: centroid points, [B, N, C]
    :param context: candidate points searched for neighbours
    :param cuda: forwarded to the pointops helpers
    :return: grouped coordinates [B, N, K, C]
    """
    neighbor_idx = query_knn_point(k, context, center, cuda=cuda)
    torch.cuda.empty_cache()
    grouped_xyz = index_points(context, neighbor_idx, cuda=cuda, is_group=True)  # [B, N, K, C]
    torch.cuda.empty_cache()
    return grouped_xyz
def cal_normal(group_xyz, random_inv=False, is_group=False):
    """
    Calculate Normal Vector (Unit Form + First Term Positive)

    :param group_xyz: [B, N, K=3, 3] / [B, N, G, K=3, 3]
    :param random_inv: randomly flip normals per batch element (prob 0.5)
    :param is_group: input carries an extra group dimension G
    :return: [B, N, 3] / [B, N, G, 3]
    """
    vec_a = group_xyz[..., 1, :] - group_xyz[..., 0, :]  # [B, N, 3]
    vec_b = group_xyz[..., 2, :] - group_xyz[..., 0, :]  # [B, N, 3]
    raw_nor = torch.cross(vec_a, vec_b, dim=-1)
    unit_nor = raw_nor / torch.norm(raw_nor, dim=-1, keepdim=True)  # [B, N, 3] / [B, N, G, 3]

    # canonical orientation: force the x component positive
    if is_group:
        pos_mask = (unit_nor[..., 0:1, 0] > 0).float() * 2. - 1.
    else:
        pos_mask = (unit_nor[..., 0] > 0).float() * 2. - 1.
    unit_nor = unit_nor * pos_mask.unsqueeze(-1)

    # batch-wise random inverse normal vector (prob: 0.5)
    if random_inv:
        random_mask = torch.randint(0, 2, (group_xyz.size(0), 1, 1)).float() * 2. - 1.
        random_mask = random_mask.to(unit_nor.device)
        if is_group:
            unit_nor = unit_nor * random_mask.unsqueeze(-1)
        else:
            unit_nor = unit_nor * random_mask
    return unit_nor
def pca(X, k, center=True):
    """
    Principal Components Analysis impl. with SVD function

    :param X: [n, d] data matrix
    :param k: number of components to keep
    :param center: subtract the column means before decomposition
    :return: dict carrying the inputs plus `components` and `explained_variance`
    """
    n = X.size()[0]
    if center:
        col = torch.ones(n).view([n, 1])
        # H = I - (1/n) 1 1^T removes the column means
        mean_op = (1 / n) * torch.mm(col, col.t())
    else:
        mean_op = torch.zeros(n * n).view([n, n])
    projector = torch.eye(n) - mean_op
    centered = torch.mm(projector.double(), X.double())
    u, s, v = torch.svd(centered)
    return {'X': X, 'k': k, 'components': v[:k].t(),
            'explained_variance': torch.mul(s[:k], s[:k]) / (n - 1)}
def cal_center(group_xyz):
    """
    Calculate Global Coordinates of the Center of Triangle

    :param group_xyz: [B, N, K, 3] / [B, N, G, K, 3]; K >= 3
    :return: [B, N, 3] / [B, N, G, 3]
    """
    return group_xyz.mean(dim=-2)
def cal_area(group_xyz):
    """
    Calculate Area of Triangle (proportional: the determinant identity used
    here yields twice the geometric area; the 1/2 factor is omitted)

    :param group_xyz: [B, N, K, 3] / [B, N, G, K, 3]; K = 3
    :return: [B, N, 1] / [B, N, G, 1]
    """
    pad_shape = group_xyz[..., 0, None].shape
    # FIX: build the ones column on the input's device/dtype — the previous
    # torch.ones(pad_shape) was always CPU float32 and failed for CUDA inputs.
    ones = torch.ones(pad_shape, device=group_xyz.device, dtype=group_xyz.dtype)
    det_xy = torch.det(torch.cat([group_xyz[..., 0, None], group_xyz[..., 1, None], ones], dim=-1))
    det_yz = torch.det(torch.cat([group_xyz[..., 1, None], group_xyz[..., 2, None], ones], dim=-1))
    det_zx = torch.det(torch.cat([group_xyz[..., 2, None], group_xyz[..., 0, None], ones], dim=-1))
    area = torch.sqrt(det_xy ** 2 + det_yz ** 2 + det_zx ** 2).unsqueeze(-1)
    return area
def cal_const(normal, center, is_normalize=True):
    """
    Calculate Constant Term (Standard Version, with x_normal to be 1)
    math::
        const = x_nor * x_0 + y_nor * y_0 + z_nor * z_0

    :param is_normalize: divide the constant by sqrt(3)
    :param normal: [B, N, 3] / [B, N, G, 3]
    :param center: [B, N, 3] / [B, N, G, 3]
    :return: [B, N, 1] / [B, N, G, 1]
    """
    const = torch.sum(normal * center, dim=-1, keepdim=True)
    if is_normalize:
        const = const / torch.sqrt(torch.Tensor([3])).to(normal.device)
    return const
def check_nan(normal, center, pos=None):
    """
    Check & Remove NaN in normal tensor
    Rows whose normal contains any NaN are overwritten (per batch element)
    with the values of that batch's first NaN-free row. Note: `normal`,
    `center` (and `pos`) are modified in place.

    :param pos: [B, N, 1]
    :param center: [B, N, 3]
    :param normal: [B, N, 3]
    :return: (normal, center) or (normal, center, pos) with NaN rows replaced
    """
    B, N, _ = normal.shape
    # mask[b, i] is True when normal[b, i] has at least one NaN component
    mask = torch.sum(torch.isnan(normal), dim=-1) > 0
    # index of the first NaN-free row per batch element (argmax picks the
    # first maximal position of the inverted mask)
    mask_first = torch.argmax((~mask).int(), dim=-1)

    # broadcast the first valid row over all N positions, then copy it into
    # the masked (NaN) rows only
    normal_first = normal[torch.arange(B), None, mask_first].repeat([1, N, 1])
    normal[mask] = normal_first[mask]
    center_first = center[torch.arange(B), None, mask_first].repeat([1, N, 1])
    center[mask] = center_first[mask]

    if pos is not None:
        pos_first = pos[torch.arange(B), None, mask_first].repeat([1, N, 1])
        pos[mask] = pos_first[mask]
        return normal, center, pos
    return normal, center
def check_nan_umb(normal, center, pos=None):
    """
    Check & Remove NaN in normal tensor (umbrella variant).
    Every group entry whose normal contains a NaN is replaced (in place) by
    the first NaN-free entry of the same (batch, point) pair; the same
    substitution is applied to center and, when given, pos.
    :param pos: [B, N, G, 1]
    :param center: [B, N, G, 3]
    :param normal: [B, N, G, 3]
    :return: (normal, center) or (normal, center, pos) — same tensors, mutated in place
    """
    B, N, G, _ = normal.shape
    # mask[b, n, g] is True when any coordinate of normal[b, n, g] is NaN
    mask = torch.sum(torch.isnan(normal), dim=-1) > 0
    # first NaN-free group index per (batch, point)
    # NOTE(review): if every group entry is NaN, argmax returns 0 and the NaN
    # survives — assumes at least one valid triangle per point.
    mask_first = torch.argmax((~mask).int(), dim=-1)
    b_idx = torch.arange(B).unsqueeze(1).repeat([1, N])
    n_idx = torch.arange(N).unsqueeze(0).repeat([B, 1])
    # broadcast that first valid entry over G and substitute where masked
    normal_first = normal[b_idx, n_idx, None, mask_first].repeat([1, 1, G, 1])
    normal[mask] = normal_first[mask]
    center_first = center[b_idx, n_idx, None, mask_first].repeat([1, 1, G, 1])
    center[mask] = center_first[mask]
    if pos is not None:
        pos_first = pos[b_idx, n_idx, None, mask_first].repeat([1, 1, G, 1])
        pos[mask] = pos_first[mask]
        return normal, center, pos
    return normal, center
class SurfaceConstructor(nn.Module):
    """
    Surface Constructor for Point Clouds.
    Builds one triangle per centroid from its reconstructed neighborhood and
    returns the triangle normal, centroid and (optionally) the plane-constant
    term.
    Formulation of A Surface:
        A * (x - x_0) + B * (y - y_0) + C * (z - z_0) = 0,
        where A^2 + B^2 + C^2 = 1 & A > 0
    """
    def __init__(self, r=None, k=3, recons_type='knn', return_dist=False, random_inv=True, cuda=False):
        super(SurfaceConstructor, self).__init__()
        self.K = k  # neighborhood size for triangle reconstruction
        self.R = r  # radius (used by ball-query style reconstruction)
        self.recons = _recons_factory(recons_type)  # neighborhood-reconstruction strategy
        self.cuda = cuda  # whether to use the custom CUDA ops
        self.return_dist = return_dist  # also return the plane-constant term
        self.random_inv = random_inv  # randomly flip normal orientation
    def forward(self, center, context):
        """
        Input:
            center: input points position as centroid points, [B, 3, N]
            context: input points position as context points, [B, 3, N']
        Output:
            normal: normals of constructed triangles, [B, 3, N]
            center: centroids of constructed triangles, [B, 3, N]
            pos: position info of constructed triangles, [B, 1, N]
        """
        # channel-first -> channel-last for the geometry ops
        center = center.permute(0, 2, 1)
        context = context.permute(0, 2, 1)
        group_xyz = self.recons(self.K, center, context, cuda=self.cuda)
        normal = cal_normal(group_xyz, random_inv=self.random_inv)
        center = cal_center(group_xyz)
        if self.return_dist:
            pos = cal_const(normal, center)
            # scrub NaNs from degenerate triangles before returning
            normal, center, pos = check_nan(normal, center, pos)
            normal = normal.permute(0, 2, 1)
            center = center.permute(0, 2, 1)
            pos = pos.permute(0, 2, 1)
            return normal, center, pos
        normal, center = check_nan(normal, center)
        normal = normal.permute(0, 2, 1)
        center = center.permute(0, 2, 1)
        return normal, center
if __name__ == '__main__':
    # smoke test: random point cloud in [-1, 1]^3, channel-first layout [B, 3, N]
    xyz = torch.rand(1, 3, 1024) * 2. - 1.
    constructor = SurfaceConstructor(return_dist=True)
    normal, center, pos = constructor(xyz, xyz)
    print(normal.shape)
    print(center.shape)
================================================
FILE: classification/modules/repsurface_utils.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from modules.pointnet2_utils import farthest_point_sample, index_points, query_knn_point, query_ball_point
from modules.polar_utils import xyz2sphere
from modules.recons_utils import cal_const, cal_normal, cal_center, check_nan_umb
def sample_and_group(npoint, radius, nsample, center, normal, feature, return_normal=True, return_polar=False, cuda=False):
    """
    FPS-sample npoint centroids, ball-query nsample neighbors around each,
    and assemble the grouped features.
    Input:
        center: input points position data
        normal: input points normal data
        feature: input points feature (or None)
    Return:
        new_center: sampled points position data
        new_normal: sampled points normal data
        new_feature: grouped features (relative coords [+ polar] [+ normals] [+ features])
    """
    # sample
    fps_idx = farthest_point_sample(center, npoint, cuda=cuda)  # [B, npoint]
    torch.cuda.empty_cache()
    # sample center
    new_center = index_points(center, fps_idx, cuda=cuda, is_group=False)
    torch.cuda.empty_cache()
    # sample normal
    new_normal = index_points(normal, fps_idx, cuda=cuda, is_group=False)
    torch.cuda.empty_cache()
    # group
    idx = query_ball_point(radius, nsample, center, new_center, cuda=cuda)
    torch.cuda.empty_cache()
    # group normal
    group_normal = index_points(normal, idx, cuda=cuda, is_group=True)  # [B, npoint, nsample, C]
    torch.cuda.empty_cache()
    # group center
    group_center = index_points(center, idx, cuda=cuda, is_group=True)  # [B, npoint, nsample, 3]
    torch.cuda.empty_cache()
    # express grouped coordinates relative to their sampled centroid
    group_center_norm = group_center - new_center.unsqueeze(2)
    torch.cuda.empty_cache()
    # group polar: append spherical coordinates of the relative positions
    if return_polar:
        group_polar = xyz2sphere(group_center_norm)
        group_center_norm = torch.cat([group_center_norm, group_polar], dim=-1)
    if feature is not None:
        group_feature = index_points(feature, idx, cuda=cuda, is_group=True)
        new_feature = torch.cat([group_center_norm, group_normal, group_feature], dim=-1) if return_normal \
            else torch.cat([group_center_norm, group_feature], dim=-1)
    else:
        new_feature = torch.cat([group_center_norm, group_normal], dim=-1)
    return new_center, new_normal, new_feature
def sample_and_group_all(center, normal, feature, return_normal=True, return_polar=False):
    """
    Group the whole point set into a single group whose centroid is the origin.
    Input:
        center: input centroid position data, [B, N, 3]
        normal: input normal data, [B, N, C]
        feature: input feature data, [B, N, D]
    Return:
        new_center: [B, 1, 3] zeros
        new_normal: [B, 1, 3] (the same zero tensor, as in the original design)
        new_feature: [B, 1, N, ...] concatenated group features
    """
    B, N, C = normal.shape
    new_center = torch.zeros(B, 1, 3).to(center.device)
    new_normal = new_center
    group_center = center.view(B, 1, N, 3)
    group_normal = normal.view(B, 1, N, C)
    if return_polar:
        # append spherical coordinates of the (absolute) positions
        group_center = torch.cat([group_center, xyz2sphere(group_center)], dim=-1)
    if return_normal:
        parts = [group_center, group_normal, feature.view(B, 1, N, -1)]
    else:
        parts = [group_center, feature.view(B, 1, N, -1)]
    new_feature = torch.cat(parts, dim=-1)
    return new_center, new_normal, new_feature
def resort_points(points, idx):
    """
    Reorder the entries of each (batch, point) pair along the G axis
    according to a per-pair index permutation.
    :param points: [B, N, G, C]
    :param idx: [B, N, G] indices into the G axis
    :return: [B, N, G, C] reordered points
    """
    B, N, G, _ = points.shape
    device = points.device
    # advanced-indexing companions for the batch and point axes
    batch_idx = torch.arange(B, dtype=torch.long, device=device).view(B, 1, 1).expand(B, N, G)
    point_idx = torch.arange(N, dtype=torch.long, device=device).view(1, N, 1).expand(B, N, G)
    return points[batch_idx, point_idx, idx, :]
def group_by_umbrella(xyz, new_xyz, k=9, cuda=False):
    """
    Group a set of points into umbrella surfaces: for each query point, take
    k-1 nearest neighbors, sort them by azimuth around the query, and build
    one triangle per consecutive neighbor pair with the query as the apex.
    :return: [B, N', K-1, 3 (points), 3 (coord.)] triangle vertex sets
    """
    idx = query_knn_point(k, xyz, new_xyz, cuda=cuda)
    torch.cuda.empty_cache()
    # drop the first neighbor (presumably the query point itself under kNN
    # when xyz == new_xyz — TODO confirm against query_knn_point)
    group_xyz = index_points(xyz, idx, cuda=cuda, is_group=True)[:, :, 1:]  # [B, N', K-1, 3]
    torch.cuda.empty_cache()
    group_xyz_norm = group_xyz - new_xyz.unsqueeze(-2)
    # azimuthal angle of each neighbor; sorting makes consecutive entries adjacent
    group_phi = xyz2sphere(group_xyz_norm)[..., 2]  # [B, N', K-1]
    sort_idx = group_phi.argsort(dim=-1)  # [B, N', K-1]
    # [B, N', K-1, 1, 3]
    sorted_group_xyz = resort_points(group_xyz_norm, sort_idx).unsqueeze(-2)
    # roll pairs each neighbor with the next (cyclically) to close the fan
    sorted_group_xyz_roll = torch.roll(sorted_group_xyz, -1, dims=-3)
    group_centriod = torch.zeros_like(sorted_group_xyz)  # apex at the local origin
    umbrella_group_xyz = torch.cat([group_centriod, sorted_group_xyz, sorted_group_xyz_roll], dim=-2)
    return umbrella_group_xyz
class SurfaceAbstraction(nn.Module):
    """
    Surface Abstraction Module.
    PointNet++-style set abstraction: sample & group, run a shared
    Conv2d+BN+ReLU MLP, then max-pool over each neighborhood.
    """
    def __init__(self, npoint, radius, nsample, in_channel, mlp, group_all, return_polar=True, return_normal=True, cuda=False):
        super(SurfaceAbstraction, self).__init__()
        self.npoint = npoint
        self.radius = radius
        self.nsample = nsample
        self.return_normal = return_normal
        self.return_polar = return_polar
        self.cuda = cuda
        self.group_all = group_all
        self.mlp_convs = nn.ModuleList()
        self.mlp_bns = nn.ModuleList()
        # build the shared per-point MLP as 1x1 Conv2d + BN pairs
        last_channel = in_channel
        for out_channel in mlp:
            self.mlp_convs.append(nn.Conv2d(last_channel, out_channel, 1))
            self.mlp_bns.append(nn.BatchNorm2d(out_channel))
            last_channel = out_channel
    def forward(self, center, normal, feature):
        """
        Channel-first inputs ([B, C, N]); returns channel-first
        (new_center, new_normal, new_feature).
        """
        # channel-first -> channel-last for grouping
        normal = normal.permute(0, 2, 1)
        center = center.permute(0, 2, 1)
        if feature is not None:
            feature = feature.permute(0, 2, 1)
        if self.group_all:
            new_center, new_normal, new_feature = sample_and_group_all(center, normal, feature,
                                                                       return_polar=self.return_polar,
                                                                       return_normal=self.return_normal)
        else:
            new_center, new_normal, new_feature = sample_and_group(self.npoint, self.radius, self.nsample, center,
                                                                   normal, feature, return_polar=self.return_polar,
                                                                   return_normal=self.return_normal, cuda=self.cuda)
        # [B, npoint, nsample, C] -> [B, C, nsample, npoint] for Conv2d
        new_feature = new_feature.permute(0, 3, 2, 1)
        for i, conv in enumerate(self.mlp_convs):
            bn = self.mlp_bns[i]
            new_feature = F.relu(bn(conv(new_feature)))
        # max-pool over the neighbor dimension
        new_feature = torch.max(new_feature, 2)[0]
        new_center = new_center.permute(0, 2, 1)
        new_normal = new_normal.permute(0, 2, 1)
        return new_center, new_normal, new_feature
class SurfaceAbstractionCD(nn.Module):
    """
    Surface Abstraction Module (channel-decomposed variant).
    The first pos_channel input channels (positional part) and the remaining
    channels (content part) go through separate initial 1x1 convs whose
    outputs are summed; the rest behaves like SurfaceAbstraction.
    """
    def __init__(self, npoint, radius, nsample, feat_channel, pos_channel, mlp, group_all,
                 return_normal=True, return_polar=False, cuda=False):
        super(SurfaceAbstractionCD, self).__init__()
        self.npoint = npoint
        self.radius = radius
        self.nsample = nsample
        self.return_normal = return_normal
        self.return_polar = return_polar
        self.cuda = cuda
        self.mlp_convs = nn.ModuleList()
        self.mlp_bns = nn.ModuleList()
        self.pos_channel = pos_channel
        self.group_all = group_all
        # separate entry convs for positional vs. content channels
        self.mlp_l0 = nn.Conv2d(self.pos_channel, mlp[0], 1)
        self.mlp_f0 = nn.Conv2d(feat_channel, mlp[0], 1)
        self.bn_l0 = nn.BatchNorm2d(mlp[0])
        self.bn_f0 = nn.BatchNorm2d(mlp[0])
        # mlp_l0+mlp_f0 can be considered as the first layer of mlp_convs
        last_channel = mlp[0]
        for out_channel in mlp[1:]:
            self.mlp_convs.append(nn.Conv2d(last_channel, out_channel, 1))
            self.mlp_bns.append(nn.BatchNorm2d(out_channel))
            last_channel = out_channel
    def forward(self, center, normal, feature):
        """
        Channel-first inputs ([B, C, N]); returns channel-first
        (new_center, new_normal, new_feature).
        """
        normal = normal.permute(0, 2, 1)
        center = center.permute(0, 2, 1)
        if feature is not None:
            feature = feature.permute(0, 2, 1)
        if self.group_all:
            new_center, new_normal, new_feature = sample_and_group_all(center, normal, feature,
                                                                       return_normal=self.return_normal,
                                                                       return_polar=self.return_polar)
        else:
            new_center, new_normal, new_feature = sample_and_group(self.npoint, self.radius, self.nsample, center,
                                                                   normal, feature, return_normal=self.return_normal,
                                                                   return_polar=self.return_polar, cuda=self.cuda)
        new_feature = new_feature.permute(0, 3, 2, 1)
        # init layer: positional and content parts are embedded separately, then summed
        loc = self.bn_l0(self.mlp_l0(new_feature[:, :self.pos_channel]))
        feat = self.bn_f0(self.mlp_f0(new_feature[:, self.pos_channel:]))
        new_feature = loc + feat
        new_feature = F.relu(new_feature)
        for i, conv in enumerate(self.mlp_convs):
            bn = self.mlp_bns[i]
            new_feature = F.relu(bn(conv(new_feature)))
        # max-pool over the neighbor dimension
        new_feature = torch.max(new_feature, 2)[0]
        new_center = new_center.permute(0, 2, 1)
        new_normal = new_normal.permute(0, 2, 1)
        return new_center, new_normal, new_feature
class UmbrellaSurfaceConstructor(nn.Module):
    """
    Umbrella-based Surface Abstraction Module.
    Builds umbrella repsurf descriptors per point (triangle centroid, polar
    coordinates, normal and optionally the plane constant), maps them through
    a small shared MLP and aggregates over the umbrella triangles.
    """
    def __init__(self, k, in_channel, aggr_type='sum', return_dist=False, random_inv=True, cuda=False):
        super(UmbrellaSurfaceConstructor, self).__init__()
        self.k = k  # kNN size; yields k-1 umbrella triangles per point
        self.return_dist = return_dist  # append the plane-constant channel
        self.random_inv = random_inv  # randomly flip normal orientation
        self.aggr_type = aggr_type  # 'max' | 'avg' | anything else -> sum
        self.cuda = cuda
        self.mlps = nn.Sequential(
            nn.Conv2d(in_channel, in_channel, 1, bias=False),
            nn.BatchNorm2d(in_channel),
            nn.ReLU(True),
            nn.Conv2d(in_channel, in_channel, 1, bias=True),
            nn.BatchNorm2d(in_channel),
            nn.ReLU(True),
            nn.Conv2d(in_channel, in_channel, 1, bias=True),
        )
    def forward(self, center):
        """
        :param center: [B, 3, N] point positions (channel-first)
        :return: [B, in_channel, N] aggregated repsurf features
        """
        center = center.permute(0, 2, 1)
        # surface construction
        group_xyz = group_by_umbrella(center, center, k=self.k, cuda=self.cuda)  # [B, N, K-1, 3 (points), 3 (coord.)]
        # normal
        group_normal = cal_normal(group_xyz, random_inv=self.random_inv, is_group=True)
        # coordinate
        group_center = cal_center(group_xyz)
        # polar
        group_polar = xyz2sphere(group_center)
        if self.return_dist:
            group_pos = cal_const(group_normal, group_center)
            # scrub NaNs produced by degenerate triangles
            group_normal, group_center, group_pos = check_nan_umb(group_normal, group_center, group_pos)
            new_feature = torch.cat([group_center, group_polar, group_normal, group_pos], dim=-1)  # N+P+CP: 10
        else:
            group_normal, group_center = check_nan_umb(group_normal, group_center)
            new_feature = torch.cat([group_center, group_polar, group_normal], dim=-1)
        new_feature = new_feature.permute(0, 3, 2, 1)  # [B, C, G, N]
        # mapping
        new_feature = self.mlps(new_feature)
        # aggregation over the umbrella-triangle dimension
        if self.aggr_type == 'max':
            new_feature = torch.max(new_feature, 2)[0]
        elif self.aggr_type == 'avg':
            new_feature = torch.mean(new_feature, 2)
        else:
            new_feature = torch.sum(new_feature, 2)
        return new_feature
================================================
FILE: classification/scripts/scanobjectnn/repsurf_ssg_umb.sh
================================================
#!/usr/bin/env bash
# Train the RepSurf SSG (umbrella) classifier on ScanObjectNN.
# Run from the classification/ directory; --cuda_ops requires the compiled
# pointops extension, and --gpus 0 targets the first GPU.
set -v
python3 tool/train_cls_scanobjectnn.py \
       --cuda_ops \
       --batch_size 64 \
       --model repsurf.repsurf_ssg_umb \
       --epoch 250 \
       --log_dir repsurf_cls_ssg_umb \
       --gpus 0 \
       --n_workers 12 \
       --return_center \
       --return_dist \
       --return_polar \
       --group_size 8 \
       --umb_pool sum \
       --num_point 1024
================================================
FILE: classification/scripts/scanobjectnn/repsurf_ssg_umb_2x.sh
================================================
#!/usr/bin/env bash
# Train the 2x-width RepSurf SSG (umbrella) classifier on ScanObjectNN.
# Identical launch settings to repsurf_ssg_umb.sh except the model/log names.
set -v
python3 tool/train_cls_scanobjectnn.py \
       --cuda_ops \
       --batch_size 64 \
       --model repsurf.repsurf_ssg_umb_2x \
       --epoch 250 \
       --log_dir repsurf_cls_ssg_umb_2x \
       --gpus 0 \
       --n_workers 12 \
       --return_center \
       --return_dist \
       --return_polar \
       --group_size 8 \
       --umb_pool sum \
       --num_point 1024
================================================
FILE: classification/tool/train_cls_scanobjectnn.py
================================================
"""
Author: Haoxi Ran
Date: 05/10/2022
"""
from functools import partial
import argparse
import numpy as np
import os
import torch
import datetime
import logging
from pathlib import Path
from dataset.ScanObjectNNDataLoader import ScanObjectNNDataLoader
from modules.ptaug_utils import transform_point_cloud, scale_point_cloud, get_aug_args
from modules.pointnet2_utils import sample
from util.utils import get_model, get_loss, set_seed, weight_init
def parse_args():
    """PARAMETERS
    Command-line arguments for ScanObjectNN classification training.
    """
    parser = argparse.ArgumentParser('RepSurf')
    # Basic
    parser.add_argument('--log_dir', type=str, default=None, help='experiment root')
    parser.add_argument('--data_dir', type=str, default='./data', help='data dir')
    parser.add_argument('--log_root', type=str, default='./log', help='log root dir')
    # NOTE(review): the default 'repsurf.scanobjectnn.repsurf_ssg_umb' does not
    # match the visible models/repsurf/ layout; the launch scripts pass
    # 'repsurf.repsurf_ssg_umb' explicitly — confirm the intended default.
    parser.add_argument('--model', default='repsurf.scanobjectnn.repsurf_ssg_umb',
                        help='model file name [default: repsurf_ssg_umb]')
    parser.add_argument('--gpus', nargs='+', type=str, default=None)
    parser.add_argument('--seed', type=int, default=2800, help='Training Seed')
    parser.add_argument('--cuda_ops', action='store_true', default=False,
                        help='Whether to use cuda version operations [default: False]')
    # Training
    parser.add_argument('--batch_size', type=int, default=64, help='batch size in training [default: 64]')
    parser.add_argument('--optimizer', type=str, default='Adam', help='optimizer for training [Adam, SGD]')
    parser.add_argument('--scheduler', type=str, default='step', help='scheduler for training')
    parser.add_argument('--epoch', default=500, type=int, help='number of epoch in training [default: 500]')
    parser.add_argument('--learning_rate', default=0.001, type=float, help='learning rate in training [default: 0.001]')
    parser.add_argument('--decay_rate', type=float, default=1e-4, help='decay rate [default: 1e-4]')
    parser.add_argument('--decay_step', default=20, type=int, help='number of epoch per decay [default: 20]')
    parser.add_argument('--n_workers', type=int, default=4, help='DataLoader Workers Number [default: 4]')
    parser.add_argument('--init', type=str, default=None, help='initializer for model [kaiming, xavier]')
    # Evaluation
    parser.add_argument('--min_val', type=int, default=100, help='Min val epoch [default: 100]')
    # Augmentation
    parser.add_argument('--aug_scale', action='store_true', default=False,
                        help='Whether to augment by scaling [default: False]')
    parser.add_argument('--aug_shift', action='store_true', default=False,
                        help='Whether to augment by shifting [default: False]')
    # Modeling
    parser.add_argument('--num_point', type=int, default=1024, help='Point Number [default: 1024]')
    parser.add_argument('--return_dist', action='store_true', default=False,
                        help='Whether to use signed distance [default: False]')
    parser.add_argument('--return_center', action='store_true', default=False,
                        help='Whether to return center in surface abstraction [default: False]')
    parser.add_argument('--return_polar', action='store_true', default=False,
                        help='Whether to return polar coordinate in surface abstraction [default: False]')
    parser.add_argument('--group_size', type=int, default=8, help='Size of umbrella group [default: 8]')
    parser.add_argument('--umb_pool', type=str, default='sum', help='pooling for umbrella repsurf [sum, mean, max]')
    return parser.parse_args()
def test(model, loader, num_class=15, num_point=1024, num_votes=10, total_num=1):
    """
    Evaluate the classifier with test-time voting.
    Vote 0 uses the unaugmented points ("single" prediction); votes 1..n-1
    re-scale the xyz channels and all predictions are averaged ("vote").
    :param total_num: dataset size used as the accuracy denominator
    :return: (single accuracy, vote accuracy)
    """
    vote_correct = 0
    sing_correct = 0
    classifier = model.eval()
    for j, data in enumerate(loader):
        points, target = data
        points, target = points.cuda(), target.cuda()
        # preprocess: subsample to num_point points
        points = sample(num_point, points)
        # vote
        vote_pool = torch.zeros(target.shape[0], num_class).cuda()
        for i in range(num_votes):
            new_points = points.clone()
            # scale augmentation for every vote except the first
            if i > 0:
                new_points[:, :3] = scale_point_cloud(new_points[:, :3])
            # predict
            pred = classifier(new_points)
            # single (un-augmented) prediction kept from vote 0
            if i == 0:
                sing_pred = pred
            # vote
            vote_pool += pred
        vote_pred = vote_pool / num_votes
        # single pred
        sing_pred_choice = sing_pred.data.max(1)[1]
        sing_correct += sing_pred_choice.eq(target.long().data).cpu().sum()
        # vote pred
        vote_pred_choice = vote_pred.data.max(1)[1]
        vote_correct += vote_pred_choice.eq(target.long().data).cpu().sum()
    sing_acc = sing_correct.item() / total_num
    vote_acc = vote_correct.item() / total_num
    return sing_acc, vote_acc
def main(args):
    """
    Full training entry point: builds the experiment directories and logger,
    loads ScanObjectNN, constructs model/criterion/optimizer/scheduler,
    trains for args.epoch epochs and checkpoints the best voting-accuracy
    model after args.min_val epochs.
    """
    def log_string(s):
        # write to the experiment log file and echo to stdout
        logger.info(s)
        print(s)
    '''HYPER PARAMETER'''
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(args.gpus)
    set_seed(args.seed)
    '''CREATE DIR'''
    experiment_dir = Path(os.path.join(args.log_root, 'PointAnalysis', 'log'))
    experiment_dir.mkdir(exist_ok=True)
    experiment_dir = experiment_dir.joinpath('ScanObjectNN')
    experiment_dir.mkdir(exist_ok=True)
    if args.log_dir is None:
        timestr = str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
        experiment_dir = experiment_dir.joinpath(timestr)
    else:
        experiment_dir = experiment_dir.joinpath(args.log_dir)
    experiment_dir.mkdir(exist_ok=True)
    checkpoints_dir = experiment_dir.joinpath('checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = experiment_dir.joinpath('logs/')
    log_dir.mkdir(exist_ok=True)
    '''LOG'''
    # NOTE(review): args is re-parsed here although main() already received it;
    # redundant but harmless since the same sys.argv is read.
    args = parse_args()
    logger = logging.getLogger("Model")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler('%s/%s.txt' % (log_dir, args.model))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    log_string('PARAMETER ...')
    log_string(args)
    '''DATA LOADING'''
    log_string('Load dataset ...')
    args.num_class = 15
    args.dataset = 'ScanObjectNN'
    args.normal = False
    aug_args = get_aug_args(args)
    DATA_PATH = os.path.join(args.data_dir, 'ScanObjectNN')
    TRAIN_DATASET = ScanObjectNNDataLoader(root=DATA_PATH, split='training')
    TEST_DATASET = ScanObjectNNDataLoader(root=DATA_PATH, split='test')
    trainDataLoader = torch.utils.data.DataLoader(TRAIN_DATASET, batch_size=args.batch_size, shuffle=True,
                                                  num_workers=args.n_workers, drop_last=True)
    testDataLoader = torch.utils.data.DataLoader(TEST_DATASET, batch_size=args.batch_size, shuffle=False,
                                                 num_workers=args.n_workers)
    '''MODEL BUILDING'''
    classifier = torch.nn.DataParallel(get_model(args)).cuda()
    criterion = get_loss().cuda()
    # resume from an existing checkpoint when one is present
    # NOTE(review): the bare except also hides unrelated load errors — consider
    # catching FileNotFoundError explicitly.
    try:
        checkpoint = torch.load(str(experiment_dir) + '/checkpoints/best_model.pth')
        start_epoch = checkpoint['epoch']
        classifier.load_state_dict(checkpoint['model_state_dict'])
        log_string('Use pretrain model')
    except:
        log_string('No existing model, starting training from scratch...')
        start_epoch = 0
        # custom weight init only applies to fresh (non-resumed) models
        if args.init:
            init_func = partial(weight_init, init_type=args.init)
            classifier = classifier.apply(init_func)
    '''OPTIMIZER'''
    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(
            classifier.parameters(),
            lr=args.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=args.decay_rate)
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(
            classifier.parameters(),
            lr=args.learning_rate,
            momentum=0.9)
    '''LR SCHEDULER'''
    if args.scheduler == 'step':
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.decay_step, gamma=0.7)
    else:
        raise Exception('No Such Scheduler')
    global_epoch = 0
    global_step = 0
    best_sing_acc = 0.0
    best_vote_acc = 0.0
    loader_len = len(trainDataLoader)
    '''TRANING'''
    logger.info('Start training...')
    for epoch in range(start_epoch, args.epoch):
        log_string('Epoch %d (%d/%s):' % (global_epoch + 1, epoch + 1, args.epoch))
        train_loss = []
        train_correct = 0
        # NOTE(review): scheduler.step() at epoch start decays the LR before any
        # optimizer.step() of the epoch; recent PyTorch recommends stepping after.
        scheduler.step()
        for batch_id, data in enumerate(trainDataLoader):
            '''INPUT'''
            points, target = data
            points, target = points.cuda(), target.cuda()
            '''PREPROCESS'''
            points = sample(args.num_point, points)
            points = transform_point_cloud(points, args, aug_args)
            '''FORWARD'''
            optimizer.zero_grad()
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            classifier = classifier.train()
            pred = classifier(points)
            loss = criterion(pred, target.long())
            pred_choice = pred.data.max(1)[1]
            correct = pred_choice.eq(target.long().data).cpu().sum()
            train_correct += correct
            train_loss.append(loss.item())
            '''BACKWARD'''
            loss.backward()
            optimizer.step()
            global_step += 1
            if batch_id % 80 == 0:
                print('Epoch: [{0}][{1}/{2}] lr {lr:.6f} loss {loss:.4f}'.
                      format(epoch, batch_id, len(trainDataLoader), lr=lr, loss=loss.item()))
        # drop_last=True makes loader_len * batch_size the exact sample count
        train_instance_acc = train_correct.item() / (loader_len * args.batch_size)
        train_mean_loss = np.mean(train_loss)
        log_string('Train Instance Accuracy: %.2f, Loss: %f' % (train_instance_acc * 100, train_mean_loss))
        # validate (and possibly checkpoint) only after min_val epochs
        if epoch >= args.min_val:
            with torch.no_grad():
                sing_acc, vote_acc = test(classifier.eval(), testDataLoader, num_point=args.num_point,
                                          total_num=len(TEST_DATASET))
                if sing_acc >= best_sing_acc:
                    best_sing_acc = sing_acc
                if vote_acc >= best_vote_acc:
                    best_vote_acc = vote_acc
                    best_epoch = epoch + 1
                log_string('Test Single Accuracy: %.2f' % (sing_acc * 100))
                log_string('Best Single Accuracy: %.2f' % (best_sing_acc * 100))
                log_string('Test Vote Accuracy: %.2f' % (vote_acc * 100))
                log_string('Best Vote Accuracy: %.2f' % (best_vote_acc * 100))
                # save whenever this epoch ties/beats the best voting accuracy
                if vote_acc >= best_vote_acc:
                    logger.info('Save model...')
                    savepath = str(checkpoints_dir) + '/best_model.pth'
                    log_string('Saving at %s' % savepath)
                    state = {
                        'epoch': best_epoch,
                        'vote_acc': vote_acc,
                        'model_state_dict': classifier.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                    }
                    torch.save(state, savepath)
        global_epoch += 1
    logger.info('End of training...')
if __name__ == '__main__':
    # parse CLI arguments and launch training
    args = parse_args()
    main(args)
================================================
FILE: classification/util/__init__.py
================================================
================================================
FILE: classification/util/utils.py
================================================
import importlib
import argparse
import random
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
def set_seed(seed):
    """
    Setting of Global Seed.
    Seeds the python, numpy and torch (CPU and all-GPU) RNGs and configures cuDNN.
    """
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True  # consistent results on the cpu and gpu
    # NOTE(review): benchmark=True lets cuDNN auto-tune algorithms at runtime,
    # which can undermine the determinism requested above — confirm intended.
    torch.backends.cudnn.benchmark = True
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # cpu
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # gpu
def weight_init(m, init_type):
    """
    Per-module weight initializer, intended for use via
    ``model.apply(partial(weight_init, init_type=...))``.
    :param m: module to (possibly) initialize
    :param init_type: 'xavier' or 'kaiming'; anything else raises
    """
    if init_type == 'xavier':
        initializer = torch.nn.init.xavier_normal_
    elif init_type == 'kaiming':
        initializer = torch.nn.init.kaiming_normal_
    else:
        raise Exception('No such init type')
    if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.Conv1d)):
        initializer(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, (torch.nn.BatchNorm2d, torch.nn.BatchNorm1d)):
        # batch-norm starts as identity: weight 1, bias 0
        torch.nn.init.constant_(m.weight, 1)  # constant
        # torch.nn.init.normal_(m.weight, 1.0, 0.02)  # normal
        torch.nn.init.constant_(m.bias, 0)
class ClsLoss(nn.Module):
    """Plain negative log-likelihood classification loss.
    Expects log-probabilities (e.g. log-softmax output) and integer class targets.
    """
    def __init__(self):
        super(ClsLoss, self).__init__()
    def forward(self, pred, target):
        # mean NLL over the batch
        return F.nll_loss(pred, target)
class SmoothClsLoss(nn.Module):
    """
    Label-smoothed classification loss on log-probability inputs.
    The true class receives weight (1 - eps) and the remaining eps mass is
    spread uniformly over the other (n_class - 1) classes.
    """
    def __init__(self, smoothing_ratio=0.1):
        super(SmoothClsLoss, self).__init__()
        self.smoothing_ratio = smoothing_ratio
    def forward(self, pred, target):
        eps = self.smoothing_ratio
        n_class = pred.size(1)
        # smoothed one-hot target distribution
        smoothed = torch.full_like(pred, eps / (n_class - 1))
        smoothed.scatter_(1, target.view(-1, 1), 1 - eps)
        # cross-entropy against log-probabilities (pred is assumed log-softmaxed)
        return -(smoothed * pred).sum(dim=1).mean()
def get_model(args):
    """Dynamically import models.<args.model> and construct its Model(args)."""
    module = importlib.import_module('models.%s' % args.model)
    return module.Model(args)
def get_loss():
    """Return the label-smoothed classification loss used for training."""
    return SmoothClsLoss()
def get_test_args():
    """Return an empty argparse.Namespace placeholder for test-time arguments."""
    return argparse.Namespace()
================================================
FILE: segmentation/README.md
================================================
# RepSurf for Segmentation
By *[Haoxi Ran\*](https://hancyran.github.io/) , Jun Liu, Chengjie Wang* ( * : corresponding contact)
### [PDF](https://openaccess.thecvf.com/content/CVPR2022/papers/Ran_Surface_Representation_for_Point_Clouds_CVPR_2022_paper.pdf) | [arXiv](http://arxiv.org/abs/2205.05740)
## Preparation
### Environment
We tested under the environment:
* python 3.7
* pytorch 1.6.0 / 1.8.0
* cuda 10.1 / 11.1
* gcc 7.2.0
* h5py
* sharedarray
* tensorboardx
For anaconda user, initialize the conda environment **repsurf-seg** by:
```
sh init.sh
```
## Experiments
### S3DIS Area-5 (Data & Logs: [Google Drive](https://drive.google.com/drive/folders/1jIZuy4RPFJ4YHAE8ScVQgwtBwNGgfKnv?usp=sharing))
* Performance using the same settings:
**Note**:
1. The performance figures (mIoU/mAcc/OA) come from the final predictions on the whole scenes of S3DIS Area-5, while the results reported during training are computed on sub-sampled scenes for fast validation.
2. The training time of all above implementations is estimated on four NVIDIA RTX 3090. The time in the logs contains both training and validating time.
3. To speed up the training process, we apply Sectorized FPS (in the first stage) for all above methods. It can save 30~40% training time and does not affect the performance.
4. To lessen the instability from grid sampling during inference, we apply median filtering to all the above implementations. Besides, it can slightly improve the results (~0.4 mIoU).
* To download the dataset (first install gdown via **pip install gdown**):
```
cd ./data/S3DIS
gdown https://drive.google.com/u/1/uc?id=1UDM-bjrtqoIR9FWoIRyqLUJGyKEs22fP
tar zxf s3dis.tar.gz && rm s3dis.tar.gz && cd -
```
* To train one model (**Umbrella RepSurf, Point Transformer, PointNet2**) for S3DIS Area-5:
```
sh scripts/s3dis/train_[MODEL].sh # MODEL: repsurf_umb, pointnet2, pointtransformer
```
* To test one model (**Our Umbrella RepSurf, Point Transformer, PointNet2**) for S3DIS Area-5 on whole scenes:
```
sh scripts/s3dis/test_[MODEL].sh # MODEL: repsurf_umb, pointnet2, pointtransformer
```
## Acknowledgment
We thank the [Point Transformer Implementation](https://github.com/POSTECH-CVLab/point-transformer) for the library pointops.
## License
RepSurf is under the Apache-2.0 license. Please contact the primary author **Haoxi Ran (ranhaoxi@gmail.com)** for
commercial use.
================================================
FILE: segmentation/dataset/S3DISDataLoader.py
================================================
"""
Author: Haoxi Ran
Date: 06/30/2022
"""
import os
import numpy as np
import SharedArray as SA
from torch.utils.data import Dataset
from util.data_util import sa_create, data_prepare
NUM_CLASS = 13
class S3DIS(Dataset):
    """
    S3DIS indoor-scene segmentation dataset backed by shared memory.
    Each scene (.npy of shape N x 7: xyz, rgb, label) is loaded once into
    /dev/shm via SharedArray so all dataloader workers share a single copy.
    """
    def __init__(self, args, split, coord_transform=None, rgb_transform=None,
                 rgb_mean=None, rgb_std=None, shuffle_index=False):
        super().__init__()
        self.args, self.split, self.coord_transform, self.rgb_transform, self.rgb_mean, self.rgb_std, self.shuffle_index = \
            args, split, coord_transform, rgb_transform, rgb_mean, rgb_std, shuffle_index
        self.stop_aug = False
        data_list = sorted(os.listdir(args.data_dir))
        data_list = [item[:-4] for item in data_list if 'Area_' in item]
        # hold out the configured test area; train on the rest
        if split == 'train':
            self.data_list = [item for item in data_list if not 'Area_{}'.format(args.test_area) in item]
        else:
            self.data_list = [item for item in data_list if 'Area_{}'.format(args.test_area) in item]
        self.data_idx = np.arange(len(self.data_list))
        # populate shared memory once per scene (xyzrgbl, N*7)
        for item in self.data_list:
            if not os.path.exists("/dev/shm/s3dis_{}".format(item)):
                data_path = os.path.join(args.data_dir, item + '.npy')
                data = np.load(data_path).astype(np.float32)
                sa_create("shm://s3dis_{}".format(item), data)
    def __getitem__(self, idx):
        """Return (coord, feat, label) for one (possibly looped) scene index."""
        # idx may exceed len(data_list) because __len__ applies the loop multiplier
        data_idx = self.data_idx[idx % len(self.data_idx)]
        data = SA.attach("shm://s3dis_{}".format(self.data_list[data_idx])).copy()
        coord, feat, label = data[:, 0:3], data[:, 3:6], data[:, 6]
        coord, feat, label = \
            data_prepare(coord, feat, label, self.args, self.split, self.coord_transform, self.rgb_transform,
                         self.rgb_mean, self.rgb_std, self.shuffle_index, self.stop_aug)
        return coord, feat, label
    def __len__(self):
        return len(self.data_idx) * self.args.loop
    @staticmethod
    def print_weight(data_root, data_list):
        """Print per-class median-frequency balancing weights for the given scenes."""
        print('Computing label weight...')
        num_point_list = []
        label_freq = np.zeros(NUM_CLASS)
        label_total = np.zeros(NUM_CLASS)
        # load data
        for idx, item in enumerate(data_list):
            data_path = os.path.join(data_root, item + '.npy')
            data = np.load(data_path)
            labels = data[:, 6]
            freq = np.histogram(labels, range(NUM_CLASS + 1))[0]
            label_freq += freq
            # np.float was removed in NumPy 1.24 — use the builtin float dtype
            label_total += (freq > 0).astype(float) * labels.size
            num_point_list.append(labels.size)
        # label weight: median frequency / per-class frequency
        label_freq = label_freq / label_total
        label_weight = np.median(label_freq) / label_freq
        print(label_weight)
    @staticmethod
    def print_mean_std(data_root, data_list):
        """Print the RGB channel mean/std (in [0, 1]) over the given scenes."""
        print('Computing color mean & std...')
        point_list = []
        for idx, item in enumerate(data_list):
            data_path = os.path.join(data_root, item + '.npy')
            data = np.load(data_path)
            point_list.append(data[:, 3:6])
        points = np.vstack(point_list) / 255.
        mean = np.mean(points, 0)
        std = np.std(points, 0)
        print(f'mean: {mean}, std:{std}')
================================================
FILE: segmentation/dataset/__init__.py
================================================
================================================
FILE: segmentation/init.sh
================================================
#!/bin/sh
# Environment setup for the segmentation experiments:
# - create the log/data directory skeleton for S3DIS and ScanNet
# - create the repsurf-seg conda env (python 3.7) with PyTorch 1.8.0 + CUDA 11.1
# - install h5py/pyyaml/sharedarray/tensorboardx
# - build and install the pointops CUDA extension
mkdir -p log/PointAnalysis/log/S3DIS
mkdir -p log/PointAnalysis/log/ScanNet
mkdir -p data/S3DIS
mkdir -p data/ScanNet
conda create -n repsurf-seg python=3.7 -y
conda activate repsurf-seg
# alternative: PyTorch 1.6.0 + CUDA 10.1 via conda
#conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 -c pytorch -c conda-forge -y
pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
conda install -c anaconda h5py pyyaml -y
conda install -c conda-forge sharedarray tensorboardx -y
cd modules/pointops
python3 setup.py install
cd -
================================================
FILE: segmentation/models/__init__.py
================================================
================================================
FILE: segmentation/models/pointnet2/__init__.py
================================================
================================================
FILE: segmentation/models/pointnet2/pointnet2_ssg.py
================================================
"""
Author: Haoxi Ran
Date: 06/30/2022
"""
import torch
import torch.nn as nn
from modules.pointnet2_utils import PointNetSetAbstraction, PointNetFeaturePropagation
class Model(nn.Module):
    """
    PointNet++ (single-scale grouping) semantic segmentation model.
    Encoder: four set-abstraction stages; decoder: four feature-propagation
    stages with skip connections; head: per-point MLP classifier.
    """
    def __init__(self, args):
        super(Model, self).__init__()
        # encoder (input features: 6 channels + 3 xyz prepended in forward)
        self.sa1 = PointNetSetAbstraction(4, 32, 6 + 3, [32, 32, 64], num_sector=4)
        self.sa2 = PointNetSetAbstraction(4, 32, 64 + 3, [64, 64, 128])
        self.sa3 = PointNetSetAbstraction(4, 32, 128 + 3, [128, 128, 256])
        self.sa4 = PointNetSetAbstraction(4, 32, 256 + 3, [256, 256, 512])
        # decoder (channel counts match concatenated skip features)
        self.fp4 = PointNetFeaturePropagation(768, [256, 256])
        self.fp3 = PointNetFeaturePropagation(384, [256, 256])
        self.fp2 = PointNetFeaturePropagation(320, [256, 128])
        self.fp1 = PointNetFeaturePropagation(128, [128, 128, 128])
        self.classifier = nn.Sequential(
            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(128, args.num_class),
        )
    def forward(self, pos_feat_off0):
        """
        :param pos_feat_off0: 3-element list indexed as [positions, features,
            offsets] (judging by the name — confirm against the pipeline);
            mutated in place.
        :return: per-point class logits
        """
        # prepend xyz to the input features
        pos_feat_off0[1] = torch.cat([pos_feat_off0[0], pos_feat_off0[1]], 1)
        pos_feat_off1 = self.sa1(pos_feat_off0)
        pos_feat_off2 = self.sa2(pos_feat_off1)
        pos_feat_off3 = self.sa3(pos_feat_off2)
        pos_feat_off4 = self.sa4(pos_feat_off3)
        # top-down feature propagation with skip connections
        pos_feat_off3[1] = self.fp4(pos_feat_off3, pos_feat_off4)
        pos_feat_off2[1] = self.fp3(pos_feat_off2, pos_feat_off3)
        pos_feat_off1[1] = self.fp2(pos_feat_off1, pos_feat_off2)
        pos_feat_off0[1] = self.fp1([pos_feat_off0[0], None, pos_feat_off0[2]], pos_feat_off1)
        feature = self.classifier(pos_feat_off0[1])
        return feature
================================================
FILE: segmentation/models/pointtransformer/__init__.py
================================================
================================================
FILE: segmentation/models/pointtransformer/pointtransformer.py
================================================
import torch
import torch.nn as nn
from modules.pointtransformer_utils import PointTransformerBlock, TransitionDown, TransitionUp
class Model(nn.Module):
    """Point Transformer segmentation network (5-stage encoder/decoder).

    Operates on packed batches: p (n, 3) coords, x (n, c) feats, o (b) offsets.
    """

    def __init__(self, args):
        super().__init__()
        block = PointTransformerBlock
        num_block = [2, 3, 4, 6, 3]  # transformer blocks per encoder stage
        self.in_c = args.in_channel
        self.in_planes, planes = self.in_c, [32, 64, 128, 256, 512]
        fpn_planes, fpnhead_planes, share_planes = 128, 64, 8
        stride, nsample = [1, 4, 4, 4, 4], [16, 16, 16, 16, 16]
        # NOTE: _make_enc/_make_dec mutate self.in_planes, so the call order
        # below (enc1..enc5, then dec5..dec1) is load-bearing.
        self.enc1 = self._make_enc(block, planes[0], num_block[0], share_planes, stride=stride[0],
                                   nsample=nsample[0])  # N/1
        self.enc2 = self._make_enc(block, planes[1], num_block[1], share_planes, stride=stride[1],
                                   nsample=nsample[1], num_sector=4)  # N/4
        self.enc3 = self._make_enc(block, planes[2], num_block[2], share_planes, stride=stride[2],
                                   nsample=nsample[2])  # N/16
        self.enc4 = self._make_enc(block, planes[3], num_block[3], share_planes, stride=stride[3],
                                   nsample=nsample[3])  # N/64
        self.enc5 = self._make_enc(block, planes[4], num_block[4], share_planes, stride=stride[4],
                                   nsample=nsample[4])  # N/256
        self.dec5 = self._make_dec(block, planes[4], 2, share_planes, nsample=nsample[4], is_head=True)  # transform p5
        self.dec4 = self._make_dec(block, planes[3], 2, share_planes, nsample=nsample[3])  # fusion p5 and p4
        self.dec3 = self._make_dec(block, planes[2], 2, share_planes, nsample=nsample[2])  # fusion p4 and p3
        self.dec2 = self._make_dec(block, planes[1], 2, share_planes, nsample=nsample[1])  # fusion p3 and p2
        self.dec1 = self._make_dec(block, planes[0], 2, share_planes, nsample=nsample[0])  # fusion p2 and p1
        self.cls = nn.Sequential(nn.Linear(planes[0], planes[0]), nn.BatchNorm1d(planes[0]), nn.ReLU(inplace=True),
                                 nn.Linear(planes[0], args.num_class))

    def _make_enc(self, block, planes, blocks, share_planes=8, stride=1, nsample=16, num_sector=1):
        # One TransitionDown (downsampling) followed by `blocks - 1` transformer blocks.
        layers = [TransitionDown(self.in_planes, planes * block.expansion, stride, nsample, num_sector)]
        self.in_planes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_planes, self.in_planes, share_planes, nsample=nsample))
        return nn.Sequential(*layers)

    def _make_dec(self, block, planes, blocks, share_planes=8, nsample=16, is_head=False):
        # One TransitionUp (upsampling/fusion) followed by `blocks - 1` transformer blocks.
        layers = [TransitionUp(self.in_planes, None if is_head else planes * block.expansion)]
        self.in_planes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_planes, self.in_planes, share_planes, nsample=nsample))
        return nn.Sequential(*layers)

    def forward(self, pxo, *args):
        p0, x0, o0 = pxo  # (n, 3), (n, c), (b)
        # When in_channel is 3 the coordinates themselves are the features.
        x0 = p0 if self.in_c == 3 else torch.cat((p0, x0), 1)
        p1, x1, o1 = self.enc1([p0, x0, o0])
        p2, x2, o2 = self.enc2([p1, x1, o1])
        p3, x3, o3 = self.enc3([p2, x2, o2])
        p4, x4, o4 = self.enc4([p3, x3, o3])
        p5, x5, o5 = self.enc5([p4, x4, o4])
        # dec*[0] is the TransitionUp (fuses the coarser stage's features);
        # dec*[1:] are the refinement blocks, applied via Sequential slicing.
        x5 = self.dec5[1:]([p5, self.dec5[0]([p5, x5, o5]), o5])[1]
        x4 = self.dec4[1:]([p4, self.dec4[0]([p4, x4, o4], [p5, x5, o5]), o4])[1]
        x3 = self.dec3[1:]([p3, self.dec3[0]([p3, x3, o3], [p4, x4, o4]), o3])[1]
        x2 = self.dec2[1:]([p2, self.dec2[0]([p2, x2, o2], [p3, x3, o3]), o2])[1]
        x1 = self.dec1[1:]([p1, self.dec1[0]([p1, x1, o1], [p2, x2, o2]), o1])[1]
        x = self.cls(x1)
        return x
================================================
FILE: segmentation/models/repsurf/__init__.py
================================================
================================================
FILE: segmentation/models/repsurf/repsurf_umb_ssg.py
================================================
"""
Author: Haoxi Ran
Date: 06/30/2022
"""
import torch
import torch.nn as nn
from modules.repsurface_utils import UmbrellaSurfaceConstructor, SurfaceAbstractionCD, SurfaceFeaturePropagationCD
class Model(nn.Module):
    """RepSurf (umbrella) SSG segmentation network.

    A PointNet++-style encoder/decoder whose abstraction stages additionally
    consume representative-surface features computed once from the raw
    coordinates by UmbrellaSurfaceConstructor.
    """

    def __init__(self, args):
        super(Model, self).__init__()
        # Centers are 3-D, or 6-D when their polar form is appended.
        center_channel = 6 if args.return_polar else 3
        repsurf_in_channel = 10
        repsurf_out_channel = 10
        self.sa1 = SurfaceAbstractionCD(4, 32, args.in_channel + repsurf_out_channel, center_channel, [32, 32, 64],
                                        True, args.return_polar, num_sector=4)
        self.sa2 = SurfaceAbstractionCD(4, 32, 64 + repsurf_out_channel, center_channel, [64, 64, 128],
                                        True, args.return_polar)
        self.sa3 = SurfaceAbstractionCD(4, 32, 128 + repsurf_out_channel, center_channel, [128, 128, 256],
                                        True, args.return_polar)
        self.sa4 = SurfaceAbstractionCD(4, 32, 256 + repsurf_out_channel, center_channel, [256, 256, 512],
                                        True, args.return_polar)
        self.fp4 = SurfaceFeaturePropagationCD(512, 256, [256, 256])
        self.fp3 = SurfaceFeaturePropagationCD(256, 128, [256, 256])
        self.fp2 = SurfaceFeaturePropagationCD(256, 64, [256, 128])
        self.fp1 = SurfaceFeaturePropagationCD(128, None, [128, 128, 128])
        self.classifier = nn.Sequential(
            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(128, args.num_class),
        )
        # Builds the repsurf features from raw coordinates + batch offsets.
        self.surface_constructor = UmbrellaSurfaceConstructor(args.group_size + 1, repsurf_in_channel, repsurf_out_channel)

    def forward(self, pos_feat_off0):
        # Pack [pos, repsurf, xyz+feat, offset] for the encoder stages.
        pos_nor_feat_off0 = [
            pos_feat_off0[0],
            self.surface_constructor(pos_feat_off0[0], pos_feat_off0[2]),
            torch.cat([pos_feat_off0[0], pos_feat_off0[1]], 1),
            pos_feat_off0[2]
        ]
        pos_nor_feat_off1 = self.sa1(pos_nor_feat_off0)
        pos_nor_feat_off2 = self.sa2(pos_nor_feat_off1)
        pos_nor_feat_off3 = self.sa3(pos_nor_feat_off2)
        pos_nor_feat_off4 = self.sa4(pos_nor_feat_off3)
        # Drop the repsurf entry (index 1) from every stage: the decoder only
        # uses [pos, feat, offset], and the del shifts features down to index 1.
        del pos_nor_feat_off0[1], pos_nor_feat_off1[1], pos_nor_feat_off2[1], pos_nor_feat_off3[1], pos_nor_feat_off4[1]
        pos_nor_feat_off3[1] = self.fp4(pos_nor_feat_off3, pos_nor_feat_off4)
        pos_nor_feat_off2[1] = self.fp3(pos_nor_feat_off2, pos_nor_feat_off3)
        pos_nor_feat_off1[1] = self.fp2(pos_nor_feat_off1, pos_nor_feat_off2)
        pos_nor_feat_off0[1] = self.fp1([pos_nor_feat_off0[0], None, pos_nor_feat_off0[2]], pos_nor_feat_off1)
        feature = self.classifier(pos_nor_feat_off0[1])
        return feature
================================================
FILE: segmentation/modules/__init__.py
================================================
================================================
FILE: segmentation/modules/aug_utils.py
================================================
"""
Author: Haoxi Ran
Date: 06/30/2022
"""
import numpy as np
def transform_point_cloud_coord(args):
    """Build the coordinate-space augmentation pipeline from argparse flags.

    Transforms are applied in order: scale -> rotate -> jitter -> flip -> shift.
    Returns a Compose, or None when no coordinate augmentation is enabled.
    """
    aug_args = args.aug_args
    transforms = []
    if args.aug_scale:
        transforms.append(
            RandomScale(aug_args['scale_factor'], aug_args['scale_ani'], aug_args['scale_prob']))
    if args.aug_rotate:
        # map the rotate-mode string to a (lazily constructed) transform
        rotate_by_mode = {
            'pert': lambda: RandomRotatePerturb(
                aug_args['pert_factor'], 3 * aug_args['pert_factor'], aug_args['pert_prob']),
            'pert_z': lambda: RandomRotatePerturbAligned(
                aug_args['pert_factor'], 3 * aug_args['pert_factor'], aug_args['pert_prob']),
            'rot': lambda: RandomRotate(prob=aug_args['rot_prob']),
            'rot_z': lambda: RandomRotateAligned(prob=aug_args['rot_prob']),
        }
        if args.aug_rotate in rotate_by_mode:
            transforms.append(rotate_by_mode[args.aug_rotate]())
    if args.aug_jitter:
        transforms.append(
            RandomJitter(aug_args['jitter_factor'], 5 * aug_args['jitter_factor'], aug_args['jitter_prob'], args.lidar))
    if args.aug_flip:
        transforms.append(RandomFlip())
    if args.aug_shift:
        transforms.append(RandomShift(aug_args['shifts'], aug_args['shift_prob']))
    return Compose(transforms) if len(transforms) > 0 else None
def transform_point_cloud_rgb(args):
    """Build the color-space augmentation pipeline from argparse flags.

    Returns a Compose of the enabled color transforms (contrast -> shift ->
    jitter -> hue/saturation -> drop), or None when none is enabled.
    """
    enabled = [
        (args.color_contrast, ChromaticAutoContrast),
        (args.color_shift, ChromaticTranslation),
        (args.color_jitter, ChromaticJitter),
        (args.hs_shift, HueSaturationTranslation),
        (args.color_drop, RandomDropColor),
    ]
    transforms = [transform_cls() for flag, transform_cls in enabled if flag]
    return Compose(transforms) if transforms else None
class Compose(object):
    """Chain point-cloud transforms.

    Each transform is called as t(coord, feat, label, mask) and must return
    the (possibly modified) (coord, feat, label) triple.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, coord, feat, label, mask=None):
        result = (coord, feat, label)
        for transform in self.transforms:
            result = transform(*result, mask)
        return result
class RandomRotate(object):
    """Randomly rotate the cloud: small x/y tilts plus a larger z rotation."""

    def __init__(self, rot=(np.pi/24, np.pi/24, np.pi/4), prob=1.):
        self.rot = rot  # per-axis max absolute angle (x, y, z)
        self.prob = prob

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.prob:
            # draw the angles in x, y, z order (keeps the RNG sequence stable)
            ax, ay, az = (np.random.uniform(-r, r) for r in self.rot)
            cx, sx = np.cos(ax), np.sin(ax)
            cy, sy = np.cos(ay), np.sin(ay)
            cz, sz = np.cos(az), np.sin(az)
            rot_x = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
            rot_y = np.array([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]])
            rot_z = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
            # combined rotation R = Rz @ Ry @ Rx, applied as coord @ R.T
            coord = coord @ (rot_z @ rot_y @ rot_x).T
        return coord, feat, label
class RandomRotateAligned(object):
    """Randomly rotate the cloud around the (gravity-aligned) z axis only."""

    def __init__(self, rot=np.pi, prob=1.):
        self.rot = rot  # max absolute z-rotation angle
        self.prob = prob

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() >= self.prob:
            return coord, feat, label
        theta = np.random.uniform(-self.rot, self.rot)
        c, s = np.cos(theta), np.sin(theta)
        # applied as coord @ R (not R.T), exactly as the original code did
        coord = coord @ np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
        return coord, feat, label
class RandomRotatePerturb(object):
    """Small random rotation perturbation around all three axes.

    Angles are N(0, sigma) draws clipped to [-clip, clip].
    """

    def __init__(self, sigma=0.03, clip=0.09, prob=1.):
        self.sigma = sigma
        self.clip = clip
        self.prob = prob

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.prob:
            # draw the angles in x, y, z order (keeps the RNG sequence stable)
            ax, ay, az = (np.clip(np.random.normal() * self.sigma, -self.clip, self.clip) for _ in range(3))
            cx, sx = np.cos(ax), np.sin(ax)
            cy, sy = np.cos(ay), np.sin(ay)
            cz, sz = np.cos(az), np.sin(az)
            rot_x = np.array([[1, 0, 0], [0, cx, -sx], [0, sx, cx]])
            rot_y = np.array([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]])
            rot_z = np.array([[cz, -sz, 0], [sz, cz, 0], [0, 0, 1]])
            coord = coord @ (rot_z @ rot_y @ rot_x).T
        return coord, feat, label
class RandomRotatePerturbAligned(object):
    """Small random rotation perturbation around the z axis only.

    The angle is a N(0, sigma) draw clipped to [-clip, clip].
    """

    def __init__(self, sigma=0.03, clip=0.09, prob=1.):
        self.sigma = sigma
        self.clip = clip
        self.prob = prob

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.prob:
            theta = np.clip(np.random.normal() * self.sigma, -self.clip, self.clip)
            c, s = np.cos(theta), np.sin(theta)
            # applied as coord @ R (not R.T), exactly as the original code did
            coord = coord @ np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
        return coord, feat, label
class RandomScale(object):
    """Randomly rescale the cloud by a factor in [1 - scale, 1 + scale].

    Anisotropic mode draws one factor per axis; otherwise a single factor.
    Coordinates are scaled in place.
    """

    def __init__(self, scale=0.1, anisotropic=False, prob=1.):
        self.scale = scale
        self.anisotropic = anisotropic
        self.prob = prob

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.prob:
            n_factors = 3 if self.anisotropic else 1
            factor = np.random.uniform(1 - self.scale, 1 + self.scale, n_factors)
            coord *= factor
        return coord, feat, label
class RandomShift(object):
    """Randomly translate the cloud by per-axis offsets drawn from +-shift."""

    def __init__(self, shift=(0.2, 0.2, 0), p=0.95):
        self.shift = shift  # max absolute offset per axis (x, y, z)
        self.p = p

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            # draw x, y, z offsets in order (keeps the RNG sequence stable)
            offsets = [np.random.uniform(-s, s) for s in self.shift]
            coord += offsets
        return coord, feat, label
class RandomFlip(object):
    """Randomly mirror the cloud across the x and/or y axis (each with p=0.5)."""

    def __init__(self, p=1.):
        self.p = p

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            # independently flip x (axis 0) then y (axis 1), in place
            for axis in (0, 1):
                if np.random.rand() < 0.5:
                    coord[:, axis] = -coord[:, axis]
        return coord, feat, label
class RandomJitter(object):
    """Add clipped Gaussian noise to each coordinate (z damped for LiDAR)."""

    def __init__(self, sigma=0.01, clip=0.05, p=1., is_lidar=False):
        self.sigma = sigma
        self.clip = clip
        self.p = p
        self.is_lidar = is_lidar

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            assert (self.clip > 0)
            noise = np.random.randn(coord.shape[0], 3) * self.sigma
            noise = np.clip(noise, -self.clip, self.clip)
            if self.is_lidar:
                noise[:, 2] *= 0.1  # re-scale z-axis jitter
            coord += noise
        return coord, feat, label
class ChromaticAutoContrast(object):
    """Blend RGB toward a per-scene auto-contrast stretch of the colors.

    blend_factor of None draws a fresh random blend each application.
    """

    def __init__(self, p=0.2, blend_factor=None):
        self.p = p
        self.blend_factor = blend_factor

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            selected = feat[mask] if mask is not None else feat
            lo = np.min(selected, 0, keepdims=True)
            hi = np.max(selected, 0, keepdims=True)
            # stretch the colors to span the full [0, 255] range
            stretched = (selected[:, :3] - lo) * (255 / (hi - lo))
            blend = self.blend_factor if self.blend_factor is not None else np.random.rand()
            selected[:, :3] = (1 - blend) * selected[:, :3] + blend * stretched
            # feat[mask] is a copy, so masked edits must be written back
            if mask is not None:
                feat[mask] = selected
            else:
                feat = selected
        return coord, feat, label
class ChromaticTranslation(object):
    """Shift all RGB values by one random per-channel color offset."""

    def __init__(self, p=0.95, ratio=0.05):
        self.p = p
        self.ratio = ratio  # max offset as a fraction of the 255 color range

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            offset = (np.random.rand(1, feat.shape[1]) - 0.5) * 255 * 2 * self.ratio
            feat[:, :3] = np.clip(feat[:, :3] + offset, 0, 255)
            if mask is not None:
                # zero out the color of unmasked points
                feat[:, :3][~mask] = 0.
        return coord, feat, label
class ChromaticJitter(object):
    """Add independent Gaussian noise to each point's RGB channels.

    Args:
        p: probability of applying the jitter.
        std: noise standard deviation, as a fraction of the 255 color range.
    """

    def __init__(self, p=0.95, std=0.005):
        self.p = p
        self.std = std

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            # Draw noise for the 3 RGB channels only. The previous code drew
            # noise with feat's full shape, which raised a broadcast error
            # whenever feat carried extra (non-color) channels; for plain
            # (N, 3) color features the draws are identical.
            noise = np.random.randn(feat.shape[0], 3)
            noise *= self.std * 255
            feat[:, :3] = np.clip(noise + feat[:, :3], 0, 255)
            if mask is not None:
                # zero out the color of unmasked points
                feat[:, :3][~mask] = 0.
        return coord, feat, label
class HueSaturationTranslation(object):
    """Randomly shift hue and scale saturation of the RGB channels."""

    @staticmethod
    def rgb_to_hsv(rgb):
        # Translated from source of colorsys.rgb_to_hsv
        # r,g,b should be a numpy arrays with values between 0 and 255
        # rgb_to_hsv returns an array of floats between 0.0 and 1.0.
        rgb = rgb.astype('float')
        hsv = np.zeros_like(rgb)
        # in case an RGBA array was passed, just copy the A channel
        hsv[..., 3:] = rgb[..., 3:]
        r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
        maxc = np.max(rgb[..., :3], axis=-1)
        minc = np.min(rgb[..., :3], axis=-1)
        hsv[..., 2] = maxc
        # saturation/hue are only defined where the color is not pure gray
        mask = maxc != minc
        hsv[mask, 1] = (maxc - minc)[mask] / maxc[mask]
        rc = np.zeros_like(r)
        gc = np.zeros_like(g)
        bc = np.zeros_like(b)
        rc[mask] = (maxc - r)[mask] / (maxc - minc)[mask]
        gc[mask] = (maxc - g)[mask] / (maxc - minc)[mask]
        bc[mask] = (maxc - b)[mask] / (maxc - minc)[mask]
        # hue sextant depends on which channel holds the maximum
        hsv[..., 0] = np.select([r == maxc, g == maxc], [bc - gc, 2.0 + rc - bc], default=4.0 + gc - rc)
        hsv[..., 0] = (hsv[..., 0] / 6.0) % 1.0
        return hsv

    @staticmethod
    def hsv_to_rgb(hsv):
        # Translated from source of colorsys.hsv_to_rgb
        # h,s should be a numpy arrays with values between 0.0 and 1.0
        # v should be a numpy array with values between 0.0 and 255.0
        # hsv_to_rgb returns an array of uints between 0 and 255.
        rgb = np.empty_like(hsv)
        rgb[..., 3:] = hsv[..., 3:]
        h, s, v = hsv[..., 0], hsv[..., 1], hsv[..., 2]
        i = (h * 6.0).astype('uint8')
        f = (h * 6.0) - i
        p = v * (1.0 - s)
        q = v * (1.0 - s * f)
        t = v * (1.0 - s * (1.0 - f))
        i = i % 6
        # the i == 0 sextant is handled by each np.select's default below
        conditions = [s == 0.0, i == 1, i == 2, i == 3, i == 4, i == 5]
        rgb[..., 0] = np.select(conditions, [v, q, p, p, t, v], default=v)
        rgb[..., 1] = np.select(conditions, [v, v, v, q, p, p], default=t)
        rgb[..., 2] = np.select(conditions, [v, p, t, v, v, q], default=p)
        return rgb.astype('uint8')

    def __init__(self, hue_max=0.5, saturation_max=0.2, p=1.):
        # hue_max: max absolute hue shift (hue lives in [0, 1))
        # saturation_max: max relative saturation change
        # p: probability of applying the transform
        self.hue_max = hue_max
        self.saturation_max = saturation_max
        self.p = p

    def __call__(self, coord, feat, label, mask=None):
        if np.random.rand() < self.p:
            # Assume feat[:, :3] is rgb
            tmp_feat = feat[mask] if mask is not None else feat
            hsv = HueSaturationTranslation.rgb_to_hsv(tmp_feat[:, :3])
            hue_val = (np.random.rand() - 0.5) * 2 * self.hue_max
            sat_ratio = 1 + (np.random.rand() - 0.5) * 2 * self.saturation_max
            # shift hue (wrapping around 1) and rescale saturation
            hsv[..., 0] = np.remainder(hue_val + hsv[..., 0] + 1, 1)
            hsv[..., 1] = np.clip(sat_ratio * hsv[..., 1], 0, 1)
            tmp_feat[:, :3] = np.clip(HueSaturationTranslation.hsv_to_rgb(hsv), 0, 255)
            # feat[mask] is a copy, so masked edits must be written back
            if mask is not None:
                feat[mask] = tmp_feat
            else:
                feat = tmp_feat
        return coord, feat, label
class RandomDropColor(object):
    """With probability p, zero out all RGB channels (color dropout)."""

    def __init__(self, p=0.2):
        self.p = p

    def __call__(self, coord, feat, label, mask=None):
        drop = np.random.rand() < self.p
        if drop:
            feat[:, :3] = 0  # in-place: callers see the dropped colors
        return coord, feat, label
================================================
FILE: segmentation/modules/pointnet2_utils.py
================================================
"""
Author: Haoxi Ran
Date: 06/30/2022
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from modules.pointops.functions import pointops
def sample_and_group(stride, nsample, xyz, points, offset, return_idx=False, num_sector=1):
    """Downsample a packed batch by `stride` and group nsample-NN neighborhoods.

    Args (packed-batch layout):
        stride: downsampling ratio; 1 keeps every point.
        nsample: neighbors grouped per sampled point.
        xyz: (N, 3) coordinates; points: (N, C) features or None.
        offset: (B,) cumulative per-batch point counts.
        return_idx: also return the (M, nsample) neighbor index tensor.
        num_sector: >1 uses sectorized FPS instead of plain FPS.

    Returns (new_xyz, new_points, new_offset[, group_idx]); new_points holds
    center-relative neighbor coords, with grouped features concatenated when
    points is given and return_idx is False.
    """
    # sample
    if stride > 1:
        # build new_offset by dividing each batch's point count by stride
        new_offset, sample_idx = [offset[0].item() // stride], offset[0].item() // stride
        for i in range(1, offset.shape[0]):
            sample_idx += (offset[i].item() - offset[i - 1].item()) // stride
            new_offset.append(sample_idx)
        new_offset = torch.cuda.IntTensor(new_offset)
        if num_sector > 1:
            fps_idx = pointops.sectorized_fps(xyz, offset, new_offset, num_sector)  # [M]
        else:
            fps_idx = pointops.furthestsampling(xyz, offset, new_offset)  # [M]
        new_xyz = xyz[fps_idx.long(), :]  # [M, 3]
    else:
        new_xyz = xyz
        new_offset = offset
    # group
    N, M = xyz.shape[0], new_xyz.shape[0]
    group_idx, _ = pointops.knnquery(nsample, xyz, new_xyz, offset, new_offset)  # [M, nsample]
    group_xyz = xyz[group_idx.view(-1).long(), :].view(M, nsample, 3)  # [M, nsample, 3]
    # neighbor coordinates expressed relative to their sampled center
    group_xyz_norm = group_xyz - new_xyz.unsqueeze(1)
    if points is not None and not return_idx:
        C = points.shape[1]
        group_points = points[group_idx.view(-1).long(), :].view(M, nsample, C)
        new_points = torch.cat([group_xyz_norm, group_points], dim=-1)  # [M, nsample, 3/6+C]
    else:
        new_points = group_xyz_norm
    if return_idx:
        return new_xyz, new_points, new_offset, group_idx
    else:
        return new_xyz, new_points, new_offset
class PointNetSetAbstraction(nn.Module):
    """
    PointNet2 SA Module
    Downsamples the cloud (FPS with optional sectorization), groups nsample
    neighbors per sampled point, applies a shared Conv1d/BN/ReLU MLP, and
    max-pools each neighborhood into one feature vector.
    """

    def __init__(self, stride, nsample, in_channel, mlp, num_sector=1):
        super(PointNetSetAbstraction, self).__init__()
        self.stride = stride
        self.nsample = nsample
        self.num_sector = num_sector
        self.mlp_convs = nn.ModuleList()
        self.mlp_bns = nn.ModuleList()
        # build the shared MLP from consecutive channel pairs
        channels = [in_channel] + list(mlp)
        for c_in, c_out in zip(channels[:-1], channels[1:]):
            self.mlp_convs.append(nn.Conv1d(c_in, c_out, 1))
            self.mlp_bns.append(nn.BatchNorm1d(c_out))

    def forward(self, pos_feat_off):
        xyz, points, offset = pos_feat_off  # [N, 3], [N, C], [B]
        new_xyz, new_points, new_offset = sample_and_group(
            self.stride, self.nsample, xyz, points, offset, num_sector=self.num_sector)
        # [M, 3+C, nsample] so Conv1d mixes channels per neighborhood slot
        new_points = new_points.transpose(1, 2).contiguous()
        for conv, bn in zip(self.mlp_convs, self.mlp_bns):
            new_points = F.relu(bn(conv(new_points)))
        # max-pool over the nsample neighbors
        new_points = torch.max(new_points, 2)[0]
        return [new_xyz, new_points, new_offset]
class PointNetFeaturePropagation(nn.Module):
    """
    PointNet2 FP Module
    Interpolates coarse features onto a denser point set via inverse-distance
    weighting over the 3 nearest neighbors, concatenates an optional skip
    connection, and refines with a shared Linear/BN/ReLU MLP.
    """

    def __init__(self, in_channel, mlp):
        super(PointNetFeaturePropagation, self).__init__()
        self.mlp_convs = nn.ModuleList()
        self.mlp_bns = nn.ModuleList()
        # build the shared MLP from consecutive channel pairs
        channels = [in_channel] + list(mlp)
        for c_in, c_out in zip(channels[:-1], channels[1:]):
            self.mlp_convs.append(nn.Linear(c_in, c_out))
            self.mlp_bns.append(nn.BatchNorm1d(c_out))

    def forward(self, pos_feat_off1, pos_feat_off2):
        xyz1, points1, offset1 = pos_feat_off1  # dense: [N, 3], [N, C], [B]
        xyz2, points2, offset2 = pos_feat_off2  # coarse: [M, 3], [M, C], [B]
        # 3-NN inverse-distance interpolation of the coarse features
        idx, dist = pointops.knnquery(3, xyz2, xyz1, offset2, offset1)
        dist_recip = 1.0 / (dist + 1e-8)
        weight = dist_recip / torch.sum(dist_recip, dim=1, keepdim=True)
        interpolated_points = torch.cuda.FloatTensor(xyz1.shape[0], points2.shape[1]).zero_()
        for i in range(3):
            interpolated_points += points2[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1)
        # skip connection with the dense features, when present
        if points1 is not None:
            new_points = torch.cat([points1, interpolated_points], dim=1)
        else:
            new_points = interpolated_points
        # shared MLP
        for conv, bn in zip(self.mlp_convs, self.mlp_bns):
            new_points = F.relu(bn(conv(new_points)))
        return new_points
================================================
FILE: segmentation/modules/pointops/__init__.py
================================================
================================================
FILE: segmentation/modules/pointops/functions/__init__.py
================================================
================================================
FILE: segmentation/modules/pointops/functions/pointops.py
================================================
from typing import Tuple

import torch
from torch.autograd import Function
import torch.nn as nn

try:
    import pointops_cuda
except ImportError:
    import warnings
    import os
    from torch.utils.cpp_extension import load

    warnings.warn("Unable to load pointops_cuda cpp extension.")
    pointops_cuda_src = os.path.join(os.path.dirname(__file__), "../src")
    # JIT-compile the extension as a fallback. The source list mirrors
    # setup.py; the grouping sources were previously missing here, which left
    # the JIT build without the grouping_forward/backward_cuda symbols that
    # the Grouping autograd Function below calls.
    pointops_cuda = load('pointops_cuda', [
        pointops_cuda_src + '/pointops_api.cpp',
        pointops_cuda_src + '/knnquery/knnquery_cuda.cpp',
        pointops_cuda_src + '/knnquery/knnquery_cuda_kernel.cu',
        pointops_cuda_src + '/grouping/grouping_cuda.cpp',
        pointops_cuda_src + '/grouping/grouping_cuda_kernel.cu',
        pointops_cuda_src + '/interpolation/interpolation_cuda.cpp',
        pointops_cuda_src + '/interpolation/interpolation_cuda_kernel.cu',
        pointops_cuda_src + '/sampling/sampling_cuda.cpp',
        pointops_cuda_src + '/sampling/sampling_cuda_kernel.cu',
        pointops_cuda_src + '/subtraction/subtraction_cuda.cpp',
        pointops_cuda_src + '/subtraction/subtraction_cuda_kernel.cu',
        pointops_cuda_src + '/aggregation/aggregation_cuda.cpp',
        pointops_cuda_src + '/aggregation/aggregation_cuda_kernel.cu',
    ], build_directory=pointops_cuda_src, verbose=False)
class FurthestSampling(Function):
    @staticmethod
    def forward(ctx, xyz, offset, new_offset):
        """
        input: xyz: (n, 3), offset: (b), new_offset: (b)
        output: idx: (m)
        """
        assert xyz.is_contiguous()
        num_points = xyz.shape[0]
        num_batches = offset.shape[0]
        # the kernel needs the largest single-batch point count
        n_max = offset[0]
        for b in range(1, num_batches):
            n_max = max(offset[b] - offset[b - 1], n_max)
        idx = torch.cuda.IntTensor(new_offset[num_batches - 1].item()).zero_()
        # per-point min-distance workspace, initialized to "infinity"
        scratch = torch.cuda.FloatTensor(num_points).fill_(1e10)
        pointops_cuda.furthestsampling_cuda(num_batches, n_max, xyz, offset, new_offset, scratch, idx)
        del scratch
        return idx


furthestsampling = FurthestSampling.apply
class SectorizedFurthestSampling(Function):
    @staticmethod
    def forward(ctx, xyz, offset, new_offset, num_sectors, min_points=10000):
        """
        input: xyz: (n, 3), offset: (b), new_offset: (b)
        output: idx: (m)
        Splits each sufficiently large batch into azimuthal sectors and runs
        FPS per sector, then maps the sampled indices back to the original
        point order.
        """
        assert xyz.is_contiguous()
        # cut into batches
        last_offset = 0
        sizes = []      # per-sector point counts
        new_sizes = []  # per-sector sample budgets
        indices = []    # per-sector original point indices
        for i in range(offset.shape[0]):
            size = offset[i] - last_offset
            # small batches are not worth sectorizing
            if size < min_points:
                tmp_num_sectors = 1
            else:
                tmp_num_sectors = num_sectors
            batch_xyz = xyz[last_offset:last_offset + size]
            angle = torch.atan2(batch_xyz[:, 0], batch_xyz[:, 1])  # azimuth in (-pi, pi]
            # equal-width angular bins; epsilon keeps the max angle in the last bin
            sector_range = torch.linspace(angle.min(), angle.max() + 1e-4, tmp_num_sectors + 1)
            for s in range(tmp_num_sectors):
                indices.append(
                    torch.where((angle >= sector_range[s]) & (angle < sector_range[s + 1]))[0] + last_offset
                )
                sizes.append(indices[-1].shape[0])
            if i > 0:
                new_size = (new_offset[i] - new_offset[i - 1]).item()
            else:
                new_size = new_offset[i].item()
            # split the sample budget evenly; the remainder goes to the last sector
            new_sizes_this_batch = [new_size // tmp_num_sectors for i in range(tmp_num_sectors)]
            new_sizes_this_batch[-1] += new_size % tmp_num_sectors
            new_sizes += new_sizes_this_batch
            last_offset = offset[i]
        # cumulative offsets over sectors, in the same packed-batch format
        sizes = torch.tensor(sizes, dtype=torch.long).to(offset)
        sector_offset = sizes.cumsum(dim=0)
        new_sizes = torch.tensor(new_sizes, dtype=torch.long).to(offset)
        new_sector_offset = new_sizes.cumsum(dim=0)
        indices = torch.cat(indices).long().to(offset.device)
        sector_xyz = xyz[indices].contiguous()
        # transform to sectors
        new_xyz = []
        n, b, n_max = sector_xyz.shape[0], sector_offset.shape[0], sector_offset[0]
        for i in range(1, b):
            n_max = max(sector_offset[i] - sector_offset[i - 1], n_max)
        idx = torch.cuda.IntTensor(new_sector_offset[b - 1].item()).zero_()
        tmp = torch.cuda.FloatTensor(n).fill_(1e10)
        # run plain FPS treating each sector as its own batch
        pointops_cuda.furthestsampling_cuda(b, n_max, sector_xyz, sector_offset.int(), new_sector_offset.int(), tmp,
                                            idx)
        # map sector-local sample indices back to original point indices
        idx = indices[idx.long()]
        del tmp
        del sector_xyz
        return idx


sectorized_fps = SectorizedFurthestSampling.apply
class KNNQuery(Function):
    @staticmethod
    def forward(ctx, nsample, xyz, new_xyz, offset, new_offset):
        """
        input: xyz: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b)
        output: idx: (m, nsample), dist2: (m, nsample)
        """
        # default to self-query when no query set is given
        if new_xyz is None:
            new_xyz = xyz
        assert xyz.is_contiguous() and new_xyz.is_contiguous()
        num_query = new_xyz.shape[0]
        idx = torch.cuda.IntTensor(num_query, nsample).zero_()
        dist2 = torch.cuda.FloatTensor(num_query, nsample).zero_()
        pointops_cuda.knnquery_cuda(num_query, nsample, xyz, new_xyz, offset, new_offset, idx, dist2)
        # the kernel returns squared distances; hand callers the root
        return idx, torch.sqrt(dist2)


knnquery = KNNQuery.apply
class Grouping(Function):
    @staticmethod
    def forward(ctx, input, idx):
        """
        input: input: (n, c), idx : (m, nsample)
        output: (m, nsample, c)
        """
        assert input.is_contiguous() and idx.is_contiguous()
        m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1]
        output = torch.cuda.FloatTensor(m, nsample, c)
        pointops_cuda.grouping_forward_cuda(m, nsample, c, input, idx, output)
        # n is needed in backward to size the gradient scatter target
        ctx.n = n
        ctx.save_for_backward(idx)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_output: (m, nsample, c)
        output: grad_input: (n, c), None
        """
        n = ctx.n
        idx, = ctx.saved_tensors
        m, nsample, c = grad_output.shape
        grad_input = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input)
        # idx receives no gradient
        return grad_input, None


grouping = Grouping.apply
def queryandgroup(nsample, xyz, new_xyz, feat, idx, offset, new_offset, use_xyz=True):
    """
    input: xyz: (n, 3), new_xyz: (m, 3), feat: (n, c), idx: (m, nsample) or None,
           offset: (b), new_offset: (b)
    output: new_feat: (m, nsample, 3+c) if use_xyz else (m, nsample, c)
    Groups each query point's nsample neighbors (computed via knnquery when
    idx is None) and gathers their features, optionally prefixed with the
    center-relative coordinates.
    """
    assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous()
    if new_xyz is None:
        new_xyz = xyz
    if idx is None:
        idx, _ = knnquery(nsample, xyz, new_xyz, offset, new_offset)  # (m, nsample)
    n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1]
    grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3)  # (m, nsample, 3)
    # grouped_xyz = grouping(xyz, idx) # (m, nsample, 3)
    grouped_xyz -= new_xyz.unsqueeze(1)  # center-relative coords (m, nsample, 3)
    grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c)  # (m, nsample, c)
    # grouped_feat = grouping(feat, idx) # (m, nsample, c)
    if use_xyz:
        return torch.cat((grouped_xyz, grouped_feat), -1)  # (m, nsample, 3+c)
    else:
        return grouped_feat
class Subtraction(Function):
    @staticmethod
    def forward(ctx, input1, input2, idx):
        """
        input: input1: (n, c), input2: (n, c), idx: (n, nsample)
        output: (n, nsample, c)
        """
        assert input1.is_contiguous() and input2.is_contiguous()
        n, c = input1.shape
        nsample = idx.shape[-1]
        output = torch.cuda.FloatTensor(n, nsample, c).zero_()
        pointops_cuda.subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output)
        # idx is needed again to scatter gradients in backward
        ctx.save_for_backward(idx)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_output: (n, nsample, c)
        output: grad_input1: (n, c), grad_input2: (n, c), None
        """
        idx, = ctx.saved_tensors
        n, nsample, c = grad_output.shape
        grad_input1 = torch.cuda.FloatTensor(n, c).zero_()
        grad_input2 = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, grad_input2)
        # idx receives no gradient
        return grad_input1, grad_input2, None


subtraction = Subtraction.apply
class Aggregation(Function):
    @staticmethod
    def forward(ctx, input, position, weight, idx):
        """
        input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample)
        output: (n, c)
        """
        assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous()
        n, nsample, c = position.shape
        w_c = weight.shape[-1]
        output = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output)
        # all four inputs are needed again by the backward kernel
        ctx.save_for_backward(input, position, weight, idx)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_output: (n, c)
        output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c'), None
        """
        input, position, weight, idx = ctx.saved_tensors
        n, nsample, c = position.shape
        w_c = weight.shape[-1]
        grad_input = torch.cuda.FloatTensor(n, c).zero_()
        grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_()
        grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_()
        pointops_cuda.aggregation_backward_cuda(n, nsample, c, w_c, input, position, weight, idx, grad_output,
                                                grad_input, grad_position, grad_weight)
        # idx receives no gradient
        return grad_input, grad_position, grad_weight, None


aggregation = Aggregation.apply
def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3):
    """
    input: xyz: (m, 3), new_xyz: (n, 3), feat: (m, c), offset: (b), new_offset: (b)
    output: (n, c)
    Inverse-distance-weighted k-NN feature interpolation implemented with
    plain tensor ops (autograd-friendly; see Interpolation for the kernel path).
    """
    assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous()
    idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset)  # (n, k) each
    # weights proportional to inverse distance, normalized over the k neighbors
    dist_recip = 1.0 / (dist + 1e-8)
    weight = dist_recip / torch.sum(dist_recip, dim=1, keepdim=True)
    new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_()
    for j in range(k):
        new_feat += feat[idx[:, j].long(), :] * weight[:, j].unsqueeze(-1)
    return new_feat
class Interpolation(Function):
    @staticmethod
    def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3):
        """
        input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b)
        output: (n, c)
        Inverse-distance-weighted k-NN interpolation using the CUDA kernel,
        with a matching custom backward.
        """
        assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous()
        idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset)  # (n, k), (n, k)
        dist_recip = 1.0 / (dist + 1e-8)  # (n, k)
        norm = torch.sum(dist_recip, dim=1, keepdim=True)
        weight = dist_recip / norm  # (n, k)
        n, c, m = new_xyz.shape[0], input.shape[1], input.shape[0]
        output = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.interpolation_forward_cuda(n, c, k, input, idx, weight, output)
        # m and k size the gradient scatter in backward
        ctx.m, ctx.k = m, k
        ctx.save_for_backward(idx, weight)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_output: (n, c)
        output: grad w.r.t. `input` only: (m, c); None for all other arguments
        """
        m, k = ctx.m, ctx.k
        idx, weight = ctx.saved_tensors
        n, c = grad_output.shape
        grad_input = torch.cuda.FloatTensor(m, c).zero_()
        pointops_cuda.interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input)
        return None, None, grad_input, None, None, None


interpolation2 = Interpolation.apply
================================================
FILE: segmentation/modules/pointops/setup.py
================================================
# python3 setup.py install
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
import os
from distutils.sysconfig import get_config_vars

# Drop '-Wstrict-prototypes' (a C-only flag) from the inherited compile options
# so C++ compilation does not emit spurious warnings.
(opt,) = get_config_vars('OPT')
os.environ['OPT'] = " ".join(
    flag for flag in opt.split() if flag != '-Wstrict-prototypes'
)

# CUDA/C++ sources bundled into the pointops extension.
_SOURCES = [
    'src/pointops_api.cpp',
    'src/knnquery/knnquery_cuda.cpp',
    'src/knnquery/knnquery_cuda_kernel.cu',
    'src/sampling/sampling_cuda.cpp',
    'src/sampling/sampling_cuda_kernel.cu',
    'src/grouping/grouping_cuda.cpp',
    'src/grouping/grouping_cuda_kernel.cu',
    'src/interpolation/interpolation_cuda.cpp',
    'src/interpolation/interpolation_cuda_kernel.cu',
    'src/subtraction/subtraction_cuda.cpp',
    'src/subtraction/subtraction_cuda_kernel.cu',
    'src/aggregation/aggregation_cuda.cpp',
    'src/aggregation/aggregation_cuda_kernel.cu',
]

setup(
    name='pointops_cuda',
    author='Hengshuang Zhao',
    ext_modules=[
        CUDAExtension(
            'pointops_cuda',
            _SOURCES,
            extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']},
        )
    ],
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: segmentation/modules/pointops/src/__init__.py
================================================
================================================
FILE: segmentation/modules/pointops/src/aggregation/aggregation_cuda.cpp
================================================
#include <vector>
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "aggregation_cuda_kernel.h"

// Host wrapper: unpack tensors to raw pointers and launch the forward kernel.
// input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c),
// idx: (n, nsample) -> output: (n, c)
void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
{
    const float *input = input_tensor.data_ptr<float>();
    const float *position = position_tensor.data_ptr<float>();
    const float *weight = weight_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    float *output = output_tensor.data_ptr<float>();
    aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output);
}

// Host wrapper for the backward pass. The grad_* tensors are accumulated into,
// so callers must pass them pre-zeroed.
void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor)
{
    const float *input = input_tensor.data_ptr<float>();
    const float *position = position_tensor.data_ptr<float>();
    const float *weight = weight_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    const float *grad_output = grad_output_tensor.data_ptr<float>();
    float *grad_input = grad_input_tensor.data_ptr<float>();
    float *grad_position = grad_position_tensor.data_ptr<float>();
    float *grad_weight = grad_weight_tensor.data_ptr<float>();
    aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
}
================================================
FILE: segmentation/modules/pointops/src/aggregation/aggregation_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "aggregation_cuda_kernel.h"
__global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
    // One thread per output element: output[n_idx][c_idx] = sum over the
    // nsample neighbors of (input[idx] + position) * weight.
    // Weight channels are shared across feature channels via c_idx % w_c.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n * c) return;
    const int c_idx = index % c;
    const int n_idx = index / c;
    const int w_c_idx = c_idx % w_c;
    for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++)
    {
        int idx_idx = n_idx * nsample + nsample_idx;
        int input_idx = idx[idx_idx] * c + c_idx;      // gather neighbor feature
        int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx;
        int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx;
        // NOTE(review): accumulates into output, so output is assumed
        // pre-zeroed by the caller -- confirm at call sites.
        output[index] += (input[input_idx] + position[position_idx]) * weight[weight_idx];
    }
}
__global__ void aggregation_backward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) {
    // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c)
    // Mirrors the forward kernel: one thread per (n_idx, c_idx) output element.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n * c) return;
    const int c_idx = index % c;
    const int n_idx = index / c;
    const int w_c_idx = c_idx % w_c;
    for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++)
    {
        int idx_idx = n_idx * nsample + nsample_idx;
        int input_idx = idx[idx_idx] * c + c_idx;
        int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx;
        int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx;
        // grad_input is scattered through idx (many-to-one) -> atomic add.
        atomicAdd(grad_input + input_idx, grad_output[index] * weight[weight_idx]);
        // grad_position index is unique per thread/iteration -> plain store.
        grad_position[position_idx] = grad_output[index] * weight[weight_idx];
        // Multiple feature channels share one weight channel -> atomic add.
        atomicAdd(grad_weight + weight_idx, grad_output[index] * (input[input_idx] + position[position_idx]));
    }
}
void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
    // 1D launch: one thread per output element (n * c total).
    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    aggregation_forward_cuda_kernel<<<blocks, threads>>>(n, nsample, c, w_c, input, position, weight, idx, output);
}
void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) {
    // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c)
    // 1D launch: one thread per grad_output element (n * c total).
    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    aggregation_backward_cuda_kernel<<<blocks, threads>>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight);
}
================================================
FILE: segmentation/modules/pointops/src/aggregation/aggregation_cuda_kernel.h
================================================
#ifndef _AGGREGATION_CUDA_KERNEL
#define _AGGREGATION_CUDA_KERNEL
#include
#include
#include
void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output);
void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: segmentation/modules/pointops/src/cuda_utils.h
================================================
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H
#include <cmath>
#include <algorithm>

#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
// Ceiling division: number of blocks needed to cover m items with n per block.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

// Largest power of two <= work_size, clamped to [1, TOTAL_THREADS].
// Used to pick a reduction-friendly block size.
inline int opt_n_threads(int work_size) {
    const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
    return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1);
}

// 2D block config: x gets its optimal thread count, y gets the remainder
// of the TOTAL_THREADS budget (at least 1).
inline dim3 opt_block_config(int x, int y) {
    const int x_threads = opt_n_threads(x);
    const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
    dim3 block_config(x_threads, y_threads, 1);
    return block_config;
}
#endif
================================================
FILE: segmentation/modules/pointops/src/grouping/grouping_cuda.cpp
================================================
#include <vector>
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "grouping_cuda_kernel.h"

// Host wrapper: gather features by index.
// input: (n, c), idx: (m, nsample) -> output: (m, nsample, c)
void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
{
    const float *input = input_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    float *output = output_tensor.data_ptr<float>();
    grouping_forward_cuda_launcher(m, nsample, c, input, idx, output);
}

// Host wrapper for the backward scatter-add; grad_input must be pre-zeroed.
void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor)
{
    const float *grad_output = grad_output_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    float *grad_input = grad_input_tensor.data_ptr<float>();
    grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input);
}
================================================
FILE: segmentation/modules/pointops/src/grouping/grouping_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "grouping_cuda_kernel.h"
__global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) {
    // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c)
    // One thread per output element: pure gather, no accumulation.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= m * nsample * c) return;
    // Decompose the flat index into (m_idx, nsample_idx, c_idx).
    const int c_idx = index % c;
    const int nsample_idx = (index / c) % nsample;
    const int m_idx = index / nsample / c;
    const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx;
    output[index] = input[input_idx];
}
__global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) {
    // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c)
    // Scatter-add: the same source point may be gathered by many groups,
    // so accumulation into grad_input must be atomic.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= m * nsample * c) return;
    const int c_idx = index % c;
    const int nsample_idx = (index / c) % nsample;
    const int m_idx = index / nsample / c;
    const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx;
    atomicAdd(grad_input + input_idx, grad_output[index]);
}
void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) {
    // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c)
    // 1D launch: one thread per output element (m * nsample * c total).
    dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    grouping_forward_cuda_kernel<<<blocks, threads>>>(m, nsample, c, input, idx, output);
}
void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input)
{
    // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c)
    // 1D launch: one thread per grad_output element.
    dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    grouping_backward_cuda_kernel<<<blocks, threads>>>(m, nsample, c, grad_output, idx, grad_input);
}
================================================
FILE: segmentation/modules/pointops/src/grouping/grouping_cuda_kernel.h
================================================
#ifndef _GROUPING_CUDA_KERNEL
#define _GROUPING_CUDA_KERNEL
#include
#include
#include
void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor);
void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output);
void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: segmentation/modules/pointops/src/interpolation/interpolation_cuda.cpp
================================================
#include <vector>
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "interpolation_cuda_kernel.h"

// Host wrapper: weighted k-NN interpolation.
// input: (m, c), idx: (n, k), weight: (n, k) -> output: (n, c)
void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor)
{
    const float *input = input_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    const float *weight = weight_tensor.data_ptr<float>();
    float *output = output_tensor.data_ptr<float>();
    interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output);
}

// Host wrapper for the backward scatter-add; grad_input must be pre-zeroed.
void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor)
{
    const float *grad_output = grad_output_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    const float *weight = weight_tensor.data_ptr<float>();
    float *grad_input = grad_input_tensor.data_ptr<float>();
    interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input);
}
================================================
FILE: segmentation/modules/pointops/src/interpolation/interpolation_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "interpolation_cuda_kernel.h"
__global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output)
{
    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
    // One thread per output element: weighted sum over the k neighbors.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n * c) return;
    int c_idx = index % c;
    int n_idx = index / c;
    for (int i = 0; i < k; i++)
    {
        int idx_idx = n_idx * k + i;
        int input_idx = idx[idx_idx] * c + c_idx;
        // NOTE(review): accumulates into output, so output is assumed
        // pre-zeroed by the caller -- confirm at call sites.
        output[index] += input[input_idx] * weight[idx_idx];
    }
}
__global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input)
{
    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
    // Scatter-add of the weighted gradient back to source points; the same
    // source point can appear in many neighbor lists, hence atomicAdd.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n * c) return;
    int c_idx = index % c;
    int n_idx = index / c;
    for (int i = 0; i < k; i++)
    {
        int idx_idx = n_idx * k + i;
        int input_idx = idx[idx_idx] * c + c_idx;
        atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]);
    }
}
void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) {
    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
    // 1D launch: one thread per output element (n * c total).
    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    interpolation_forward_cuda_kernel<<<blocks, threads>>>(n, c, k, input, idx, weight, output);
}
void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) {
    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
    // 1D launch: one thread per grad_output element.
    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    interpolation_backward_cuda_kernel<<<blocks, threads>>>(n, c, k, grad_output, idx, weight, grad_input);
}
================================================
FILE: segmentation/modules/pointops/src/interpolation/interpolation_cuda_kernel.h
================================================
#ifndef _INTERPOLATION_CUDA_KERNEL
#define _INTERPOLATION_CUDA_KERNEL
#include
#include
#include
void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor);
void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output);
void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: segmentation/modules/pointops/src/knnquery/knnquery_cuda.cpp
================================================
#include <vector>
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "knnquery_cuda_kernel.h"

// Host wrapper: k-NN search of new_xyz queries against xyz, respecting
// per-batch boundaries given by offset/new_offset (exclusive prefix sums).
// Writes neighbor indices to idx: (m, nsample) and squared distances to
// dist2: (m, nsample).
void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
    const float *xyz = xyz_tensor.data_ptr<float>();
    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
    const int *offset = offset_tensor.data_ptr<int>();
    const int *new_offset = new_offset_tensor.data_ptr<int>();
    int *idx = idx_tensor.data_ptr<int>();
    float *dist2 = dist2_tensor.data_ptr<float>();
    knnquery_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
}
================================================
FILE: segmentation/modules/pointops/src/knnquery/knnquery_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "knnquery_cuda_kernel.h"
// Exchange the two floats pointed to by x and y.
__device__ void swap_float(float *x, float *y)
{
    const float saved = *x;
    *x = *y;
    *y = saved;
}
// Exchange the two ints pointed to by x and y.
__device__ void swap_int(int *x, int *y)
{
    const int saved = *x;
    *x = *y;
    *y = saved;
}
// Sift the root of a k-element max-heap (keyed on dist, idx kept in sync)
// down to its correct position. Assumes both children subtrees already
// satisfy the heap property.
__device__ void reheap(float *dist, int *idx, int k)
{
    int root = 0;
    int child = root * 2 + 1;
    while (child < k)
    {
        // Pick the larger of the two children.
        if(child + 1 < k && dist[child+1] > dist[child])
            child++;
        // Heap property restored: stop.
        if(dist[root] > dist[child])
            return;
        swap_float(&dist[root], &dist[child]);
        swap_int(&idx[root], &idx[child]);
        root = child;
        child = root * 2 + 1;
    }
}
// Classic heapsort extraction phase: repeatedly move the max-heap root to
// the end of the array and re-heapify the shrinking prefix. Leaves dist
// (and the paired idx) sorted in ascending order.
__device__ void heap_sort(float *dist, int *idx, int k)
{
    for (int last = k - 1; last > 0; last--)
    {
        swap_float(&dist[0], &dist[last]);
        swap_int(&idx[0], &idx[last]);
        reheap(dist, idx, last);
    }
}
// Map a flat point index to its batch id: returns the first position i in
// the (exclusive-prefix-sum) offset array with idx < offset[i].
__device__ int get_bt_idx(int idx, const int *offset)
{
    int bt = 0;
    while (idx >= offset[bt])
        bt++;
    return bt;
}
__global__ void knnquery_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) {
    // input: xyz (n, 3) new_xyz (m, 3)
    // output: idx (m, nsample) dist2 (m, nsample)
    // One thread per query point; keeps a max-heap of the nsample best
    // (smallest) squared distances seen so far, then heap-sorts it so the
    // results come out in ascending distance order.
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= m) return;

    // Advance the pointers to this query's row.
    new_xyz += pt_idx * 3;
    idx += pt_idx * nsample;
    dist2 += pt_idx * nsample;

    // Restrict the search to this query's batch segment of xyz.
    int bt_idx = get_bt_idx(pt_idx, new_offset);
    int start;
    if (bt_idx == 0)
        start = 0;
    else
        start = offset[bt_idx - 1];
    int end = offset[bt_idx];

    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    // Fixed-capacity per-thread heap: nsample must be <= 100 or these
    // arrays overflow (no runtime check here).
    float best_dist[100];
    int best_idx[100];
    for(int i = 0; i < nsample; i++){
        best_dist[i] = 1e10;
        best_idx[i] = start;
    }
    for(int i = start; i < end; i++){
        float x = xyz[i * 3 + 0];
        float y = xyz[i * 3 + 1];
        float z = xyz[i * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        // Replace the current worst (heap root) when a closer point is found.
        if (d2 < best_dist[0]){
            best_dist[0] = d2;
            best_idx[0] = i;
            reheap(best_dist, best_idx, nsample);
        }
    }
    heap_sort(best_dist, best_idx, nsample);
    for(int i = 0; i < nsample; i++){
        idx[i] = best_idx[i];
        dist2[i] = best_dist[i];   // squared distances, not euclidean
    }
}
void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) {
    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
    // 1D launch: one thread per query point.
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    knnquery_cuda_kernel<<<blocks, threads>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
}
================================================
FILE: segmentation/modules/pointops/src/knnquery/knnquery_cuda_kernel.h
================================================
#ifndef _KNNQUERY_CUDA_KERNEL
#define _KNNQUERY_CUDA_KERNEL
#include
#include
#include
void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: segmentation/modules/pointops/src/pointops_api.cpp
================================================
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include "knnquery/knnquery_cuda_kernel.h"
#include "sampling/sampling_cuda_kernel.h"
#include "grouping/grouping_cuda_kernel.h"
#include "interpolation/interpolation_cuda_kernel.h"
#include "aggregation/aggregation_cuda_kernel.h"
#include "subtraction/subtraction_cuda_kernel.h"

// Python bindings: exposes every pointops CUDA op to the `pointops_cuda`
// extension module used by modules/pointops/functions/pointops.py.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda");
    m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda");
    m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda");
    m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda");
    m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda");
    m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda");
    m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda");
    m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda");
    m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda");
    m.def("aggregation_backward_cuda", &aggregation_backward_cuda, "aggregation_backward_cuda");
}
================================================
FILE: segmentation/modules/pointops/src/sampling/sampling_cuda.cpp
================================================
#include <vector>
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "sampling_cuda_kernel.h"

// Host wrapper: farthest point sampling with per-batch offsets.
// xyz: (n, 3); tmp is a scratch buffer of running min distances; sampled
// indices are written to idx: (m).
void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor)
{
    const float *xyz = xyz_tensor.data_ptr<float>();
    const int *offset = offset_tensor.data_ptr<int>();
    const int *new_offset = new_offset_tensor.data_ptr<int>();
    float *tmp = tmp_tensor.data_ptr<float>();
    int *idx = idx_tensor.data_ptr<int>();
    furthestsampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx);
}
================================================
FILE: segmentation/modules/pointops/src/sampling/sampling_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "sampling_cuda_kernel.h"
// Pairwise max-reduction step on the shared dists/dists_i arrays:
// folds slot idx2 into idx1, keeping the larger distance and its index.
__device__ void __update(float *dists, int *dists_i, int idx1, int idx2) {
    const float v1 = dists[idx1], v2 = dists[idx2];
    const int i1 = dists_i[idx1], i2 = dists_i[idx2];
    dists[idx1] = max(v1, v2);
    dists_i[idx1] = v2 > v1 ? i2 : i1;
}
// input xyz: (n, 3), tmp: (b, n_max)
// output idx: (m)
// Farthest point sampling: one block per batch element (blockIdx.x = bid).
// tmp holds each point's running min squared distance to the selected set;
// every iteration picks the point farthest from the set via a block-wide
// max-reduction in shared memory. block_size must be a power of two.
template <unsigned int block_size>
__global__ void furthestsampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx)
{
    __shared__ float dists[block_size];
    __shared__ int dists_i[block_size];

    int bid = blockIdx.x;
    int start_n, end_n, start_m, end_m, old;
    // offset/new_offset are exclusive prefix sums of per-batch point counts.
    if (bid == 0) {
        start_n = 0;
        end_n = offset[0];
        start_m = 0;
        end_m = new_offset[0];
        old = 0;
    }
    else {
        start_n = offset[bid - 1];
        end_n = offset[bid];
        start_m = new_offset[bid - 1];
        end_m = new_offset[bid];
        old = offset[bid - 1];
    }

    const int stride = block_size;
    int tid = threadIdx.x;

    // Seed the sample set with the first point of this batch segment.
    if (tid == 0) idx[start_m] = start_n;
    __syncthreads();

    for (int j = start_m + 1; j < end_m; j++)
    {
        int besti = start_n;
        float best = -1;
        float x1 = xyz[old * 3 + 0];
        float y1 = xyz[old * 3 + 1];
        float z1 = xyz[old * 3 + 2];
        // Each thread scans a strided slice, updating the running min
        // distance in tmp and tracking its local farthest candidate.
        for (int k = start_n + tid; k < end_n; k += stride)
        {
            float x2 = xyz[k * 3 + 0];
            float y2 = xyz[k * 3 + 1];
            float z2 = xyz[k * 3 + 2];
            float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
            float d2 = min(d, tmp[k]);
            tmp[k] = d2;
            besti = d2 > best ? k : besti;
            best = d2 > best ? d2 : best;
        }
        dists[tid] = best;
        dists_i[tid] = besti;
        __syncthreads();

        // Block-wide max-reduction over dists/dists_i (tree reduction,
        // unrolled for each power-of-two block size).
        if (block_size >= 1024) {
            if (tid < 512) {
                __update(dists, dists_i, tid, tid + 512);
            }
            __syncthreads();
        }
        if (block_size >= 512) {
            if (tid < 256) {
                __update(dists, dists_i, tid, tid + 256);
            }
            __syncthreads();
        }
        if (block_size >= 256) {
            if (tid < 128) {
                __update(dists, dists_i, tid, tid + 128);
            }
            __syncthreads();
        }
        if (block_size >= 128) {
            if (tid < 64) {
                __update(dists, dists_i, tid, tid + 64);
            }
            __syncthreads();
        }
        if (block_size >= 64) {
            if (tid < 32) {
                __update(dists, dists_i, tid, tid + 32);
            }
            __syncthreads();
        }
        if (block_size >= 32) {
            if (tid < 16) {
                __update(dists, dists_i, tid, tid + 16);
            }
            __syncthreads();
        }
        if (block_size >= 16) {
            if (tid < 8) {
                __update(dists, dists_i, tid, tid + 8);
            }
            __syncthreads();
        }
        if (block_size >= 8) {
            if (tid < 4) {
                __update(dists, dists_i, tid, tid + 4);
            }
            __syncthreads();
        }
        if (block_size >= 4) {
            if (tid < 2) {
                __update(dists, dists_i, tid, tid + 2);
            }
            __syncthreads();
        }
        if (block_size >= 2) {
            if (tid < 1) {
                __update(dists, dists_i, tid, tid + 1);
            }
            __syncthreads();
        }
        // All threads read the winner as the next "old" point.
        // NOTE(review): no barrier between this read and the next
        // iteration's writes to dists/dists_i -- verify against upstream
        // whether an extra __syncthreads() is warranted here.
        old = dists_i[0];
        if (tid == 0)
            idx[j] = old;
    }
}
void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx)
{
    // One block per batch element; block size is the largest power of two
    // <= n (capped at TOTAL_THREADS), matching the template parameter so
    // the shared-memory reduction in the kernel is fully unrolled.
    unsigned int n_threads = opt_n_threads(n);
    switch (n_threads) {
        case 1024:
            furthestsampling_cuda_kernel<1024><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 512:
            furthestsampling_cuda_kernel<512><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 256:
            furthestsampling_cuda_kernel<256><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 128:
            furthestsampling_cuda_kernel<128><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 64:
            furthestsampling_cuda_kernel<64><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 32:
            furthestsampling_cuda_kernel<32><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 16:
            furthestsampling_cuda_kernel<16><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 8:
            furthestsampling_cuda_kernel<8><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 4:
            furthestsampling_cuda_kernel<4><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 2:
            furthestsampling_cuda_kernel<2><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 1:
            furthestsampling_cuda_kernel<1><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
            break;
        default:
            furthestsampling_cuda_kernel<512><<<b, n_threads>>>(xyz, offset, new_offset, tmp, idx);
    }
}
================================================
FILE: segmentation/modules/pointops/src/sampling/sampling_cuda_kernel.h
================================================
#ifndef _SAMPLING_CUDA_KERNEL
#define _SAMPLING_CUDA_KERNEL
#include
#include
#include
void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor);
#ifdef __cplusplus
extern "C" {
#endif
void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: segmentation/modules/pointops/src/subtraction/subtraction_cuda.cpp
================================================
#include <vector>
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "subtraction_cuda_kernel.h"

// Host wrapper: per-neighbor subtraction.
// input1: (n, c), input2: (n, c), idx: (n, nsample) -> output: (n, nsample, c)
void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor)
{
    const float *input1 = input1_tensor.data_ptr<float>();
    const float *input2 = input2_tensor.data_ptr<float>();
    const int *idx = idx_tensor.data_ptr<int>();
    float *output = output_tensor.data_ptr<float>();
    subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output);
}

// Host wrapper for the backward pass; both grad tensors are accumulated
// into and must be pre-zeroed by the caller.
void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor)
{
    const int *idx = idx_tensor.data_ptr<int>();
    const float *grad_output = grad_output_tensor.data_ptr<float>();
    float *grad_input1 = grad_input1_tensor.data_ptr<float>();
    float *grad_input2 = grad_input2_tensor.data_ptr<float>();
    subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
}
================================================
FILE: segmentation/modules/pointops/src/subtraction/subtraction_cuda_kernel.cu
================================================
#include "../cuda_utils.h"
#include "subtraction_cuda_kernel.h"
__global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) {
    // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c)
    // One thread per output element: center feature minus gathered neighbor.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n * nsample * c) return;
    const int c_idx = index % c;
    const int nsample_idx = (index / c) % nsample;
    const int n_idx = index / nsample / c;
    const int idx_idx = n_idx * nsample + nsample_idx;
    const int input1_idx = n_idx * c + c_idx;             // center
    const int input2_idx = idx[idx_idx] * c + c_idx;      // neighbor (gather)
    output[index] = input1[input1_idx] - input2[input2_idx];
}
__global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) {
    // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c)
    // Both targets are accumulated by many threads, hence atomicAdd;
    // input2's gradient is negated (it was subtracted in the forward pass).
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n * nsample * c) return;
    const int c_idx = index % c;
    const int nsample_idx = (index / c) % nsample;
    const int n_idx = index / nsample / c;
    const int idx_idx = n_idx * nsample + nsample_idx;
    const int input1_idx = n_idx * c + c_idx;
    const int input2_idx = idx[idx_idx] * c + c_idx;
    atomicAdd(grad_input1 + input1_idx, grad_output[index]);
    atomicAdd(grad_input2 + input2_idx, -grad_output[index]);
}
void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) {
    // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c)
    // 1D launch: one thread per output element (n * nsample * c total).
    dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    subtraction_forward_cuda_kernel<<<blocks, threads>>>(n, nsample, c, input1, input2, idx, output);
}
void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) {
    // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c)
    // 1D launch: one thread per grad_output element.
    dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    subtraction_backward_cuda_kernel<<<blocks, threads>>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2);
}
================================================
FILE: segmentation/modules/pointops/src/subtraction/subtraction_cuda_kernel.h
================================================
#ifndef _SUBTRACTION_CUDA_KERNEL
#define _SUBTRACTION_CUDA_KERNEL
#include
#include