Repository: hszhao/PointWeb Branch: master Commit: f31fe05616c3 Files: 61 Total size: 218.5 KB Directory structure: gitextract_6_2c5ysk/ ├── .gitignore ├── LICENSE ├── README.md ├── data/ │ ├── s3dis/ │ │ └── s3dis_names.txt │ └── scannet/ │ └── scannet_names.txt ├── lib/ │ ├── __init__.py │ ├── pointops/ │ │ ├── __init__.py │ │ ├── functions/ │ │ │ ├── __init__.py │ │ │ └── pointops.py │ │ ├── setup.py │ │ └── src/ │ │ ├── __init__.py │ │ ├── ballquery/ │ │ │ ├── ballquery_cuda.cpp │ │ │ ├── ballquery_cuda_kernel.cu │ │ │ └── ballquery_cuda_kernel.h │ │ ├── cuda_utils.h │ │ ├── featuredistribute/ │ │ │ ├── featuredistribute_cuda.cpp │ │ │ ├── featuredistribute_cuda_kernel.cu │ │ │ └── featuredistribute_cuda_kernel.h │ │ ├── grouping/ │ │ │ ├── grouping_cuda.cpp │ │ │ ├── grouping_cuda_kernel.cu │ │ │ └── grouping_cuda_kernel.h │ │ ├── grouping_int/ │ │ │ ├── grouping_int_cuda.cpp │ │ │ ├── grouping_int_cuda_kernel.cu │ │ │ └── grouping_int_cuda_kernel.h │ │ ├── interpolation/ │ │ │ ├── interpolation_cuda.cpp │ │ │ ├── interpolation_cuda_kernel.cu │ │ │ └── interpolation_cuda_kernel.h │ │ ├── knnquery/ │ │ │ ├── __init__.py │ │ │ ├── knnquery_cuda.cpp │ │ │ ├── knnquery_cuda_kernel.cu │ │ │ └── knnquery_cuda_kernel.h │ │ ├── labelstat/ │ │ │ ├── labelstat_cuda.cpp │ │ │ ├── labelstat_cuda_kernel.cu │ │ │ └── labelstat_cuda_kernel.h │ │ ├── pointops_api.cpp │ │ └── sampling/ │ │ ├── sampling_cuda.cpp │ │ ├── sampling_cuda_kernel.cu │ │ └── sampling_cuda_kernel.h │ └── sync_bn/ │ ├── __init__.py │ ├── batchnorm.py │ ├── comm.py │ ├── replicate.py │ └── unittest.py ├── model/ │ ├── __init__.py │ ├── pointnet/ │ │ └── pointnet.py │ ├── pointnet2/ │ │ ├── pointnet2_modules.py │ │ └── pointnet2_seg.py │ └── pointweb/ │ ├── pointweb_module.py │ └── pointweb_seg.py ├── tool/ │ ├── test.sh │ ├── test_s3dis.py │ ├── test_scannet.py │ ├── train.py │ └── train.sh └── util/ ├── config.py ├── dataset.py ├── pt_util.py ├── s3dis.py ├── scannet.py ├── transform.py └── util.py 
================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ ## General # Compiled Object files *.slo *.lo *.o *.cuo # Compiled Dynamic libraries *.so *.dylib # Compiled Static libraries *.lai *.la *.a # Compiled protocol buffers *.pb.h *.pb.cc *_pb2.py # Compiled python *.pyc # Compiled MATLAB *.mex* # IPython notebook checkpoints .ipynb_checkpoints # Editor temporaries *.swp *~ # Sublime Text settings *.sublime-workspace *.sublime-project # Eclipse Project settings *.*project .settings # QtCreator files *.user # PyCharm files .idea # Visual Studio Code files .vscode # OSX dir files .DS_Store # personal *.log *.pth *.caffemodel exp/ summary/ __pycache__/ # data/ back/ *.png *.jpg *.log *.pth events* config/ initmodel/ *.ninja_deps *.ninja_log *.ninja *.yaml ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Hengshuang Zhao Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # PointWeb: Enhancing Local Neighborhood Features for Point Cloud Processing by Hengshuang Zhao\*, Li Jiang\*, Chi-Wing Fu, and Jiaya Jia, details are in [paper](http://openaccess.thecvf.com/content_CVPR_2019/papers/Zhao_PointWeb_Enhancing_Local_Neighborhood_Features_for_Point_Cloud_Processing_CVPR_2019_paper.pdf). ### Introduction This repository is built for PointWeb in point cloud scene understanding. ### Usage 1. Requirement: - Hardware: 4 GPUs (better with >=11G GPU memory) - Software: PyTorch>=1.0.0, Python3, CUDA>=9.0, [tensorboardX](https://github.com/lanpa/tensorboardX) 2. Clone the repository and build the ops: ```shell git clone https://github.com/hszhao/PointWeb.git cd PointWeb cd lib/pointops && python setup.py install && cd ../../ ``` 3. Train: - Download related [datasets](https://drive.google.com/open?id=1Jpi2IP58zHs6Ppv05kqvwJhBnl-Kge2q) and symlink the paths to them as follows (you can alternatively modify the relevant paths specified in folder `config`): ``` mkdir -p dataset ln -s /path_to_s3dis_dataset dataset/s3dis ``` - Specify the gpu used in config and then do training: ```shell sh tool/train.sh s3dis pointweb ``` 4. Test: - Download trained segmentation models and put them under folder specified in config or modify the specified paths. - For full testing (get listed performance): ```shell sh tool/test.sh s3dis pointweb ``` 5. Visualization: [tensorboardX](https://github.com/lanpa/tensorboardX) incorporated for better visualization. ```shell tensorboard --logdir=run1:$EXP1,run2:$EXP2 --port=6789 ``` 6.
Other: - Resources: GoogleDrive [LINK](https://drive.google.com/open?id=1IFoKe5TM3ZO38LT4VXCaHKvCNkXfgtBf) contains shared models, predictions and part of the related datasets. - Video predictions: YouTube [LINK](https://youtu.be/CaobqpsUP_4). ### Performance Description: **mIoU/mAcc/aAcc/voxAcc** stands for mean IoU, mean accuracy of each class, all pixel accuracy, and voxel label accuracy respectively. mIoU/mAcc/aAcc of PointWeb on S3DIS dataset: 0.6055/0.6682/0.8658. mIoU/mAcc/aAcc/voxAcc of PointWeb on ScanNet dataset: 0.5063/0.6061/0.8529/0.8568. ### Citation If you find the code or trained models useful, please consider citing: ``` @inproceedings{zhao2019pointweb, title={{PointWeb}: Enhancing Local Neighborhood Features for Point Cloud Processing}, author={Zhao, Hengshuang and Jiang, Li and Fu, Chi-Wing and Jia, Jiaya}, booktitle={CVPR}, year={2019} } ``` ================================================ FILE: data/s3dis/s3dis_names.txt ================================================ ceiling floor wall beam column window door chair table bookcase sofa board clutter ================================================ FILE: data/scannet/scannet_names.txt ================================================ bathtub bed bookshelf cabinet chair counter curtain desk door floor otherfurniture picture refrigerator showercurtain sink sofa table toilet wall window ================================================ FILE: lib/__init__.py ================================================ ================================================ FILE: lib/pointops/__init__.py ================================================ ================================================ FILE: lib/pointops/functions/__init__.py ================================================ ================================================ FILE: lib/pointops/functions/pointops.py ================================================ from typing import Tuple import torch from torch.autograd import Function import torch.nn as nn
import pointops_cuda class FurthestSampling(Function): @staticmethod def forward(ctx, xyz, m): """ input: xyz: (b, n, 3) and n > m, m: int32 output: idx: (b, m) """ assert xyz.is_contiguous() b, n, _ = xyz.size() idx = torch.cuda.IntTensor(b, m) temp = torch.cuda.FloatTensor(b, n).fill_(1e10) pointops_cuda.furthestsampling_cuda(b, n, m, xyz, temp, idx) return idx @staticmethod def backward(xyz, a=None): return None, None furthestsampling = FurthestSampling.apply class Gathering(Function): @staticmethod def forward(ctx, features, idx): """ input: features: (b, c, n), idx : (b, m) tensor output: (b, c, m) """ assert features.is_contiguous() assert idx.is_contiguous() b, c, n = features.size() m = idx.size(1) output = torch.cuda.FloatTensor(b, c, m) pointops_cuda.gathering_forward_cuda(b, c, n, m, features, idx, output) ctx.for_backwards = (idx, c, n) return output @staticmethod def backward(ctx, grad_out): idx, c, n = ctx.for_backwards b, m = idx.size() grad_features = torch.cuda.FloatTensor(b, c, n).zero_() grad_out_data = grad_out.data.contiguous() pointops_cuda.gathering_backward_cuda(b, c, n, m, grad_out_data, idx, grad_features.data) return grad_features, None gathering = Gathering.apply class NearestNeighbor(Function): @staticmethod def forward(ctx, unknown: torch.Tensor, known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Find the three nearest neighbors of unknown in known input: unknown: (b, n, 3), known: (b, m, 3) output: dist2: (b, n, 3) l2 distance to the three nearest neighbors idx: (b, n, 3) index of 3 nearest neighbors """ assert unknown.is_contiguous() assert known.is_contiguous() b, n, _ = unknown.size() m = known.size(1) dist2 = torch.cuda.FloatTensor(b, n, 3) idx = torch.cuda.IntTensor(b, n, 3) pointops_cuda.nearestneighbor_cuda(b, n, m, unknown, known, dist2, idx) return torch.sqrt(dist2), idx @staticmethod def backward(ctx, a=None, b=None): return None, None nearestneighbor = NearestNeighbor.apply class Interpolation(Function): 
@staticmethod def forward(ctx, features: torch.Tensor, idx: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """ Performs weight linear interpolation on 3 features input: features: (b, c, m) features descriptors to be interpolated from idx: (b, n, 3) three nearest neighbors of the target features in features weight: (b, n, 3) weights output: (b, c, n) tensor of the interpolated features """ assert features.is_contiguous() assert idx.is_contiguous() assert weight.is_contiguous() b, c, m = features.size() n = idx.size(1) ctx.interpolation_for_backward = (idx, weight, m) output = torch.cuda.FloatTensor(b, c, n) pointops_cuda.interpolation_forward_cuda(b, c, m, n, features, idx, weight, output) return output @staticmethod def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ input: grad_out: (b, c, n) output: grad_features: (b, c, m), None, None """ idx, weight, m = ctx.interpolation_for_backward b, c, n = grad_out.size() grad_features = torch.cuda.FloatTensor(b, c, m).zero_() grad_out_data = grad_out.data.contiguous() pointops_cuda.interpolation_backward_cuda(b, c, n, m, grad_out_data, idx, weight, grad_features.data) return grad_features, None, None interpolation = Interpolation.apply class Grouping(Function): @staticmethod def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: """ input: features: (b, c, n), idx : (b, m, nsample) containing the indicies of features to group with output: (b, c, m, nsample) """ assert features.is_contiguous() assert idx.is_contiguous() b, c, n = features.size() _, m, nsample = idx.size() output = torch.cuda.FloatTensor(b, c, m, nsample) pointops_cuda.grouping_forward_cuda(b, c, n, m, nsample, features, idx, output) ctx.for_backwards = (idx, n) return output @staticmethod def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ input: grad_out: (b, c, m, nsample) output: (b, c, n), None """ idx, n = ctx.for_backwards b, c, m, nsample = 
grad_out.size() grad_features = torch.cuda.FloatTensor(b, c, n).zero_() grad_out_data = grad_out.data.contiguous() pointops_cuda.grouping_backward_cuda(b, c, n, m, nsample, grad_out_data, idx, grad_features.data) return grad_features, None grouping = Grouping.apply class GroupingInt(Function): @staticmethod def forward(ctx, features: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: """ input: features: (b, c, n), idx : (b, m, nsample) containing the indicies of features to group with output: (b, c, m, nsample) """ assert features.is_contiguous() assert idx.is_contiguous() b, c, n = features.size() _, m, nsample = idx.size() output = torch.cuda.LongTensor(b, c, m, nsample) pointops_cuda.grouping_int_forward_cuda(b, c, n, m, nsample, features, idx, output) return output @staticmethod def backward(ctx, a=None): return None, None grouping_int = GroupingInt.apply class BallQuery(Function): @staticmethod def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor) -> torch.Tensor: """ input: radius: float, radius of the balls nsample: int, maximum number of features in the balls xyz: torch.Tensor, (b, n, 3) xyz coordinates of the features new_xyz: torch.Tensor, (b, m, 3) centers of the ball query output: (b, m, nsample) tensor with the indicies of the features that form the query balls """ assert xyz.is_contiguous() assert new_xyz.is_contiguous() b, n, _ = xyz.size() m = new_xyz.size(1) idx = torch.cuda.IntTensor(b, m, nsample).zero_() pointops_cuda.ballquery_cuda(b, n, m, radius, nsample, new_xyz, xyz, idx) return idx @staticmethod def backward(ctx, a=None): return None, None, None, None ballquery = BallQuery.apply class FeatureDistribute(Function): @staticmethod def forward(ctx, max_xyz: torch.Tensor, xyz: torch.Tensor) -> torch.Tensor: """ :param ctx: :param max_xyz: (b, n, 3) :param xyz: (b, m, 3) :return: distribute_idx: (b, m) """ assert max_xyz.is_contiguous() assert xyz.is_contiguous() b, n, _ = max_xyz.size() m = xyz.size(1) 
distribute_idx = torch.cuda.IntTensor(b, m).zero_() pointops_cuda.featuredistribute_cuda(b, n, m, max_xyz, xyz, distribute_idx) return distribute_idx @staticmethod def backward(ctx, a=None): return None, None featuredistribute = FeatureDistribute.apply class FeatureGather(Function): @staticmethod def forward(ctx, max_feature: torch.Tensor, distribute_idx: torch.Tensor) -> torch.Tensor: ''' :param ctx: :param max_feature: (b, c, n) :param distribute_idx: (b, m) :return: distribute_feature: (b, c, m) ''' assert max_feature.is_contiguous() assert distribute_idx.is_contiguous() b, c, n = max_feature.size() m = distribute_idx.size(1) distribute_feature = torch.cuda.FloatTensor(b, c, m).zero_() pointops_cuda.featuregather_forward_cuda(b, n, m, c, max_feature, distribute_idx, distribute_feature) ctx.for_backwards = (distribute_idx, n) return distribute_feature @staticmethod def backward(ctx, grad_distribute_feature: torch.Tensor): ''' :param ctx: :param grad_distribute_feature: (b, c, m) :return: grad_max_feature: (b, c, n), None ''' distribute_idx, n = ctx.for_backwards b, c, m = grad_distribute_feature.size() grad_max_feature = torch.cuda.FloatTensor(b, c, n).zero_() grad_distribute_feature_data = grad_distribute_feature.data.contiguous() pointops_cuda.featuregather_backward_cuda(b, n, m, c, grad_distribute_feature_data, distribute_idx, grad_max_feature.data) return grad_max_feature, None featuregather = FeatureGather.apply class LabelStatBallRange(Function): @staticmethod def forward(ctx, radius: float, xyz: torch.Tensor, new_xyz: torch.Tensor, label_stat: torch.Tensor) -> torch.Tensor: ''' :param ctx: :param radius: :param xyz: (b, n, 3) :param new_xyz: (b, m, 3) :param label_stat: (b, n, nclass) :return: new_label_stat: (b, m, nclass) ''' assert xyz.is_contiguous() assert new_xyz.is_contiguous() assert label_stat.is_contiguous() b, n, nclass = label_stat.size() m = new_xyz.size(1) new_label_stat = torch.cuda.IntTensor(b, m, nclass).zero_() 
pointops_cuda.labelstat_ballrange_cuda(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat) return new_label_stat @staticmethod def backward(ctx, a=None): return None, None, None, None labelstat_ballrange = LabelStatBallRange.apply class LabelStatIdx(Function): @staticmethod def forward(ctx, nsample: int, label_stat: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: ''' :param ctx: :param nsample: :param label_stat: (b, n, nclass) :param idx: (b, m, nsample) :return: new_label_stat: (b, m, nclass) ''' assert label_stat.is_contiguous() assert idx.is_contiguous() b, n, nclass = label_stat.size() m = idx.size(1) new_label_stat = torch.cuda.IntTensor(b, m, nclass).zero_() pointops_cuda.labelstat_idx_cuda(b, n, m, nsample, nclass, label_stat, idx, new_label_stat) return new_label_stat @staticmethod def backward(ctx, a=None): return None, None, None labelstat_idx = LabelStatIdx.apply class LabelStatAndBallQuery(Function): @staticmethod def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor, label_stat: torch.Tensor): ''' :param ctx: :param radius: :param nsample: :param xyz: (b, n, 3) :param new_xyz: (b, m, 3) :param label_stat: (b, n, nclass) :return: new_label_stat: (b, m, nclass) idx: (b, m, nsample) ''' assert xyz.is_contiguous() assert new_xyz.is_contiguous() assert label_stat.is_contiguous() b, n, nclass = label_stat.size() m = new_xyz.size(1) new_label_stat = torch.cuda.IntTensor(b, m, nclass).zero_() idx = torch.cuda.IntTensor(b, m, nsample).zero_() pointops_cuda.labelstat_and_ballquery_cuda(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat) return new_label_stat, idx @staticmethod def backward(ctx, a=None, b=None): return None, None, None, None, None labelstat_and_ballquery = LabelStatAndBallQuery.apply def pairwise_distances(x, y=None): ''' Input: x is a Nxd matrix y is an optional Mxd matirx Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:] if y 
is not given then use 'y=x'. i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2 ''' x_norm = (x ** 2).sum(1).view(-1, 1) if y is not None: y_t = torch.transpose(y, 0, 1) y_norm = (y ** 2).sum(1).view(1, -1) else: y_t = torch.transpose(x, 0, 1) y_norm = x_norm.view(1, -1) dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t) import numpy as np return torch.clamp(dist, 0.0, np.inf) class KNNQueryNaive(Function): @staticmethod def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor = None) -> Tuple[torch.Tensor]: """ KNN Indexing input: nsample: int32, Number of neighbor xyz: (b, n, 3) coordinates of the features new_xyz: (b, m, 3) centriods output: idx: (b, m, nsample) """ if new_xyz is None: new_xyz = xyz b, m, _ = new_xyz.size() n = xyz.size(1) ''' idx = torch.zeros(b, m, nsample).int().cuda() for i in range(b): dist = pairwise_distances(new_xyz[i, :, :], xyz[i, :, :]) [_, idxs] = torch.sort(dist, dim=1) idx[i, :, :] = idxs[:, 0:nsample] ''' # ''' # new_xyz_repeat = new_xyz.repeat(1, 1, n).view(b, m * n, 3) # xyz_repeat = xyz.repeat(1, m, 1).view(b, m * n, 3) # dist = (new_xyz_repeat - xyz_repeat).pow(2).sum(dim=2).view(b, m, n) dist = (new_xyz.repeat(1, 1, n).view(b, m * n, 3) - xyz.repeat(1, m, 1).view(b, m * n, 3)).pow(2).sum(dim=2).view(b, m, n) [_, idxs] = torch.sort(dist, dim=2) idx = idxs[:, :, 0:nsample].int() # ''' return idx @staticmethod def backward(ctx): return None, None, None knnquery_naive = KNNQueryNaive.apply class KNNQuery(Function): @staticmethod def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor = None) -> Tuple[torch.Tensor]: """ KNN Indexing input: nsample: int32, Number of neighbor xyz: (b, n, 3) coordinates of the features new_xyz: (b, m, 3) centriods output: idx: (b, m, nsample) ( dist2: (b, m, nsample) ) """ if new_xyz is None: new_xyz = xyz assert xyz.is_contiguous() assert new_xyz.is_contiguous() b, m, _ = new_xyz.size() n = xyz.size(1) idx = torch.cuda.IntTensor(b, m, nsample).zero_() dist2 = torch.cuda.FloatTensor(b, 
m, nsample).zero_() pointops_cuda.knnquery_cuda(b, n, m, nsample, xyz, new_xyz, idx, dist2) return idx @staticmethod def backward(ctx, a=None): return None, None, None knnquery = KNNQuery.apply class KNNQueryExclude(Function): @staticmethod def forward(ctx, nsample: int, xyz: torch.Tensor, new_xyz: torch.Tensor = None) -> Tuple[torch.Tensor]: """ KNN Indexing input: nsample: int32, Number of neighbor xyz: (b, n, 3) coordinates of the features new_xyz: (b, m, 3) centriods output: new_features: (b, m, nsample) """ if new_xyz is None: new_xyz = xyz b, m, _ = new_xyz.size() n = xyz.size(1) ''' idx = torch.zeros(b, m, nsample).int().cuda() for i in range(b): dist = pairwise_distances(new_xyz[i, :, :], xyz[i, :, :]) [_, idxs] = torch.sort(dist, dim=1) idx[i, :, :] = idxs[:, 0:nsample] ''' # ''' # new_xyz_repeat = new_xyz.repeat(1, 1, n).view(b, m * n, 3) # xyz_repeat = xyz.repeat(1, m, 1).view(b, m * n, 3) # dist = (new_xyz_repeat - xyz_repeat).pow(2).sum(dim=2).view(b, m, n) dist = (new_xyz.repeat(1, 1, n).view(b, m * n, 3) - xyz.repeat(1, m, 1).view(b, m * n, 3)).pow(2).sum(dim=2).view(b, m, n) [_, idxs] = torch.sort(dist, dim=2) idx = idxs[:, :, 1:nsample+1].int() # ''' return idx @staticmethod def backward(ctx): return None, None, None knnquery_exclude = KNNQueryExclude.apply class QueryAndGroup(nn.Module): """ Groups with a ball query of radius parameters: radius: float32, Radius of ball nsample: int32, Maximum number of features to gather in the ball """ def __init__(self, radius=None, nsample=32, use_xyz=True): super(QueryAndGroup, self).__init__() self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor = None, features: torch.Tensor = None, idx: torch.Tensor = None) -> torch.Tensor: """ input: xyz: (b, n, 3) coordinates of the features new_xyz: (b, m, 3) centriods features: (b, c, n) idx: idx of neighbors # idxs: (b, n) output: new_features: (b, c+3, m, nsample) # grouped_idxs: (b, m, nsample) 
""" if new_xyz is None: new_xyz = xyz if idx is None: if self.radius is not None: idx = ballquery(self.radius, self.nsample, xyz, new_xyz) else: # idx = knnquery_naive(self.nsample, xyz, new_xyz) # (b, m, nsample) idx = knnquery(self.nsample, xyz, new_xyz) # (b, m, nsample) xyz_trans = xyz.transpose(1, 2).contiguous() grouped_xyz = grouping(xyz_trans, idx) # (b, 3, m, nsample) # grouped_idxs = grouping(idxs.unsqueeze(1).float(), idx).squeeze(1).int() # (b, m, nsample) grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) if features is not None: grouped_features = grouping(features, idx) if self.use_xyz: new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (b, c+3, m, nsample) else: new_features = grouped_features else: assert self.use_xyz, "Cannot have not features and not use xyz as a feature!" new_features = grouped_xyz return new_features class GroupAll(nn.Module): """ Groups all features """ def __init__(self, use_xyz: bool = True): super(GroupAll, self).__init__() self.use_xyz = use_xyz def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None) -> Tuple[torch.Tensor]: """ input: xyz: (b, n, 3) coordinates of the features new_xyz: ignored torch features: (b, c, n) descriptors of the features output: new_features: (b, c+3, 1, N) tensor """ grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) if features is not None: grouped_features = features.unsqueeze(2) if self.use_xyz: new_features = torch.cat([grouped_xyz, grouped_features], dim=1) # (b, c+3, 1, n) else: new_features = grouped_features else: new_features = grouped_xyz return new_features ================================================ FILE: lib/pointops/setup.py ================================================ #python3 setup.py install from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension setup( name='pointops', ext_modules=[ CUDAExtension('pointops_cuda', [ 'src/pointops_api.cpp', 'src/ballquery/ballquery_cuda.cpp', 
'src/ballquery/ballquery_cuda_kernel.cu', 'src/knnquery/knnquery_cuda.cpp', 'src/knnquery/knnquery_cuda_kernel.cu', 'src/grouping/grouping_cuda.cpp', 'src/grouping/grouping_cuda_kernel.cu', 'src/grouping_int/grouping_int_cuda.cpp', 'src/grouping_int/grouping_int_cuda_kernel.cu', 'src/interpolation/interpolation_cuda.cpp', 'src/interpolation/interpolation_cuda_kernel.cu', 'src/sampling/sampling_cuda.cpp', 'src/sampling/sampling_cuda_kernel.cu', 'src/labelstat/labelstat_cuda.cpp', 'src/labelstat/labelstat_cuda_kernel.cu', 'src/featuredistribute/featuredistribute_cuda.cpp', 'src/featuredistribute/featuredistribute_cuda_kernel.cu' ], extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}) ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: lib/pointops/src/__init__.py ================================================ ================================================ FILE: lib/pointops/src/ballquery/ballquery_cuda.cpp ================================================ #include #include #include #include #include "ballquery_cuda_kernel.h" extern THCState *state; #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) { const float *new_xyz = new_xyz_tensor.data(); const float *xyz = xyz_tensor.data(); int *idx = idx_tensor.data(); ballquery_cuda_launcher(b, n, m, radius, nsample, new_xyz, xyz, idx); } void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) { CHECK_INPUT(new_xyz_tensor); CHECK_INPUT(xyz_tensor); const float *new_xyz = new_xyz_tensor.data(); const float *xyz = xyz_tensor.data(); int *idx = idx_tensor.data(); cudaStream_t stream 
= THCState_getCurrentStream(state); ballquery_cuda_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx, stream); } ================================================ FILE: lib/pointops/src/ballquery/ballquery_cuda_kernel.cu ================================================ #include "../cuda_utils.h" #include "ballquery_cuda_kernel.h" // input: new_xyz(b, m, 3) xyz(b, n, 3) // output: idx(b, m, nsample) __global__ void ballquery_cuda_kernel(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) { int batch_index = blockIdx.x; xyz += batch_index * n * 3; new_xyz += batch_index * m * 3; idx += m * nsample * batch_index; int index = threadIdx.x; int stride = blockDim.x; float radius2 = radius * radius; for (int j = index; j < m; j += stride) { float new_x = new_xyz[j * 3 + 0]; float new_y = new_xyz[j * 3 + 1]; float new_z = new_xyz[j * 3 + 2]; for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { float x = xyz[k * 3 + 0]; float y = xyz[k * 3 + 1]; float z = xyz[k * 3 + 2]; float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < radius2) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) idx[j * nsample + l] = k; } idx[j * nsample + cnt] = k; ++cnt; } } } } void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) { ballquery_cuda_kernel<<>>(b, n, m, radius, nsample, new_xyz, xyz, idx); } __global__ void ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { int bs_idx = blockIdx.y; int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; if (bs_idx >= b || pt_idx >= m) return; new_xyz += bs_idx * m * 3 + pt_idx * 3; xyz += bs_idx * n * 3; idx += bs_idx * m * nsample + pt_idx * nsample; float radius2 = radius * radius; float new_x = new_xyz[0]; float new_y = new_xyz[1]; float new_z = new_xyz[2]; int cnt 
= 0; for (int k = 0; k < n; ++k) { float x = xyz[k * 3 + 0]; float y = xyz[k * 3 + 1]; float z = xyz[k * 3 + 2]; float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < radius2){ if (cnt == 0){ for (int l = 0; l < nsample; ++l) { idx[l] = k; } } idx[cnt] = k; ++cnt; if (cnt >= nsample){ break; } } } } void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream) { // param new_xyz: (B, m, 3) // param xyz: (B, n, 3) // param idx: (B, m, nsample) cudaError_t err; dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row) dim3 threads(THREADS_PER_BLOCK); ballquery_cuda_kernel_fast<<>>(b, n, m, radius, nsample, new_xyz, xyz, idx); // cudaDeviceSynchronize(); // for using printf in kernel function err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } ================================================ FILE: lib/pointops/src/ballquery/ballquery_cuda_kernel.h ================================================ #ifndef _BALLQUERY_CUDA_KERNEL #define _BALLQUERY_CUDA_KERNEL #include #include #include void ballquery_cuda(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor); void ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor); #ifdef __cplusplus extern "C" { #endif void ballquery_cuda_launcher(int b, int n, int m, float radius, int nsample, const float *xyz, const float *new_xyz, int *idx); void ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx, cudaStream_t stream); #ifdef __cplusplus } #endif #endif ================================================ FILE: lib/pointops/src/cuda_utils.h 
================================================ #ifndef _CUDA_UTILS_H #define _CUDA_UTILS_H #include #define TOTAL_THREADS 1024 #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) #define THREADS_PER_BLOCK 256 #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) inline int opt_n_threads(int work_size) { const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); return max(min(1 << pow_2, TOTAL_THREADS), 1); } inline dim3 opt_block_config(int x, int y) { const int x_threads = opt_n_threads(x); const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); dim3 block_config(x_threads, y_threads, 1); return block_config; } #endif ================================================ FILE: lib/pointops/src/featuredistribute/featuredistribute_cuda.cpp ================================================ #include #include #include #include #include "featuredistribute_cuda_kernel.h" extern THCState *state; #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) void featuredistribute_cuda(int b, int n, int m, at::Tensor max_xyz_tensor, at::Tensor xyz_tensor, at::Tensor distribute_idx_tensor) { CHECK_INPUT(max_xyz_tensor); CHECK_INPUT(xyz_tensor); const float *max_xyz = max_xyz_tensor.data(); const float *xyz = xyz_tensor.data(); int *distribute_idx = distribute_idx_tensor.data(); cudaStream_t stream = THCState_getCurrentStream(state); featuredistribute_cuda_launcher(b, n, m, max_xyz, xyz, distribute_idx, stream); } void featuregather_forward_cuda(int b, int n, int m, int c, at::Tensor max_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor distribute_feature_tensor) { CHECK_INPUT(max_feature_tensor); 
CHECK_INPUT(distribute_idx_tensor);
    // NOTE(review): every "<...>" token in this chunk (template args, include
    // targets, kernel launch configs) was stripped by the text extraction;
    // they are reconstructed below -- confirm against upstream PointWeb.
    const float *max_feature = max_feature_tensor.data<float>();
    const int *distribute_idx = distribute_idx_tensor.data<int>();
    float *distribute_feature = distribute_feature_tensor.data<float>();
    cudaStream_t stream = THCState_getCurrentStream(state);
    featuregather_forward_cuda_launcher(b, n, m, c, max_feature, distribute_idx, distribute_feature, stream);
}

// Backward of the feature gather: route grad_distribute_feature (b, c, m)
// back to grad_max_feature (b, c, n) through distribute_idx (b, m).
void featuregather_backward_cuda(int b, int n, int m, int c, at::Tensor grad_distribute_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor grad_max_feature_tensor)
{
    CHECK_INPUT(grad_distribute_feature_tensor);
    CHECK_INPUT(distribute_idx_tensor);
    const float *grad_distribute_feature = grad_distribute_feature_tensor.data<float>();
    const int *distribute_idx = distribute_idx_tensor.data<int>();
    float *grad_max_feature = grad_max_feature_tensor.data<float>();
    cudaStream_t stream = THCState_getCurrentStream(state);
    featuregather_backward_cuda_launcher(b, n, m, c, grad_distribute_feature, distribute_idx, grad_max_feature, stream);
}

================================================ FILE: lib/pointops/src/featuredistribute/featuredistribute_cuda_kernel.cu ================================================

#include "../cuda_utils.h"
#include "featuredistribute_cuda_kernel.h"

// input: max_xyz(b, n, 3) xyz(b, m, 3)
// output: distribute_idx(b, m) -- index of the nearest max_xyz point for each
// xyz point; one thread per query point.
__global__ void featuredistribute_cuda_kernel(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    max_xyz += bs_idx * n * 3;
    xyz += bs_idx * m * 3 + pt_idx * 3;
    distribute_idx += bs_idx * m + pt_idx;

    float x = xyz[0];
    float y = xyz[1];
    float z = xyz[2];

    float min_dist2 = 100000;  // sentinel larger than any expected squared distance
    int min_dist_idx = -1;
    for (int k = 0; k < n; ++k) {
        float max_x = max_xyz[k * 3 + 0];
        float max_y = max_xyz[k * 3 + 1];
        float max_z = max_xyz[k * 3 + 2];
        float d2 = (max_x - x) * (max_x - x) + (max_y - y) * (max_y - y) + (max_z - z) * (max_z - z);
        if (d2 < min_dist2) {
            min_dist_idx = k;
            min_dist2 = d2;
        }
    }
    distribute_idx[0] = min_dist_idx;
}

void featuredistribute_cuda_launcher(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx, cudaStream_t stream) {
    // param max_xyz: (b, n, 3)
    // param xyz: (b, m, 3)
    // return distribute_idx: (b, m)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    featuredistribute_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, max_xyz, xyz, distribute_idx);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

// input: max_feature(b, c, n) distribute_idx(b, m)
// output: distribute_feature(b, c, m) -- gather one feature per (channel, point)
__global__ void featuregather_forward_cuda_kernel(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

    max_feature += bs_idx * c * n + c_idx * n;
    distribute_idx += bs_idx * m + pt_idx;
    distribute_feature += bs_idx * c * m + c_idx * m + pt_idx;

    int idx = distribute_idx[0];
    distribute_feature[0] = max_feature[idx];
}

void featuregather_forward_cuda_launcher(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature, cudaStream_t stream) {
    // param max_feature: (b, c, n)
    // param distribute_idx: (b, m)
    // return distribute_feature: (b, c, m)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    featuregather_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, c, max_feature, distribute_idx, distribute_feature);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

// input: grad_distribute_feature(b, c, m) distribute_idx(b, m)
// output: grad_max_feature(b, c, n) -- accumulated atomically, since several
// m-points may map to the same n-point.
__global__ void featuregather_backward_cuda_kernel(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

    grad_distribute_feature += bs_idx * c * m + c_idx * m + pt_idx;
    distribute_idx += bs_idx * m + pt_idx;
    grad_max_feature += bs_idx * c * n + c_idx * n;

    int idx = distribute_idx[0];
    atomicAdd(grad_max_feature + idx, grad_distribute_feature[0]);
}

void featuregather_backward_cuda_launcher(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature, cudaStream_t stream) {
    // param grad_distribute_feature: (b, c, m)
    // param distribute_idx: (b, m)
    // return grad_max_feature: (b, c, n)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    featuregather_backward_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, c, grad_distribute_feature, distribute_idx, grad_max_feature);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

================================================ FILE: lib/pointops/src/featuredistribute/featuredistribute_cuda_kernel.h ================================================

#ifndef _FEATUREDISTRIBUTE_CUDA_KERNEL
#define _FEATUREDISTRIBUTE_CUDA_KERNEL
// NOTE(review): the three include targets were stripped by the extraction;
// this is the set used by the sibling pointops headers -- confirm.
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>

void featuredistribute_cuda(int b, int n, int m, at::Tensor max_xyz_tensor, at::Tensor xyz_tensor, at::Tensor distribute_idx_tensor);
void featuregather_forward_cuda(int b, int n, int m, int c, at::Tensor max_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor distribute_feature_tensor);
void featuregather_backward_cuda(int b, int n, int m, int c, at::Tensor grad_distribute_feature_tensor, at::Tensor distribute_idx_tensor, at::Tensor grad_max_feature_tensor);

#ifdef __cplusplus
extern "C" {
#endif

void featuredistribute_cuda_launcher(int b, int n, int m, const float *max_xyz, const float *xyz, int *distribute_idx, cudaStream_t stream);
void featuregather_forward_cuda_launcher(int b, int n, int m, int c, const float *max_feature, const int *distribute_idx, float *distribute_feature, cudaStream_t stream);
void featuregather_backward_cuda_launcher(int b, int n, int m, int c, const float *grad_distribute_feature, const int *distribute_idx, float *grad_max_feature, cudaStream_t stream);

#ifdef __cplusplus
}
#endif
#endif

================================================ FILE: lib/pointops/src/grouping/grouping_cuda.cpp ================================================

// NOTE(review): the four stripped #include targets are reconstructed from the
// pattern of the other pointops .cpp files -- confirm.
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "grouping_cuda_kernel.h"

extern THCState *state;

// Gather features points(b, c, n) at idx(b, m, nsample) into out(b, c, m, nsample).
void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
    const float *points = points_tensor.data<float>();
    const int *idx = idx_tensor.data<int>();
    float *out = out_tensor.data<float>();
    grouping_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
}

// Scatter-add grad_out(b, c, m, nsample) back into grad_points(b, c, n).
void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor)
{
    float *grad_points = grad_points_tensor.data<float>();
    const int *idx = idx_tensor.data<int>();
    const float *grad_out = grad_out_tensor.data<float>();
    grouping_backward_cuda_launcher(b, c, n, m, nsample, grad_out, idx, grad_points);
}

// Same gather as grouping_forward_cuda, one-thread-per-output-element layout.
void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
    const float *points = points_tensor.data<float>();
    const int *idx = idx_tensor.data<int>();
    float *out = out_tensor.data<float>();
    grouping_forward_cuda_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
}

================================================ FILE: lib/pointops/src/grouping/grouping_cuda_kernel.cu ================================================

#include "../cuda_utils.h"
#include "grouping_cuda_kernel.h"

// input: points(b, c, n) idx(b, m, nsample)
// output: out(b, c, m, nsample)
// One block per batch; threads grid-stride over the (c, m) plane.
__global__ void grouping_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out) {
    int batch_index = blockIdx.x;
    points += batch_index * n * c;
    idx += batch_index * m * nsample;
    out += batch_index * m * nsample * c;

    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * m; i += stride) {
        const int l = i / m;
        const int j = i % m;
        for (int k = 0; k < nsample; ++k) {
            int ii = idx[j * nsample + k];
            out[(l * m + j) * nsample + k] = points[l * n + ii];
        }
    }
}

// input: grad_out(b, c, m, nsample), idx(b, m, nsample)
// output: grad_points(b, c, n)
__global__ void grouping_backward_cuda_kernel(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points) {
    int batch_index = blockIdx.x;
    grad_out += batch_index * m * nsample * c;
    idx += batch_index * m * nsample;
    grad_points += batch_index * n * c;

    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * m; i += stride) {
        const int l = i / m;
        const int j = i % m;
        for (int k = 0; k < nsample; ++k) {
            int ii = idx[j * nsample + k];
            // atomic: several samples may reference the same source point
            atomicAdd(grad_points + l * n + ii, grad_out[(l * m + j) * nsample + k]);
        }
    }
}

// NOTE(review): the launch configurations were stripped by the extraction;
// <<<b, opt_block_config(m, c)>>> follows the pointnet2 lineage these kernels
// come from (one block per batch, 2-D thread block over (m, c)) -- confirm
// against cuda_utils.h.
void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out) {
    grouping_forward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, points, idx, out);
}

void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points) {
    grouping_backward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, grad_out, idx, grad_points);
}

// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
__global__ void grouping_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int pt_idx = index / nsample;
    if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

    int sample_idx = index % nsample;
    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    out[out_idx] = points[in_idx];
}

// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out) {
    cudaError_t err;
    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    grouping_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

================================================ FILE: lib/pointops/src/grouping/grouping_cuda_kernel.h ================================================

#ifndef _GROUPING_CUDA_KERNEL
#define _GROUPING_CUDA_KERNEL
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>

void grouping_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
void grouping_backward_cuda(int b, int c, int n, int m, int nsample, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
void grouping_forward_cuda_fast(int b, int c, int n, int npoints, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
#ifdef __cplusplus
extern "C" {
#endif

void grouping_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *points, const int *idx, float *out);
void grouping_backward_cuda_launcher(int b, int c, int n, int m, int nsample, const float *grad_out, const int *idx, float *grad_points);
void grouping_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out);

#ifdef __cplusplus
}
#endif
#endif

================================================ FILE: lib/pointops/src/grouping_int/grouping_int_cuda.cpp ================================================

// NOTE(review): every "<...>" token in this chunk (include targets, template
// args, kernel launch configs) was stripped by the text extraction; they are
// reconstructed here -- confirm against upstream PointWeb.
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "grouping_int_cuda_kernel.h"

extern THCState *state;

// Integer (int64) variant of grouping: gather points(b, c, n) at
// idx(b, m, nsample) into out(b, c, m, nsample).
void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
    const long int *points = points_tensor.data<long>();
    const int *idx = idx_tensor.data<int>();
    long int *out = out_tensor.data<long>();
    grouping_int_forward_cuda_launcher(b, c, n, m, nsample, points, idx, out);
}

void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor)
{
    const long int *points = points_tensor.data<long>();
    const int *idx = idx_tensor.data<int>();
    long int *out = out_tensor.data<long>();
    grouping_int_forward_cuda_launcher_fast(b, c, n, m, nsample, points, idx, out);
}

================================================ FILE: lib/pointops/src/grouping_int/grouping_int_cuda_kernel.cu ================================================

#include "../cuda_utils.h"
#include "grouping_int_cuda_kernel.h"

// input: points(b, c, n) idx(b, m, nsample)
// output: out(b, c, m, nsample)
__global__ void grouping_int_forward_cuda_kernel(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out) {
    int batch_index = blockIdx.x;
    points += batch_index * n * c;
    idx += batch_index * m * nsample;
    out += batch_index * m * nsample * c;

    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * m; i += stride) {
        const int l = i / m;
        const int j = i % m;
        for (int k = 0; k < nsample; ++k) {
            int ii = idx[j * nsample + k];
            out[(l * m + j) * nsample + k] = points[l * n + ii];
        }
    }
}

// NOTE(review): launch config stripped by extraction; reconstructed to match
// the float grouping launcher -- confirm against cuda_utils.h.
void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out) {
    grouping_int_forward_cuda_kernel<<<b, opt_block_config(m, c)>>>(b, c, n, m, nsample, points, idx, out);
}

__global__ void grouping_int_forward_cuda_kernel_fast(int b, int c, int n, int npoints, int nsample, const long int *__restrict__ points, const int *__restrict__ idx, long int *__restrict__ out) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int pt_idx = index / nsample;
    if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;

    int sample_idx = index % nsample;
    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
    out[out_idx] = points[in_idx];
}

void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out) {
    cudaError_t err;
    dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    grouping_int_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

================================================ FILE: lib/pointops/src/grouping_int/grouping_int_cuda_kernel.h ================================================

#ifndef _GROUPING_INT_CUDA_KERNEL
#define _GROUPING_INT_CUDA_KERNEL
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>

void grouping_int_forward_cuda(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out);
void grouping_int_forward_cuda_fast(int b, int c, int n, int m, int nsample, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);

#ifdef __cplusplus
extern "C" {
#endif

void grouping_int_forward_cuda_launcher(int b, int c, int n, int m, int nsample, const long int *points, const int *idx, long int *out);
void grouping_int_forward_cuda_launcher_fast(int b, int c, int n, int npoints, int nsample, const long int *points, const int *idx, long int *out);

#ifdef __cplusplus
}
#endif
#endif

================================================ FILE: lib/pointops/src/interpolation/interpolation_cuda.cpp ================================================

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "interpolation_cuda_kernel.h"

extern THCState *state;

// Three nearest neighbours of each unknown(b, n, 3) point among known(b, m, 3).
void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor)
{
    const float *unknown = unknown_tensor.data<float>();
    const float *known = known_tensor.data<float>();
    float *dist2 = dist2_tensor.data<float>();
    int *idx = idx_tensor.data<int>();
    nearestneighbor_cuda_launcher(b, n, m, unknown, known, dist2, idx);
}

// Weighted 3-NN interpolation: out(b, c, n) from points(b, c, m).
void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor)
{
    const float *points = points_tensor.data<float>();
    const float *weight = weight_tensor.data<float>();
    float *out = out_tensor.data<float>();
    const int *idx = idx_tensor.data<int>();
    interpolation_forward_cuda_launcher(b, c, m, n, points, idx, weight, out);
}

void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor)
{
    const float *grad_out = grad_out_tensor.data<float>();
    const float *weight = weight_tensor.data<float>();
    float *grad_points = grad_points_tensor.data<float>();
    const int *idx = idx_tensor.data<int>();
    interpolation_backward_cuda_launcher(b, c, n, m, grad_out, idx, weight, grad_points);
}

void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor)
{
    const float *unknown = unknown_tensor.data<float>();
    const float *known = known_tensor.data<float>();
    float *dist2 = dist2_tensor.data<float>();
    int *idx = idx_tensor.data<int>();
    nearestneighbor_cuda_launcher_fast(b, n, m, unknown, known, dist2, idx);
}

void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor)
{
    const float *points = points_tensor.data<float>();
    const float *weight = weight_tensor.data<float>();
    float *out = out_tensor.data<float>();
    const int *idx = idx_tensor.data<int>();
    interpolation_forward_cuda_launcher_fast(b, c, m, n, points, idx, weight, out);
}

================================================ FILE: lib/pointops/src/interpolation/interpolation_cuda_kernel.cu ================================================

#include "../cuda_utils.h"
#include "interpolation_cuda_kernel.h"

// input: unknown(b, n, 3) known(b, m, 3)
// output: dist2(b, n, 3), idx(b, n, 3) -- the three smallest squared
// distances and their indices, kept sorted ascending.
__global__ void nearestneighbor_cuda_kernel(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx) {
    int batch_index = blockIdx.x;
    unknown += batch_index * n * 3;
    known += batch_index * m * 3;
    dist2 += batch_index * n * 3;
    idx += batch_index * n * 3;

    int index = threadIdx.x;
    int stride = blockDim.x;
    for (int j = index; j < n; j += stride) {
        float ux = unknown[j * 3 + 0];
        float uy = unknown[j * 3 + 1];
        float uz = unknown[j * 3 + 2];

        double best1 = 1e40, best2 = 1e40, best3 = 1e40;
        int besti1 = 0, besti2 = 0, besti3 = 0;
        for (int k = 0; k < m; ++k) {
            float x = known[k * 3 + 0];
            float y = known[k * 3 + 1];
            float z = known[k * 3 + 2];
            float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
            if (d < best1) {
                best3 = best2; besti3 = besti2;
                best2 = best1; besti2 = besti1;
                best1 = d; besti1 = k;
            } else if (d < best2) {
                best3 = best2; besti3 = besti2;
                best2 = d; besti2 = k;
            } else if (d < best3) {
                best3 = d; besti3 = k;
            }
        }
        dist2[j * 3 + 0] = best1;
        dist2[j * 3 + 1] = best2;
        dist2[j * 3 + 2] = best3;
        idx[j * 3 + 0] = besti1;
        idx[j * 3 + 1] = besti2;
        idx[j * 3 + 2] = besti3;
    }
}

// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
// output: out(b, c, n)
__global__ void interpolation_forward_cuda_kernel(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out) {
    int batch_index = blockIdx.x;
    points += batch_index * m * c;
    idx += batch_index * n * 3;
    weight += batch_index * n * 3;
    out += batch_index * n * c;

    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * n; i += stride) {
        const int l = i / n;
        const int j = i % n;
        float w1 = weight[j * 3 + 0];
        float w2 = weight[j * 3 + 1];
        float w3 = weight[j * 3 + 2];
        int i1 = idx[j * 3 + 0];
        int i2 = idx[j * 3 + 1];
        int i3 = idx[j * 3 + 2];
        out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + points[l * m + i3] * w3;
    }
}

// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3)
// output: grad_points(b, c, m)
__global__ void interpolation_backward_cuda_kernel(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points) {
    int batch_index = blockIdx.x;
    grad_out += batch_index * n * c;
    idx += batch_index * n * 3;
    weight += batch_index * n * 3;
    grad_points += batch_index * m * c;

    const int index = threadIdx.y * blockDim.x + threadIdx.x;
    const int stride = blockDim.y * blockDim.x;
    for (int i = index; i < c * n; i += stride) {
        const int l = i / n;
        const int j = i % n;
        float w1 = weight[j * 3 + 0];
        float w2 = weight[j * 3 + 1];
        float w3 = weight[j * 3 + 2];
        int i1 = idx[j * 3 + 0];
        int i2 = idx[j * 3 + 1];
        int i3 = idx[j * 3 + 2];
        atomicAdd(grad_points + l * m + i1, grad_out[i] * w1);
        atomicAdd(grad_points + l * m + i2, grad_out[i] * w2);
        atomicAdd(grad_points + l * m + i3, grad_out[i] * w3);
    }
}

// NOTE(review): launch configs stripped by the extraction; reconstructed from
// the pointnet2 lineage (opt_n_threads / opt_block_config in cuda_utils.h) --
// confirm.
void nearestneighbor_cuda_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx) {
    nearestneighbor_cuda_kernel<<<b, opt_n_threads(n)>>>(b, n, m, unknown, known, dist2, idx);
}

void interpolation_forward_cuda_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out) {
    interpolation_forward_cuda_kernel<<<b, opt_block_config(n, c)>>>(b, c, m, n, points, idx, weight, out);
}

// NOTE(review): upstream declared this launcher as (int b, int n, int c, int m)
// with n/c swapped relative to the header and the kernel; since the caller
// passes (b, c, n, m) positionally the behaviour was correct, but the names
// were misleading. Renamed to match the header -- no behavioural change.
void interpolation_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points) {
    interpolation_backward_cuda_kernel<<<b, opt_block_config(n, c)>>>(b, c, n, m, grad_out, idx, weight, grad_points);
}

// input: unknown(b, n, 3) known(b, m, 3)
// output: dist2(b, n, 3), idx(b, n, 3)
__global__ void nearestneighbor_cuda_kernel_fast(int b, int n, int m, const float *__restrict__ unknown, const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= n) return;

    unknown += bs_idx * n * 3 + pt_idx * 3;
    known += bs_idx * m * 3;
    dist2 += bs_idx * n * 3 + pt_idx * 3;
    idx += bs_idx * n * 3 + pt_idx * 3;

    float ux = unknown[0];
    float uy = unknown[1];
    float uz = unknown[2];

    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
    int besti1 = 0, besti2 = 0, besti3 = 0;
    for (int k = 0; k < m; ++k) {
        float x = known[k * 3 + 0];
        float y = known[k * 3 + 1];
        float z = known[k * 3 + 2];
        float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
        if (d < best1) {
            best3 = best2; besti3 = besti2;
            best2 = best1; besti2 = besti1;
            best1 = d; besti1 = k;
        } else if (d < best2) {
            best3 = best2; besti3 = besti2;
            best2 = d; besti2 = k;
        } else if (d < best3) {
            best3 = d; besti3 = k;
        }
    }
    dist2[0] = best1;
    dist2[1] = best2;
    dist2[2] = best3;
    idx[0] = besti1;
    idx[1] = besti2;
    idx[2] = besti3;
}

// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
// output: out(b, c, n)
__global__ void interpolation_forward_cuda_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points, const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

    weight += bs_idx * n * 3 + pt_idx * 3;
    points += bs_idx * c * m + c_idx * m;
    idx += bs_idx * n * 3 + pt_idx * 3;
    out += bs_idx * c * n + c_idx * n;

    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
}

void nearestneighbor_cuda_launcher_fast(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx) {
    cudaError_t err;
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    nearestneighbor_cuda_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

void interpolation_forward_cuda_launcher_fast(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out) {
    cudaError_t err;
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    interpolation_forward_cuda_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

================================================ FILE: lib/pointops/src/interpolation/interpolation_cuda_kernel.h ================================================

#ifndef _INTERPOLATION_CUDA_KERNEL
#define _INTERPOLATION_CUDA_KERNEL
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>

void nearestneighbor_cuda(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
void interpolation_forward_cuda(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
void interpolation_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
void nearestneighbor_cuda_fast(int b, int n, int m, at::Tensor unknown_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
void interpolation_forward_cuda_fast(int b, int c, int m, int n, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);

#ifdef __cplusplus
extern "C" {
#endif

void nearestneighbor_cuda_launcher(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
void interpolation_forward_cuda_launcher(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);
void interpolation_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points);
void nearestneighbor_cuda_launcher_fast(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx);
void interpolation_forward_cuda_launcher_fast(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out);

#ifdef __cplusplus
}
#endif
#endif

================================================ FILE: lib/pointops/src/knnquery/__init__.py ================================================

================================================ FILE: lib/pointops/src/knnquery/knnquery_cuda.cpp ================================================

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "knnquery_cuda_kernel.h"

extern THCState *state;

#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)

// k-nearest-neighbour query: for each new_xyz(b, m, 3) point, find its
// nsample nearest points in xyz(b, n, 3).
void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
    CHECK_INPUT(new_xyz_tensor);
    CHECK_INPUT(xyz_tensor);
    const float *new_xyz = new_xyz_tensor.data<float>();
    const float *xyz = xyz_tensor.data<float>();
    int *idx = idx_tensor.data<int>();
    float *dist2 = dist2_tensor.data<float>();
    cudaStream_t stream = THCState_getCurrentStream(state);
    knnquery_cuda_launcher(b, n, m, nsample, xyz, new_xyz, idx, dist2, stream);
}

================================================ FILE: lib/pointops/src/knnquery/knnquery_cuda_kernel.cu ================================================

#include "../cuda_utils.h"
#include "knnquery_cuda_kernel.h"

// input: xyz (b, n, 3) new_xyz (b, m, 3)
// output: idx (b, m, nsample) dist2 (b, m, nsample)
// One thread per query point; keeps an insertion-sorted best-k list in
// fixed-size local arrays, so nsample must not exceed 200.
__global__ void knnquery_cuda_kernel(int b, int n, int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, int *__restrict__ idx, float *__restrict__ dist2) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;

    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    //double* best = new double[nsample];
    //int* besti = new int[nsample];
    double best[200];
    int besti[200];
    for (int i = 0; i < nsample; i++) {
        best[i] = 1e40;
        besti[i] = 0;
    }
    for (int k = 0; k < n; k++) {
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        // insertion into the sorted best-k list
        for (int j = 0; j < nsample; j++) {
            if (d2 < best[j]) {
                for (int i = nsample - 1; i > j; i--) {
                    best[i] = best[i - 1];
                    besti[i] = besti[i - 1];
                }
                best[j] = d2;
                besti[j] = k;
                break;
            }
        }
    }
    for (int i = 0; i < nsample; i++) {
        idx[i] = besti[i];
        dist2[i] = best[i];
    }
    //delete []best;
    //delete []besti;
}

void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream) {
    // param new_xyz: (B, m, 3)
    // param xyz: (B, n, 3)
    // param idx: (B, m, nsample)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    knnquery_cuda_kernel<<<blocks, threads, 0, stream>>>(b, n, m, nsample, xyz, new_xyz, idx, dist2);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

================================================ FILE: lib/pointops/src/knnquery/knnquery_cuda_kernel.h ================================================

#ifndef _KNNQUERY_CUDA_KERNEL
#define _KNNQUERY_CUDA_KERNEL
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>

void knnquery_cuda(int b, int n, int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);

#ifdef __cplusplus
extern "C" {
#endif

void knnquery_cuda_launcher(int b, int n, int m, int nsample, const float *xyz, const float *new_xyz, int *idx, float *dist2, cudaStream_t stream);

#ifdef __cplusplus
}
#endif
#endif

================================================ FILE: lib/pointops/src/labelstat/labelstat_cuda.cpp ================================================

#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <THC/THC.h>
#include "labelstat_cuda_kernel.h"

extern THCState *state;

#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)

// Sum per-class label statistics label_stat(b, n, nclass) over the neighbour
// sets given by idx(b, m, nsample) into new_label_stat(b, m, nclass).
void labelstat_idx_cuda_fast(int b, int n, int m, int nsample, int nclass, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor)
{
    CHECK_INPUT(label_stat_tensor);
    CHECK_INPUT(idx_tensor);
    const int *label_stat = label_stat_tensor.data<int>();
    const int *idx = idx_tensor.data<int>();
    int *new_label_stat = new_label_stat_tensor.data<int>();
    cudaStream_t stream = THCState_getCurrentStream(state);
    labelstat_idx_cuda_launcher_fast(b, n, m, nsample, nclass, label_stat, idx, new_label_stat, stream);
}

// Sum label statistics over every point inside a ball of the given radius.
void labelstat_ballrange_cuda_fast(int b, int n, int m, float radius, int nclass, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor new_label_stat_tensor)
{
    CHECK_INPUT(new_xyz_tensor);
    CHECK_INPUT(xyz_tensor);
    CHECK_INPUT(label_stat_tensor);
    const float *new_xyz = new_xyz_tensor.data<float>();
    const float *xyz = xyz_tensor.data<float>();
    const int *label_stat = label_stat_tensor.data<int>();
    int *new_label_stat = new_label_stat_tensor.data<int>();
    cudaStream_t stream = THCState_getCurrentStream(state);
    labelstat_ballrange_cuda_launcher_fast(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat, stream);
}

// Fused ball query + label statistics in a single kernel pass.
void labelstat_and_ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, int nclass, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor)
{
    CHECK_INPUT(new_xyz_tensor);
    CHECK_INPUT(xyz_tensor);
    CHECK_INPUT(label_stat_tensor);
    CHECK_INPUT(idx_tensor);
    const float *new_xyz = new_xyz_tensor.data<float>();
    const float *xyz = xyz_tensor.data<float>();
    const int *label_stat = label_stat_tensor.data<int>();
    int *idx = idx_tensor.data<int>();
    int *new_label_stat = new_label_stat_tensor.data<int>();
    cudaStream_t stream = THCState_getCurrentStream(state);
    labelstat_and_ballquery_cuda_launcher_fast(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat, stream);
}

================================================ FILE: lib/pointops/src/labelstat/labelstat_cuda_kernel.cu ================================================

#include "../cuda_utils.h"
#include "labelstat_cuda_kernel.h"

// input: new_xyz(b, m, 3) xyz(b, n, 3) label_stat(b, n, nclass)
// output: idx(b, m, nsample) new_label_stat(b, m, nclass)
__global__ void labelstat_and_ballquery_cuda_kernel_fast(int b, int n, int m, float radius, int nsample, int nclass, const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;
    label_stat += bs_idx * n * nclass;
    new_label_stat += bs_idx * m * nclass + pt_idx * nclass;

    for (int i = 0; i < nclass; i++) {
        new_label_stat[i] = 0;
    }

    float radius2 = radius * radius;
    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        if (d2 < radius2) {
            for (int i = 0; i < nclass; i++) {
                new_label_stat[i] += label_stat[k * nclass + i];
            }
            if (cnt == 0) {
                // pre-fill with the first hit so unused slots are valid indices
                for (int l = 0; l < nsample; ++l) {
                    idx[l] = k;
                }
            }
            idx[cnt] = k;
            ++cnt;
            if (cnt >= nsample) {
                break;
            }
        }
    }
}

void labelstat_and_ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, int nclass, const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat, cudaStream_t stream) {
    // param new_xyz: (B, m, 3)
    // param xyz: (B, n, 3)
    // param idx: (B, m, nsample)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    labelstat_and_ballquery_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nsample, nclass, new_xyz, xyz, label_stat, idx, new_label_stat);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

// input: new_xyz(b, m, 3) xyz(b, n, 3) label_stat(b, n, nclass)
// output: new_label_stat(b, m, nclass)
__global__ void labelstat_ballrange_cuda_kernel_fast(int b, int n, int m, float radius, int nclass, const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    label_stat += bs_idx * n * nclass;
    new_label_stat += bs_idx * m * nclass + pt_idx * nclass;

    for (int i = 0; i < nclass; i++) {
        new_label_stat[i] = 0;
    }

    float radius2 = radius * radius;
    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    for (int k = 0; k < n; ++k) {
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        if (d2 < radius2) {
            for (int i = 0; i < nclass; i++) {
                new_label_stat[i] += label_stat[k * nclass + i];
            }
        }
    }
}

void labelstat_ballrange_cuda_launcher_fast(int b, int n, int m, float radius, int nclass, const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat, cudaStream_t stream) {
    // param new_xyz: (B, m, 3)
    // param xyz: (B, n, 3)
    // param idx: (B, m, nsample)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    labelstat_ballrange_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, radius, nclass, new_xyz, xyz, label_stat, new_label_stat);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

// input: idx(b, m, nsample) label_stat(b, n, nclass)
// output: new_label_stat(b, m, nclass)
__global__ void labelstat_idx_cuda_kernel_fast(int b, int n, int m, int nsample, int nclass, const int *label_stat, const int *idx, int *new_label_stat) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    idx += bs_idx * m * nsample + pt_idx * nsample;
    label_stat += bs_idx * n * nclass;
    new_label_stat += bs_idx * m * nclass + pt_idx * nclass;

    for (int i = 0; i < nclass; i++) {
        new_label_stat[i] = 0;
    }
    for (int k = 0; k < nsample; k++) {
        const int *label_stat_k = label_stat + idx[k] * nclass;
        for (int i = 0; i < nclass; i++) {
            new_label_stat[i] += label_stat_k[i];
        }
    }
}

void labelstat_idx_cuda_launcher_fast(int b, int n, int m, int nsample, int nclass, const int *label_stat, const int *idx, int *new_label_stat, cudaStream_t stream) {
    // param new_xyz: (B, m, 3)
    // param xyz: (B, n, 3)
    // param idx: (B, m, nsample)
    cudaError_t err;
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);

    labelstat_idx_cuda_kernel_fast<<<blocks, threads, 0, stream>>>(b, n, m, nsample, nclass, label_stat, idx, new_label_stat);
    // cudaDeviceSynchronize();  // for using printf in kernel function
    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

================================================ FILE: lib/pointops/src/labelstat/labelstat_cuda_kernel.h ================================================

#ifndef _LABELSTAT_CUDA_KERNEL
#define _LABELSTAT_CUDA_KERNEL
#include <torch/serialize/tensor.h>
#include <vector>
#include <THC/THC.h>

void labelstat_and_ballquery_cuda_fast(int b, int n, int m, float radius, int nsample, int nclass, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor idx_tensor, at::Tensor new_label_stat_tensor);
void labelstat_ballrange_cuda_fast(int b, int n, int m, float radius, int nclass, at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor label_stat_tensor, at::Tensor new_label_stat_tensor);
void labelstat_idx_cuda_fast(int b, int n, int m, int nsample, int nclass, at::Tensor label_stat_tensor, at::Tensor idx_tensor,
at::Tensor new_label_stat_tensor); #ifdef __cplusplus extern "C" { #endif void labelstat_and_ballquery_cuda_launcher_fast(int b, int n, int m, float radius, int nsample, int nclass, \ const float *new_xyz, const float *xyz, const int *label_stat, int *idx, int *new_label_stat, cudaStream_t stream); void labelstat_ballrange_cuda_launcher_fast(int b, int n, int m, float radius, int nclass, \ const float *new_xyz, const float *xyz, const int *label_stat, int *new_label_stat, cudaStream_t stream); void labelstat_idx_cuda_launcher_fast(int b, int n, int m, int nsample, int nclass, \ const int *label_stat, const int *idx, int *new_label_stat, cudaStream_t stream); #ifdef __cplusplus } #endif #endif ================================================ FILE: lib/pointops/src/pointops_api.cpp ================================================ #include #include #include "ballquery/ballquery_cuda_kernel.h" #include "grouping/grouping_cuda_kernel.h" #include "grouping_int/grouping_int_cuda_kernel.h" #include "sampling/sampling_cuda_kernel.h" #include "interpolation/interpolation_cuda_kernel.h" #include "knnquery/knnquery_cuda_kernel.h" #include "labelstat/labelstat_cuda_kernel.h" #include "featuredistribute/featuredistribute_cuda_kernel.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ballquery_cuda", &ballquery_cuda_fast, "ballquery_cuda_fast"); // name in python, cpp function address, docs m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda"); m.def("grouping_forward_cuda", &grouping_forward_cuda_fast, "grouping_forward_cuda_fast"); m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); m.def("grouping_int_forward_cuda", &grouping_int_forward_cuda_fast, "grouping_int_forward_cuda_fast"); m.def("gathering_forward_cuda", &gathering_forward_cuda, "gathering_forward_cuda"); m.def("gathering_backward_cuda", &gathering_backward_cuda, "gathering_backward_cuda"); m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda"); 
m.def("nearestneighbor_cuda", &nearestneighbor_cuda_fast, "nearestneighbor_cuda_fast"); m.def("interpolation_forward_cuda", &interpolation_forward_cuda_fast, "interpolation_forward_cuda_fast"); m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); m.def("labelstat_idx_cuda", &labelstat_idx_cuda_fast, "labelstat_idx_cuda_fast"); m.def("labelstat_ballrange_cuda", &labelstat_ballrange_cuda_fast, "labelstat_ballrange_cuda_fast"); m.def("labelstat_and_ballquery_cuda", &labelstat_and_ballquery_cuda_fast, "labelstat_and_ballquery_cuda_fast"); m.def("featuredistribute_cuda", &featuredistribute_cuda, "featuredistribute_cuda"); m.def("featuregather_forward_cuda", &featuregather_forward_cuda, "featuregather_forward_cuda"); m.def("featuregather_backward_cuda", &featuregather_backward_cuda, "featuregather_backward_cuda"); } ================================================ FILE: lib/pointops/src/sampling/sampling_cuda.cpp ================================================ #include #include #include #include #include "sampling_cuda_kernel.h" extern THCState *state; void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) { const float *points = points_tensor.data(); const int *idx = idx_tensor.data(); float *out = out_tensor.data(); gathering_forward_cuda_launcher(b, c, n, m, points, idx, out); } void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) { const float *grad_out = grad_out_tensor.data(); const int *idx = idx_tensor.data(); float *grad_points = grad_points_tensor.data(); gathering_backward_cuda_launcher(b, c, n, m, grad_out, idx, grad_points); } void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) { const float *points = points_tensor.data(); float *temp = temp_tensor.data(); int *idx = idx_tensor.data(); 
furthestsampling_cuda_launcher(b, n, m, points, temp, idx); } ================================================ FILE: lib/pointops/src/sampling/sampling_cuda_kernel.cu ================================================ #include "../cuda_utils.h" #include "sampling_cuda_kernel.h" // input: points(b, c, n) idx(b, m) // output: out(b, c, m) __global__ void gathering_forward_cuda_kernel(int b, int c, int n, int m, const float *points, const int *idx, float *out) { for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int l = blockIdx.y; l < c; l += gridDim.y) { for (int j = threadIdx.x; j < m; j += blockDim.x) { int a = idx[i * m + j]; out[(i * c + l) * m + j] = points[(i * c + l) * n + a]; } } } } // input: grad_out(b, c, m) idx(b, m) // output: grad_points(b, c, n) __global__ void gathering_backward_cuda_kernel(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points) { for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int l = blockIdx.y; l < c; l += gridDim.y) { for (int j = threadIdx.x; j < m; j += blockDim.x) { int a = idx[i * m + j]; atomicAdd(grad_points + (i * c + l) * n + a, grad_out[(i * c + l) * m + j]); } } } } void gathering_forward_cuda_launcher(int b, int c, int n, int m, const float *points, const int *idx, float *out) { gathering_forward_cuda_kernel<<>>(b, c, n, m, points, idx, out); } void gathering_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points) { gathering_backward_cuda_kernel<<>>(b, c, n, m, grad_out, idx, grad_points); } __device__ void __update(float *dists, int *dists_i, int idx1, int idx2) { const float v1 = dists[idx1], v2 = dists[idx2]; const int i1 = dists_i[idx1], i2 = dists_i[idx2]; dists[idx1] = max(v1, v2); dists_i[idx1] = v2 > v1 ? 
i2 : i1; } // Input dataset: (b, n, 3), tmp: (b, n) // Ouput idxs (b, m) template __global__ void furthestsampling_cuda_kernel(int b, int n, int m, const float *dataset, float *temp, int *idxs) { if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * 3; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; float x1 = dataset[old * 3 + 0]; float y1 = dataset[old * 3 + 1]; float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { float x2, y2, z2; x2 = dataset[k * 3 + 0]; y2 = dataset[k * 3 + 1]; z2 = dataset[k * 3 + 2]; //float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); //if (mag <= 1e-3) // continue; float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); if (block_size >= 1024) { if (tid < 512) { __update(dists, dists_i, tid, tid + 512); } __syncthreads(); } if (block_size >= 512) { if (tid < 256) { __update(dists, dists_i, tid, tid + 256); } __syncthreads(); } if (block_size >= 256) { if (tid < 128) { __update(dists, dists_i, tid, tid + 128); } __syncthreads(); } if (block_size >= 128) { if (tid < 64) { __update(dists, dists_i, tid, tid + 64); } __syncthreads(); } if (block_size >= 64) { if (tid < 32) { __update(dists, dists_i, tid, tid + 32); } __syncthreads(); } if (block_size >= 32) { if (tid < 16) { __update(dists, dists_i, tid, tid + 16); } __syncthreads(); } if (block_size >= 16) { if (tid < 8) { __update(dists, dists_i, tid, tid + 8); } __syncthreads(); } if (block_size >= 8) { if (tid < 4) { __update(dists, dists_i, tid, tid + 4); } __syncthreads(); } if (block_size >= 4) { if (tid < 2) { __update(dists, dists_i, tid, tid + 2); } __syncthreads(); } if (block_size >= 2) { if (tid < 1) { __update(dists, dists_i, tid, tid + 1); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } void furthestsampling_cuda_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs) { unsigned int n_threads = opt_n_threads(n); switch (n_threads) { case 1024: furthestsampling_cuda_kernel<1024><<>>(b, n, m, dataset, temp, idxs); break; case 512: furthestsampling_cuda_kernel<512><<>>(b, n, m, dataset, temp, idxs); break; case 256: furthestsampling_cuda_kernel<256><<>>(b, n, m, dataset, temp, idxs); break; case 128: furthestsampling_cuda_kernel<128><<>>(b, n, m, dataset, temp, idxs); break; case 64: furthestsampling_cuda_kernel<64><<>>(b, n, m, dataset, temp, idxs); break; case 32: furthestsampling_cuda_kernel<32><<>>(b, n, m, dataset, temp, idxs); break; case 16: furthestsampling_cuda_kernel<16><<>>(b, n, m, dataset, temp, idxs); break; case 8: furthestsampling_cuda_kernel<8><<>>(b, n, m, dataset, temp, idxs); break; case 4: 
furthestsampling_cuda_kernel<4><<>>(b, n, m, dataset, temp, idxs); break; case 2: furthestsampling_cuda_kernel<2><<>>(b, n, m, dataset, temp, idxs); break; case 1: furthestsampling_cuda_kernel<1><<>>(b, n, m, dataset, temp, idxs); break; default: furthestsampling_cuda_kernel<512><<>>(b, n, m, dataset, temp, idxs); } } ================================================ FILE: lib/pointops/src/sampling/sampling_cuda_kernel.h ================================================ #ifndef _SAMPLING_CUDA_KERNEL #define _SAMPLING_CUDA_KERNEL #include #include #include void gathering_forward_cuda(int b, int c, int n, int m, at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor); void gathering_backward_cuda(int b, int c, int n, int m, at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor); void furthestsampling_cuda(int b, int n, int m, at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor); #ifdef __cplusplus extern "C" { #endif void gathering_forward_cuda_launcher(int b, int c, int n, int m, const float *points, const int *idx, float *out); void gathering_backward_cuda_launcher(int b, int c, int n, int m, const float *grad_out, const int *idx, float *grad_points); void furthestsampling_cuda_launcher(int b, int n, int m, const float *dataset, float *temp, int *idxs); #ifdef __cplusplus } #endif #endif ================================================ FILE: lib/sync_bn/__init__.py ================================================ # -*- coding: utf-8 -*- # File : __init__.py # Author : Jiayuan Mao # Email : maojiayuan@gmail.com # Date : 27/01/2018 # # This file is part of Synchronized-BatchNorm-PyTorch. # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # Distributed under MIT License. 
from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
from .replicate import DataParallelWithCallback, patch_replication_callback

================================================ FILE: lib/sync_bn/batchnorm.py ================================================
# -*- coding: utf-8 -*-
# File   : batchnorm.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.

import collections

import torch
import torch.nn.functional as F

from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast

from .comm import SyncMaster

__all__ = ['SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d']


def _sum_ft(tensor):
    """Sum over the first and last dimension."""
    return tensor.sum(dim=0).sum(dim=-1)


def _unsqueeze_ft(tensor):
    """Add new dimensions at the front and the tail."""
    return tensor.unsqueeze(0).unsqueeze(-1)


# Message sent from each replica to the master: per-replica sum, squared sum
# and element count.
_ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size'])
# Message broadcast back from the master: global mean and inverse std.
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])


class _SynchronizedBatchNorm(_BatchNorm):
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(_SynchronizedBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)

        # Master/slave plumbing is wired up lazily by __data_parallel_replicate__.
        self._sync_master = SyncMaster(self._data_parallel_master)

        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None

    def forward(self, input):
        # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
        if not (self._is_parallel and self.training):
            return F.batch_norm(
                input, self.running_mean, self.running_var, self.weight, self.bias,
                self.training, self.momentum, self.eps)

        # Resize the input to (B, C, -1).
        input_shape = input.size()
        input = input.view(input.size(0), self.num_features, -1)

        # Compute the sum and square-sum.
        sum_size = input.size(0) * input.size(2)
        input_sum = _sum_ft(input)
        input_ssum = _sum_ft(input ** 2)

        # Reduce-and-broadcast the statistics.
        if self._parallel_id == 0:
            mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size))
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size))

        # Compute the output.
        if self.affine:
            # MJY:: Fuse the multiplication for speed.
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias)
        else:
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std)

        # Reshape it.
        return output.view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
        if self._parallel_id == 0:
            ctx.sync_master = self._sync_master
        else:
            self._slave_pipe = ctx.sync_master.register_slave(copy_id)

    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)
        intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        # Pair each (copy_id, message): broadcasted holds [mean, inv_std] per GPU.
        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append((rec[0], _MasterMessage(*broadcasted[i*2:i*2+2])))

        return outputs

    def _compute_mean_std(self, sum_, ssum, size):
        """Compute the mean and standard-deviation with sum and square-sum. This method
        also maintains the moving average on the master device."""
        assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.'
        mean = sum_ / size
        sumvar = ssum - sum_ * mean
        # Unbiased variance updates the running stats; biased variance normalizes.
        unbias_var = sumvar / (size - 1)
        bias_var = sumvar / size

        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

        # Return the mean and the inverse standard deviation (clamped by eps).
        return mean, bias_var.clamp(self.eps) ** -0.5


class SynchronizedBatchNorm1d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 2d or 3d input that is seen as a
    mini-batch.

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm1d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalize the tensor on each device using
    the statistics only on that device, which accelerated the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for one-GPU or CPU-only case, this module behaves exactly same
    as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm

    Args:
        num_features: num_features from an expected input of size
            `batch_size x num_features [x width]`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm1d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100))
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError('expected 2D or 3D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm1d, self)._check_input_dim(input)


class SynchronizedBatchNorm2d(_SynchronizedBatchNorm):
    r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch
    of 3d inputs

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm2d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalize the tensor on each device using
    the statistics only on that device, which accelerated the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for one-GPU or CPU-only case, this module behaves exactly same
    as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm2d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45))
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm2d, self)._check_input_dim(input)


class SynchronizedBatchNorm3d(_SynchronizedBatchNorm):
    r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch
    of 4d inputs

    .. math::

        y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta

    This module differs from the built-in PyTorch BatchNorm3d as the mean and
    standard-deviation are reduced across all devices during training.

    For example, when one uses `nn.DataParallel` to wrap the network during
    training, PyTorch's implementation normalize the tensor on each device using
    the statistics only on that device, which accelerated the computation and
    is also easy to implement, but the statistics might be inaccurate.
    Instead, in this synchronized version, the statistics will be computed
    over all training samples distributed on multiple devices.

    Note that, for one-GPU or CPU-only case, this module behaves exactly same
    as the built-in PyTorch implementation.

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running sum is kept with a default momentum of 0.1.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric BatchNorm
    or Spatio-temporal BatchNorm

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x depth x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.1
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples:
        >>> # With Learnable Parameters
        >>> m = SynchronizedBatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = SynchronizedBatchNorm3d(100, affine=False)
        >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10))
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError('expected 5D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm3d, self)._check_input_dim(input)

================================================ FILE: lib/sync_bn/comm.py ================================================
# -*- coding: utf-8 -*-
# File   : comm.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.

import queue
import collections
import threading

__all__ = ['FutureResult', 'SlavePipe', 'SyncMaster']


class FutureResult(object):
    """A thread-safe future implementation. Used only as one-to-one pipe."""

    def __init__(self):
        self._result = None
        self._lock = threading.Lock()
        self._cond = threading.Condition(self._lock)

    def put(self, result):
        with self._lock:
            assert self._result is None, 'Previous result has\'t been fetched.'
            self._result = result
            self._cond.notify()

    def get(self):
        with self._lock:
            # Block until put() has delivered a value, then consume it.
            if self._result is None:
                self._cond.wait()

            res = self._result
            self._result = None
            return res


_MasterRegistry = collections.namedtuple('MasterRegistry', ['result'])
_SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result'])


class SlavePipe(_SlavePipeBase):
    """Pipe for master-slave communication."""

    def run_slave(self, msg):
        # Send this replica's message, wait for the master's reply, then signal
        # completion back on the shared queue (master counts these True marks).
        self.queue.put((self.identifier, msg))
        ret = self.result.get()
        self.queue.put(True)
        return ret


class SyncMaster(object):
    """An abstract `SyncMaster` object.

    - During the replication, as the data parallel will trigger an callback of each module, all slave devices should
    call `register(id)` and obtain an `SlavePipe` to communicate with the master.
    - During the forward pass, master device invokes `run_master`, all messages from slave devices will be collected,
    and passed to a registered callback.
    - After receiving the messages, the master device should gather the information and determine to message passed
    back to each slave devices.
    """

    def __init__(self, master_callback):
        """
        Args:
            master_callback: a callback to be invoked after having collected messages from slave devices.
        """
        self._master_callback = master_callback
        self._queue = queue.Queue()
        self._registry = collections.OrderedDict()
        self._activated = False

    def __getstate__(self):
        # Only the callback is pickled; queues/locks are rebuilt on unpickle.
        return {'master_callback': self._master_callback}

    def __setstate__(self, state):
        self.__init__(state['master_callback'])

    def register_slave(self, identifier):
        """
        Register an slave device.

        Args:
            identifier: an identifier, usually is the device id.

        Returns: a `SlavePipe` object which can be used to communicate with the master device.

        """
        if self._activated:
            # A new replication round resets the registry from the last round.
            assert self._queue.empty(), 'Queue is not clean before next initialization.'
            self._activated = False
            self._registry.clear()
        future = FutureResult()
        self._registry[identifier] = _MasterRegistry(future)
        return SlavePipe(identifier, self._queue, future)

    def run_master(self, master_msg):
        """
        Main entry for the master device in each forward pass.
        The messages were first collected from each devices (including the master device), and then
        an callback will be invoked to compute the message to be sent back to each devices
        (including the master device).

        Args:
            master_msg: the message that the master want to send to itself. This will be placed as the first
            message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example.

        Returns: the message to be sent back to the master device.

        """
        self._activated = True

        intermediates = [(0, master_msg)]
        for i in range(self.nr_slaves):
            intermediates.append(self._queue.get())

        results = self._master_callback(intermediates)
        assert results[0][0] == 0, 'The first result should belongs to the master.'

        # Deliver each slave's reply through its registered future.
        for i, res in results:
            if i == 0:
                continue
            self._registry[i].result.put(res)

        # Wait for every slave to acknowledge (the True marks from run_slave).
        for i in range(self.nr_slaves):
            assert self._queue.get() is True

        return results[0][1]

    @property
    def nr_slaves(self):
        return len(self._registry)

================================================ FILE: lib/sync_bn/replicate.py ================================================
# -*- coding: utf-8 -*-
# File   : replicate.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import functools

from torch.nn.parallel.data_parallel import DataParallel

__all__ = [
    'CallbackContext',
    'execute_replication_callbacks',
    'DataParallelWithCallback',
    'patch_replication_callback'
]


class CallbackContext(object):
    # Empty attribute bag shared between the replicas of one sub-module.
    pass


def execute_replication_callbacks(modules):
    """
    Execute an replication callback `__data_parallel_replicate__` on each module created by original replication.

    The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)`

    Note that, as all modules are isomorphism, we assign each sub-module with a context
    (shared among multiple copies of this module on different devices).
    Through this context, different copies can share some information.

    We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback
    of any slave copies.
    """
    master_copy = modules[0]
    nr_modules = len(list(master_copy.modules()))
    ctxs = [CallbackContext() for _ in range(nr_modules)]

    # modules[0] is the master copy; enumerate order guarantees it runs first.
    for i, module in enumerate(modules):
        for j, m in enumerate(module.modules()):
            if hasattr(m, '__data_parallel_replicate__'):
                m.__data_parallel_replicate__(ctxs[j], i)


class DataParallelWithCallback(DataParallel):
    """
    Data Parallel with a replication callback.

    An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by
    original `replicate` functions.
    The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)`

    Examples:
        > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
        > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1])
        # sync_bn.__data_parallel_replicate__ will be invoked.
    """

    def replicate(self, module, device_ids):
        modules = super(DataParallelWithCallback, self).replicate(module, device_ids)
        execute_replication_callbacks(modules)
        return modules


def patch_replication_callback(data_parallel):
    """
    Monkey-patch an existing `DataParallel` object. Add the replication callback.
    Useful when you have customized `DataParallel` implementation.

    Examples:
        > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
        > sync_bn = DataParallel(sync_bn, device_ids=[0, 1])
        > patch_replication_callback(sync_bn)
        # this is equivalent to
        > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
        > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1])
    """

    assert isinstance(data_parallel, DataParallel)

    old_replicate = data_parallel.replicate

    @functools.wraps(old_replicate)
    def new_replicate(module, device_ids):
        modules = old_replicate(module, device_ids)
        execute_replication_callbacks(modules)
        return modules

    data_parallel.replicate = new_replicate

================================================ FILE: lib/sync_bn/unittest.py ================================================
# -*- coding: utf-8 -*-
# File   : unittest.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import unittest

import numpy as np
from torch.autograd import Variable


def as_numpy(v):
    """Convert a torch Tensor (or legacy autograd Variable) to a numpy array.

    The tensor is moved to CPU first so CUDA tensors are handled too.
    """
    if isinstance(v, Variable):
        v = v.data
    return v.cpu().numpy()


class TorchTestCase(unittest.TestCase):
    """unittest.TestCase with a tensor-closeness assertion helper."""

    def assertTensorClose(self, a, b, atol=1e-3, rtol=1e-3):
        """Assert `a` and `b` are element-wise close within `atol`/`rtol`.

        Parameters
        ----------
        a, b : torch.Tensor or Variable
            Tensors to compare (converted via `as_numpy`).
        atol, rtol : float
            Absolute / relative tolerances passed to `np.allclose`.

        Bug fix: `rtol` was accepted but never forwarded to `np.allclose`,
        so numpy's default rtol (1e-5) was silently used instead of the
        requested value.
        """
        npa, npb = as_numpy(a), as_numpy(b)
        self.assertTrue(
            np.allclose(npa, npb, atol=atol, rtol=rtol),
            'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(
                a, b, np.abs(npa - npb).max(),
                np.abs((npa - npb) / np.fmax(npa, 1e-5)).max())
        )
# Segmentation network: 9 input channels per point (XYZ, RGB, and location
# normalized to the room extent in [0, 1]); STN3D transforms on the input and
# on the intermediate features are applied inside PointNetFeat.
class PointNetSeg(nn.Module):
    def __init__(self, c=9, k=13, sync_bn=False):
        super(PointNetSeg, self).__init__()
        # global_feat=False makes the extractor concatenate the 1024-d global
        # descriptor onto each 64-d per-point feature, hence 1088 channels in.
        self.feat = PointNetFeat(c, global_feat=False)
        self.conv1 = nn.Conv1d(1088, 512, 1)
        self.conv2 = nn.Conv1d(512, 256, 1)
        self.conv3 = nn.Conv1d(256, 128, 1)
        self.conv4 = nn.Conv1d(128, 128, 1)
        self.conv5 = nn.Conv1d(128, k, 1)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)
        self.bn4 = nn.BatchNorm1d(128)

    def forward(self, x):
        # (B, N, C) -> (B, C, N) so Conv1d sees channels on dim 1.
        out = self.feat(x.transpose(1, 2))
        # Four conv -> BN -> ReLU stages, then a raw-logit classifier conv.
        stages = ((self.conv1, self.bn1), (self.conv2, self.bn2),
                  (self.conv3, self.bn3), (self.conv4, self.bn4))
        for conv, bn in stages:
            out = F.relu(bn(conv(out)))
        return self.conv5(out)
class PointNet2FPModule(nn.Module):
    r"""Propigates the features of one set to another

    Parameters
    ----------
    mlp : list
        Pointnet module parameters
    bn : bool
        Use batchnorm
    """

    def __init__(self, *, mlp: List[int], bn: bool = True):
        super().__init__()
        # Shared 1x1-conv MLP applied to each interpolated point feature.
        self.mlp = pt_util.SharedMLP(mlp, bn=bn)

    def forward(self, unknown: torch.Tensor, known: torch.Tensor,
                unknow_feats: torch.Tensor, known_feats: torch.Tensor) -> torch.Tensor:
        r"""
        Parameters
        ----------
        unknown : torch.Tensor
            (B, n, 3) tensor of the xyz positions of the unknown features
        known : torch.Tensor
            (B, m, 3) tensor of the xyz positions of the known features
        unknow_feats : torch.Tensor
            (B, C1, n) tensor of the features to be propigated to
        known_feats : torch.Tensor
            (B, C2, m) tensor of features to be propigated

        Returns
        -------
        new_features : torch.Tensor
            (B, mlp[-1], n) tensor of the features of the unknown features
        """
        if known is not None:
            # Inverse-distance-weighted interpolation of known_feats onto the
            # `unknown` positions. `pointops.nearestneighbor` is an opaque
            # CUDA op; presumably it returns the k nearest `known` points per
            # `unknown` point (k=3 in the reference PointNet++ — TODO confirm).
            dist, idx = pointops.nearestneighbor(unknown, known)
            # 1e-8 guards against division by zero for coincident points.
            dist_recip = 1.0 / (dist + 1e-8)
            norm = torch.sum(dist_recip, dim=2, keepdim=True)
            weight = dist_recip / norm  # weights sum to 1 over the neighbors
            interpolated_feats = pointops.interpolation(known_feats, idx, weight)
        else:
            # No coordinates for the source set: broadcast the single known
            # feature vector to every unknown point.
            interpolated_feats = known_feats.expand(*known_feats.size()[0:2], unknown.size(1))

        if unknow_feats is not None:
            # Skip connection: concatenate the destination set's own features.
            new_features = torch.cat([interpolated_feats, unknow_feats], dim=1)  # (B, C2 + C1, n)
        else:
            new_features = interpolated_feats

        # SharedMLP expects a 4-D (B, C, n, 1) input; squeeze back afterwards.
        return self.mlp(new_features.unsqueeze(-1)).squeeze(-1)
class PointNet2SSGSeg(nn.Module):
    r"""
    PointNet2 with single-scale grouping
    Semantic segmentation network that uses feature propogation layers

    Parameters
    ----------
    k: int
        Number of semantics classes to predict over -- size of softmax classifier
        that run for each point
    c: int = 6
        Number of input channels in the feature descriptor for each point.
        If the point cloud is Nx9, this value should be 6 as in an Nx9 point
        cloud, 3 of the channels are xyz, and 6 are feature descriptors
    use_xyz: bool = True
        Whether or not to use the xyz position of a point as a feature
    """

    def __init__(self, c=3, k=13, use_xyz=True):
        super().__init__()
        # Encoder: 4 set-abstraction stages, each downsampling the cloud
        # (N -> 1024 -> 256 -> 64 -> 16 points) while widening channels.
        self.SA_modules = nn.ModuleList()
        self.SA_modules.append(PointNet2SAModule(npoint=1024, nsample=32, mlp=[c, 32, 32, 64], use_xyz=use_xyz))
        self.SA_modules.append(PointNet2SAModule(npoint=256, nsample=32, mlp=[64, 64, 64, 128], use_xyz=use_xyz))
        self.SA_modules.append(PointNet2SAModule(npoint=64, nsample=32, mlp=[128, 128, 128, 256], use_xyz=use_xyz))
        self.SA_modules.append(PointNet2SAModule(npoint=16, nsample=32, mlp=[256, 256, 256, 512], use_xyz=use_xyz))
        # Decoder: 4 feature-propagation stages, mirroring the encoder.
        # Each FP input width = (upper-level output) + (skip connection), e.g.
        # the deepest one fuses 512 (from SA4) with 256 (from SA3).
        self.FP_modules = nn.ModuleList()
        self.FP_modules.append(PointNet2FPModule(mlp=[128 + c, 128, 128, 128]))
        self.FP_modules.append(PointNet2FPModule(mlp=[256 + 64, 256, 128]))
        self.FP_modules.append(PointNet2FPModule(mlp=[256 + 128, 256, 256]))
        self.FP_modules.append(PointNet2FPModule(mlp=[512 + 256, 256, 256]))
        # Per-point classifier head producing raw logits (no softmax).
        self.FC_layer = nn.Sequential(pt_util.Conv2d(128, 128, bn=True), nn.Dropout(), pt_util.Conv2d(128, k, activation=None))

    def _break_up_pc(self, pc):
        # Split (B, N, 3 + C) into xyz (B, N, 3) and features (B, C, N);
        # features is None when only coordinates were given.
        xyz = pc[..., 0:3].contiguous()
        features = (pc[..., 3:].transpose(1, 2).contiguous() if pc.size(-1) > 3 else None)
        return xyz, features

    def forward(self, pointcloud: torch.cuda.FloatTensor):
        r"""
        Forward pass of the network

        Parameters
        ----------
        pointcloud: Variable(torch.cuda.FloatTensor)
            (B, N, 3 + input_channels) tensor
            Point cloud to run predicts on
            Each point in the point-cloud MUST
            be formated as (x, y, z, features...)
        """
        xyz, features = self._break_up_pc(pointcloud)

        # l_xyz[i] / l_features[i] hold the cloud at encoder depth i
        # (index 0 = full-resolution input).
        l_xyz, l_features = [xyz], [features]
        for i in range(len(self.SA_modules)):
            li_xyz, li_features = self.SA_modules[i](l_xyz[i], l_features[i])
            l_xyz.append(li_xyz)
            l_features.append(li_features)

        # Walk back up: deepest FP module first, writing the fused features
        # in place at the next-shallower level.
        for i in range(-1, -(len(self.FP_modules) + 1), -1):
            l_features[i - 1] = self.FP_modules[i](l_xyz[i - 1], l_xyz[i], l_features[i - 1], l_features[i])

        # return self.FC_layer(l_features[0])
        # The head is built from Conv2d blocks, hence the unsqueeze/squeeze.
        return self.FC_layer(l_features[0].unsqueeze(-1)).squeeze(-1)
def model_fn_decorator(criterion):
    """Build a single train/eval step closure around `criterion`.

    Parameters
    ----------
    criterion : callable
        Loss function applied as criterion(preds, labels).

    Returns
    -------
    model_fn(model, data, eval=False) -> ModelReturn(preds, loss, acc_dict)
        Runs one forward pass (gradients disabled when eval=True) and reports
        loss plus point-wise accuracy.
    """
    ModelReturn = namedtuple("ModelReturn", ['preds', 'loss', 'acc'])

    def model_fn(model, data, eval=False):
        with torch.set_grad_enabled(not eval):
            inputs, labels = data
            # Bug fix: `async` became a reserved keyword in Python 3.7, making
            # `inputs.cuda(async=True)` a SyntaxError; `non_blocking` is the
            # supported spelling of the same argument.
            inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            preds = model(inputs)
            loss = criterion(preds, labels)
            # Point-wise accuracy: argmax over the class dimension.
            _, classes = torch.max(preds, 1)
            acc = (classes == labels).float().sum() / labels.numel()
            return ModelReturn(preds, loss, {"acc": acc.item(), 'loss': loss.item()})

    return model_fn
mlp, use_softmax=False): r""" :param mlp: mlp for learning weight mode: transformation or aggregation """ super().__init__() self.mlp = mlp self.use_softmax = use_softmax def forward(self, feature: torch.Tensor) -> torch.Tensor: r""" Parameters ---------- features : torch.Tensor (B, C, N, M) or (B, C, N) Returns ------- new_features : torch.Tensor transformation: (B, C, N, M) or (B, C, N) aggregation: (B, C, N) or (B, C) """ B, C, N, M = feature.size() feature = feature.transpose(1, 2).contiguous().view(B * N, C, M, 1).repeat(1, 1, 1, M) # (BN, C, M, M) feature = feature - feature.transpose(2, 3).contiguous() + torch.mul(feature, torch.eye(M).view(1, 1, M, M).cuda()) # (BN, C, M, M) weight = self.mlp(feature) if self.use_softmax: weight = F.softmax(weight, -1) feature = (feature * weight).sum(-1).view(B, N, C, M).transpose(1, 2).contiguous() # (B, C, N, M) return feature class _PointWebSAModuleBase(nn.Module): def __init__(self): super().__init__() self.npoint = None self.grouper = None self.mlp = None self.afa = None def forward(self, xyz: torch.Tensor, features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): r""" Parameters ---------- xyz : torch.Tensor (B, N, 3) tensor of the xyz coordinates of the features features : torch.Tensor (B, C, N) tensor of the descriptors of the the features Returns ------- new_xyz : torch.Tensor (B, npoint, 3) tensor of the new features' xyz new_features : torch.Tensor (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors """ new_features_list = [] xyz_trans = xyz.transpose(1, 2).contiguous() new_xyz = pointops.gathering( xyz_trans, pointops.furthestsampling(xyz, self.npoint) ).transpose(1, 2).contiguous() if self.npoint is not None else None new_features = self.grouper(xyz, new_xyz, features) # (B, C, npoint, nsample) if new_features.shape[2] != 1: # for npoint is none new_features = new_features + self.afa(new_features) # (B, C, npoint, nsample) new_features = self.mlp(new_features) new_features = 
F.max_pool2d(new_features, kernel_size=[1, new_features.size(3)]).squeeze(-1) # (B, mlp[-1], npoint) new_features_list.append(new_features) return new_xyz, torch.cat(new_features_list, dim=1) class PointWebSAModule(_PointWebSAModuleBase): r"""Pointnet set abstrction layer with multiscale grouping Parameters ---------- npoint : int Number of features nsample : int32 Number of sample mlps : list of int32 Spec of the MLP before the global max_pool mlps2: list of list of int32 Spec of the MLP for AFA bn : bool Use batchnorm """ def __init__(self, *, npoint: int = None, nsample: int = None, mlp: List[int] = None, mlp2: List[int] = None, bn: bool = True, use_xyz: bool = True, use_bn = True): super().__init__() self.npoint = npoint self.grouper = pointops.QueryAndGroup(nsample=nsample, use_xyz=use_xyz) if npoint is not None else pointops.GroupAll(use_xyz) if use_xyz: mlp[0] += 3 if npoint is not None: mlp_tmp = pt_util.SharedMLP([mlp[0]] + mlp2, bn=use_bn) mlp_tmp.add_module('weight', (pt_util.SharedMLP([mlp2[-1], mlp[0]], bn=False, activation=None))) self.afa = _AFAModule(mlp=mlp_tmp) self.mlp = pt_util.SharedMLP(mlp, bn=bn) if __name__ == "__main__": torch.manual_seed(1) torch.cuda.manual_seed_all(1) c = 6 xyz = torch.randn(2, 8, 3, requires_grad=True).cuda() xyz_feats = torch.randn(2, 8, c, requires_grad=True).cuda() test_module = PointWebSAModule(npoint=2, nsample=6, mlp=[c, 32, 32], mlp2=[16, 16], use_bn=True) test_module.cuda() xyz_feats = xyz_feats.transpose(1, 2).contiguous() print(test_module) print(test_module(xyz, xyz_feats)) for _ in range(1): _, new_features = test_module(xyz, xyz_feats) new_features.backward(torch.cuda.FloatTensor(*new_features.size()).fill_(1)) print(new_features) print(xyz.grad) ================================================ FILE: model/pointweb/pointweb_seg.py ================================================ from collections import namedtuple import torch import torch.nn as nn from model.pointweb.pointweb_module import PointWebSAModule 
from model.pointnet2.pointnet2_modules import PointNet2FPModule from util import pt_util class PointWebSeg(nn.Module): r""" PointNet2 with single-scale grouping Semantic segmentation network that uses feature propogation layers Parameters ---------- k: int Number of semantics classes to predict over -- size of softmax classifier that run for each point c: int = 6 Number of input channels in the feature descriptor for each point. If the point cloud is Nx9, this value should be 6 as in an Nx9 point cloud, 3 of the channels are xyz, and 6 are feature descriptors use_xyz: bool = True Whether or not to use the xyz position of a point as a feature """ def __init__(self, c=3, k=13, use_xyz=True): super().__init__() self.SA_modules = nn.ModuleList() self.SA_modules.append(PointWebSAModule(npoint=1024, nsample=32, mlp=[c, 32, 32, 64], mlp2=[32, 32], use_xyz=use_xyz)) self.SA_modules.append(PointWebSAModule(npoint=256, nsample=32, mlp=[64, 64, 64, 128], mlp2=[32, 32], use_xyz=use_xyz)) self.SA_modules.append(PointWebSAModule(npoint=64, nsample=32, mlp=[128, 128, 128, 256], mlp2=[32, 32], use_xyz=use_xyz)) self.SA_modules.append(PointWebSAModule(npoint=16, nsample=32, mlp=[256, 256, 256, 512], mlp2=[32, 32], use_xyz=use_xyz)) self.FP_modules = nn.ModuleList() self.FP_modules.append(PointNet2FPModule(mlp=[128 + c, 128, 128, 128])) self.FP_modules.append(PointNet2FPModule(mlp=[256 + 64, 256, 128])) self.FP_modules.append(PointNet2FPModule(mlp=[256 + 128, 256, 256])) self.FP_modules.append(PointNet2FPModule(mlp=[512 + 256, 256, 256])) self.FC_layer = nn.Sequential(pt_util.Conv2d(128, 128, bn=True), nn.Dropout(), pt_util.Conv2d(128, k, activation=None)) def _break_up_pc(self, pc): xyz = pc[..., 0:3].contiguous() features = (pc[..., 3:].transpose(1, 2).contiguous() if pc.size(-1) > 3 else None) return xyz, features def forward(self, pointcloud: torch.cuda.FloatTensor): r""" Forward pass of the network Parameters ---------- pointcloud: Variable(torch.cuda.FloatTensor) (B, N, 3 + 
def model_fn_decorator(criterion):
    """Build a single train/eval step closure around `criterion`.

    Bug fixes relative to the original file:
    - `model_fn_decorator` was defined twice; the second definition silently
      shadowed the first and dropped its `epoch` argument. The two are merged
      here, keeping `epoch` as an optional (unused) argument so callers of
      either variant keep working.
    - `inputs.cuda(async=True)`: `async` is a reserved keyword since
      Python 3.7 (SyntaxError); `non_blocking` is the supported spelling.

    Returns
    -------
    model_fn(model, data, epoch=0, eval=False) -> ModelReturn(preds, loss, acc_dict)
    """
    ModelReturn = namedtuple("ModelReturn", ['preds', 'loss', 'acc'])

    def model_fn(model, data, epoch=0, eval=False):
        with torch.set_grad_enabled(not eval):
            inputs, labels = data
            inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            preds = model(inputs)
            loss = criterion(preds, labels)
            # Point-wise accuracy: argmax over the class dimension.
            _, classes = torch.max(preds, 1)
            acc = (classes == labels).float().sum() / labels.numel()
            return ModelReturn(preds, loss, {"acc": acc.item(), 'loss': loss.item()})

    return model_fn
optimizer.zero_grad() _, loss, _ = model_fn(model, (inputs, labels)) loss.backward() print(loss.item()) optimizer.step() model = PointWebSeg(c=C, k=K, use_xyz=False).cuda() print(model) optimizer = optim.SGD(model.parameters(), lr=5e-2, momentum=0.9, weight_decay=1e-4) print("Testing SSGCls without xyz") model_fn = model_fn_decorator(nn.CrossEntropyLoss()) for _ in range(5): optimizer.zero_grad() _, loss, _ = model_fn(model, (inputs, labels)) loss.backward() print(loss.item()) optimizer.step() ================================================ FILE: tool/test.sh ================================================ #!/bin/sh export PYTHONPATH=./ PYTHON=python dataset=$1 exp_name=$2 exp_dir=exp/${dataset}/${exp_name} model_dir=${exp_dir}/model config=config/${dataset}/${dataset}_${exp_name}.yaml mkdir -p ${model_dir} now=$(date +"%Y%m%d_%H%M%S") if [ ${dataset} = 's3dis' ] then cp tool/test.sh tool/test_s3dis.py ${config} ${exp_dir} $PYTHON tool/test_s3dis.py --config=${config} 2>&1 | tee ${model_dir}/test-$now.log elif [ ${dataset} = 'scannet' ] then cp tool/test.sh tool/test_scannet.py ${config} ${exp_dir} $PYTHON tool/test_scannet.py --config=${config} 2>&1 | tee ${model_dir}/test-$now.log fi ================================================ FILE: tool/test_s3dis.py ================================================ import os import time import random import numpy as np import logging import pickle import argparse import torch import torch.nn as nn import torch.nn.parallel import torch.optim import torch.utils.data from util import config from util.util import AverageMeter, intersectionAndUnion, check_makedirs random.seed(123) np.random.seed(123) def get_parser(): parser = argparse.ArgumentParser(description='PyTorch Point Cloud Classification / Semantic Segmentation') parser.add_argument('--config', type=str, default='config/s3dis/s3dis_pointweb.yaml', help='config file') parser.add_argument('opts', help='see config/s3dis/s3dis_pointweb.yaml for all options', 
def data_prepare(room_path):
    """Slice one S3DIS room into overlapping blocks of `args.num_point` points.

    Returns
    -------
    data_room : np.ndarray
        (K * num_point, 9) per-block point data: centered xy, z, rgb/255,
        and room-normalized xyz.
    label_room : np.ndarray
        (K * num_point,) labels aligned with data_room.
    index_room : np.ndarray
        (K * num_point,) original point indices (with repeats) so per-point
        predictions can be scattered back onto the full room.
    labels : np.ndarray
        (N,) ground-truth labels for the whole room.
    """
    room_data = np.load(room_path)
    points, labels = room_data[:, 0:6], room_data[:, 6]  # xyzrgb, N*6; l, N
    coord_min, coord_max = np.amin(points, axis=0)[:3], np.amax(points, axis=0)[:3]
    # Sliding-window grid over the xy plane; windows overlap when
    # stride_rate < 1.
    stride = args.block_size * args.stride_rate
    grid_x = int(np.ceil(float(coord_max[0] - coord_min[0] - args.block_size) / stride) + 1)
    grid_y = int(np.ceil(float(coord_max[1] - coord_min[1] - args.block_size) / stride) + 1)
    data_room, label_room, index_room = np.array([]), np.array([]), np.array([])
    for index_y in range(0, grid_y):
        for index_x in range(0, grid_x):
            # Clamp each window to the room extent, then re-anchor its start
            # so every window keeps the full block_size width.
            s_x = coord_min[0] + index_x * stride
            e_x = min(s_x + args.block_size, coord_max[0])
            s_x = e_x - args.block_size
            s_y = coord_min[1] + index_y * stride
            e_y = min(s_y + args.block_size, coord_max[1])
            s_y = e_y - args.block_size
            # 1e-8 epsilon keeps boundary points inside the window.
            point_idxs = np.where((points[:, 0] >= s_x - 1e-8) & (points[:, 0] <= e_x + 1e-8) & (points[:, 1] >= s_y - 1e-8) & (points[:, 1] <= e_y + 1e-8))[0]
            if point_idxs.size == 0:
                continue
            # Pad the window with resampled points so its size is an exact
            # multiple of num_point; sample without replacement when there
            # are enough points to do so.
            num_batch = int(np.ceil(point_idxs.size / args.num_point))
            point_size = int(num_batch * args.num_point)
            replace = False if (point_size - point_idxs.size <= point_idxs.size) else True
            point_idxs_repeat = np.random.choice(point_idxs, point_size - point_idxs.size, replace=replace)
            point_idxs = np.concatenate((point_idxs, point_idxs_repeat))
            np.random.shuffle(point_idxs)
            data_batch = points[point_idxs, :]
            # xyz normalized by the room extent (channels 6-8 of the output).
            normlized_xyz = np.zeros((point_size, 3))
            normlized_xyz[:, 0] = data_batch[:, 0] / coord_max[0]
            normlized_xyz[:, 1] = data_batch[:, 1] / coord_max[1]
            normlized_xyz[:, 2] = data_batch[:, 2] / coord_max[2]
            # Center xy on the window midpoint; scale rgb into [0, 1].
            data_batch[:, 0] = data_batch[:, 0] - (s_x + args.block_size / 2.0)
            data_batch[:, 1] = data_batch[:, 1] - (s_y + args.block_size / 2.0)
            data_batch[:, 3:6] /= 255.0
            data_batch = np.concatenate((data_batch, normlized_xyz), axis=1)
            label_batch = labels[point_idxs]
            data_room = np.vstack([data_room, data_batch]) if data_room.size else data_batch
            label_room = np.hstack([label_room, label_batch]) if label_room.size else label_batch
            index_room = np.hstack([index_room, point_idxs]) if index_room.size else point_idxs
    # Every point of the room must be covered by at least one window.
    assert np.unique(index_room).size == labels.size
    return data_room, label_room, index_room, labels
def test(model, criterion, names):
    """Whole-room evaluation for S3DIS.

    For each room of the held-out Area, blocks are prepared by data_prepare(),
    run through `model` in batches, and per-point logits are scattered back to
    the original point indices (a point covered by several blocks gets its
    logits summed).  Metrics are reported two ways: accumulated per room
    (calculation 1) and over all points of all rooms (calculation 2).

    Returns (mIoU, mAcc, allAcc, pred_all) from calculation 2.
    """
    logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>')
    batch_time = AverageMeter()
    intersection_meter = AverageMeter()
    union_meter = AverageMeter()
    target_meter = AverageMeter()
    model.eval()
    rooms = sorted(os.listdir(args.train_full_folder))
    # Rooms of the test Area only (the rest were used for training).
    rooms_split = [room for room in rooms if 'Area_{}'.format(args.test_area) in room]
    gt_all, pred_all = np.array([]), np.array([])
    check_makedirs(args.save_folder)
    pred_save, gt_save = [], []
    for idx, room_name in enumerate(rooms_split):
        data_room, label_room, index_room, gt = data_prepare(os.path.join(args.train_full_folder, room_name))
        batch_point = args.num_point * args.test_batch_size
        batch_num = int(np.ceil(label_room.size / batch_point))
        end = time.time()
        output_room = np.array([])
        for i in range(batch_num):
            s_i, e_i = i * batch_point, min((i + 1) * batch_point, label_room.size)
            input, target, index = data_room[s_i:e_i, :], label_room[s_i:e_i], index_room[s_i:e_i]
            # Reshape the flat slice into (n_blocks, num_point, channels).
            input = torch.from_numpy(input).float().view(-1, args.num_point, input.shape[1])
            target = torch.from_numpy(target).long().view(-1, args.num_point)
            with torch.no_grad():
                output = model(input.cuda())
            loss = criterion(output, target.cuda())  # for reference
            output = output.transpose(1, 2).contiguous().view(-1, args.classes).data.cpu().numpy()
            pred = np.argmax(output, axis=1)
            intersection, union, target = intersectionAndUnion(pred, target.view(-1).data.cpu().numpy(), args.classes, args.ignore_label)
            accuracy = sum(intersection) / (sum(target) + 1e-10)
            output_room = np.vstack([output_room, output]) if output_room.size else output
            batch_time.update(time.time() - end)
            end = time.time()
            if ((i + 1) % args.print_freq == 0) or (i + 1 == batch_num):
                logger.info('Test: [{}/{}]-[{}/{}] '
                            'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                            'Loss {loss:.4f} '
                            'Accuracy {accuracy:.4f} '
                            'Points {gt.size}.'.format(idx + 1, len(rooms_split), i + 1, batch_num,
                                                       batch_time=batch_time, loss=loss,
                                                       accuracy=accuracy, gt=gt))
        '''
        unq, unq_inv, unq_cnt = np.unique(index_room, return_inverse=True, return_counts=True)
        index_array = np.split(np.argsort(unq_inv), np.cumsum(unq_cnt[:-1]))
        output_room = np.vstack([output_room, np.zeros((1, args.classes))])
        index_array_fill = np.array(list(itertools.zip_longest(*index_array, fillvalue=output_room.shape[0] - 1))).T
        pred = output_room[index_array_fill].sum(1)
        pred = np.argmax(pred, axis=1)
        '''
        # Scatter block logits back onto original room points; overlapping
        # blocks vote by summing logits per point.
        pred = np.zeros((gt.size, args.classes))
        for j in range(len(index_room)):
            pred[index_room[j]] += output_room[j]
        pred = np.argmax(pred, axis=1)
        # calculation 1: add per room predictions
        intersection, union, target = intersectionAndUnion(pred, gt, args.classes, args.ignore_label)
        intersection_meter.update(intersection)
        union_meter.update(union)
        target_meter.update(target)
        # calculation 2
        pred_all = np.hstack([pred_all, pred]) if pred_all.size else pred
        gt_all = np.hstack([gt_all, gt]) if gt_all.size else gt
        pred_save.append(pred), gt_save.append(gt)
    with open(os.path.join(args.save_folder, "pred_{}.pickle".format(args.test_area)), 'wb') as handle:
        pickle.dump({'pred': pred_save}, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(args.save_folder, "gt_{}.pickle".format(args.test_area)), 'wb') as handle:
        pickle.dump({'gt': gt_save}, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # calculation 1
    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
    accuracy_class = intersection_meter.sum / (target_meter.sum + 1e-10)
    mIoU1 = np.mean(iou_class)
    mAcc1 = np.mean(accuracy_class)
    allAcc1 = sum(intersection_meter.sum) / (sum(target_meter.sum) + 1e-10)
    # calculation 2
    intersection, union, target = intersectionAndUnion(pred_all, gt_all, args.classes, args.ignore_label)
    iou_class = intersection / (union + 1e-10)
    accuracy_class = intersection / (target + 1e-10)
    mIoU = np.mean(iou_class)
    mAcc = np.mean(accuracy_class)
    allAcc = sum(intersection) / (sum(target) + 1e-10)
    logger.info('Val result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(mIoU, mAcc, allAcc))
    logger.info('Val1 result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(mIoU1, mAcc1, allAcc1))
    for i in range(args.classes):
        logger.info('Class_{} Result: iou/accuracy {:.4f}/{:.4f}, name: {}.'.format(i, iou_class[i], accuracy_class[i], names[i]))
    logger.info('<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<')
    return mIoU, mAcc, allAcc, pred_all
result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(mIoU1, mAcc1, allAcc1)) for i in range(args.classes): logger.info('Class_{} Result: iou/accuracy {:.4f}/{:.4f}, name: {}.'.format(i, iou_class[i], accuracy_class[i], names[i])) logger.info('<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<') return mIoU, mAcc, allAcc, pred_all if __name__ == '__main__': main() ================================================ FILE: tool/test_scannet.py ================================================ import os import time import random import numpy as np import logging import pickle import argparse import torch import torch.nn as nn import torch.nn.parallel import torch.optim import torch.utils.data from util import config from util.util import AverageMeter, intersectionAndUnion, check_makedirs random.seed(123) np.random.seed(123) def get_parser(): parser = argparse.ArgumentParser(description='PyTorch Point Cloud Classification / Semantic Segmentation') parser.add_argument('--config', type=str, default='config/scannet/scannet_pointweb.yaml', help='config file') parser.add_argument('opts', help='see config/scannet/scannet_pointweb.yaml for all options', default=None, nargs=argparse.REMAINDER) args = parser.parse_args() assert args.config is not None cfg = config.load_cfg_from_cfg_file(args.config) if args.opts is not None: cfg = config.merge_cfg_from_list(cfg, args.opts) return cfg def get_logger(): logger_name = "main-logger" logger = logging.getLogger(logger_name) logger.setLevel(logging.INFO) handler = logging.StreamHandler() fmt = "[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s" handler.setFormatter(logging.Formatter(fmt)) logger.addHandler(handler) return logger def main(): global args, logger args = get_parser() logger = get_logger() logger.info(args) assert args.classes > 1 logger.info("=> creating model ...") logger.info("Classes: {}".format(args.classes)) if args.arch == 'pointnet_seg': from model.pointnet.pointnet import PointNetSeg as 
def data_prepare(points, labels):
    """Tile one ScanNet scene into overlapping xy blocks for testing.

    The scene is scanned with a grid of stride args.block_size *
    args.stride_rate; each block's point count is padded (by resampling block
    points) up to a multiple of args.num_point so blocks reshape cleanly into
    model batches.

    Returns
    -------
    data_room : (M, C+3) array — per-point features plus xyz normalized by the
        room extent (appended as the last 3 channels).
    label_room : (M,) labels aligned with data_room.
    index_room : (M,) indices into the original `points` array (points can
        appear multiple times across overlapping blocks).

    NOTE(review): unlike the S3DIS variant, colors are not divided by 255 here;
    presumably ScanNet inputs carry no 0-255 color channels — confirm upstream.
    """
    coord_min, coord_max = np.amin(points, axis=0)[:3], np.amax(points, axis=0)[:3]
    stride = args.block_size * args.stride_rate
    grid_x = int(np.ceil(float(coord_max[0] - coord_min[0] - args.block_size) / stride) + 1)
    grid_y = int(np.ceil(float(coord_max[1] - coord_min[1] - args.block_size) / stride) + 1)
    data_room, label_room, index_room = np.array([]), np.array([]), np.array([])
    for index_y in range(0, grid_y):
        for index_x in range(0, grid_x):
            # Clamp the block to the room border, keeping its full size by
            # shifting the start backwards when it would overrun.
            s_x = coord_min[0] + index_x * stride
            e_x = min(s_x + args.block_size, coord_max[0])
            s_x = e_x - args.block_size
            s_y = coord_min[1] + index_y * stride
            e_y = min(s_y + args.block_size, coord_max[1])
            s_y = e_y - args.block_size
            point_idxs = np.where((points[:, 0] >= s_x - 1e-8) & (points[:, 0] <= e_x + 1e-8) & (points[:, 1] >= s_y - 1e-8) & (points[:, 1] <= e_y + 1e-8))[0]
            if point_idxs.size == 0:
                continue
            # Pad the block to a whole number of num_point chunks by
            # resampling its own points (with replacement only if needed).
            num_batch = int(np.ceil(point_idxs.size / args.num_point))
            point_size = int(num_batch * args.num_point)
            replace = False if (point_size - point_idxs.size <= point_idxs.size) else True
            point_idxs_repeat = np.random.choice(point_idxs, point_size - point_idxs.size, replace=replace)
            point_idxs = np.concatenate((point_idxs, point_idxs_repeat))
            np.random.shuffle(point_idxs)
            data_batch = points[point_idxs, :]
            normlized_xyz = np.zeros((point_size, 3))
            normlized_xyz[:, 0] = data_batch[:, 0] / coord_max[0]
            normlized_xyz[:, 1] = data_batch[:, 1] / coord_max[1]
            normlized_xyz[:, 2] = data_batch[:, 2] / coord_max[2]
            # Center xy on the block middle (z is left absolute).
            data_batch[:, 0] = data_batch[:, 0] - (s_x + args.block_size / 2.0)
            data_batch[:, 1] = data_batch[:, 1] - (s_y + args.block_size / 2.0)
            data_batch = np.concatenate((data_batch, normlized_xyz), axis=1)
            label_batch = labels[point_idxs]
            data_room = np.vstack([data_room, data_batch]) if data_room.size else data_batch
            label_room = np.hstack([label_room, label_batch]) if label_room.size else label_batch
            index_room = np.hstack([index_room, point_idxs]) if index_room.size else point_idxs
    # Every original point must be covered by at least one block.
    assert np.unique(index_room).size == labels.size
    return data_room, label_room, index_room
def test(model, criterion, names):
    """Whole-scene evaluation for ScanNet.

    Loads the pickled scenes, remaps labels (raw label 0 = unannotated ->
    ignore value 255, others shifted down by one), runs block-wise inference,
    scatters logits back to original points, and reports point metrics plus
    the voxel accuracy used by ScanNet / PointNet++ / PointCNN.

    Returns (mIoU, mAcc, allAcc, pred_all) computed over all scenes.
    """
    logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>')
    batch_time = AverageMeter()
    intersection_meter = AverageMeter()
    union_meter = AverageMeter()
    target_meter = AverageMeter()
    model.eval()
    data_file = os.path.join(args.data_root, 'scannet_{}.pickle'.format(args.split))
    file_pickle = open(data_file, 'rb')
    # Two sequential pickle loads: first the coordinates, then the labels.
    xyz_all = pickle.load(file_pickle, encoding='latin1')
    label_all = pickle.load(file_pickle, encoding='latin1')
    file_pickle.close()
    gt_all, pred_all = np.array([]), np.array([])
    vox_acc = []
    check_makedirs(args.save_folder)
    pred_save, gt_save = [], []
    for idx in range(len(xyz_all)):
        points, labels = xyz_all[idx], label_all[idx].astype(np.int32)
        # Raw label 0 is "unannotated": shift classes down, mark 0 as ignore.
        gt = labels - 1
        gt[labels == 0] = 255
        data_room, label_room, index_room = data_prepare(points, gt)
        batch_point = args.num_point * args.test_batch_size
        batch_num = int(np.ceil(label_room.size / batch_point))
        end = time.time()
        output_room = np.array([])
        for i in range(batch_num):
            s_i, e_i = i * batch_point, min((i + 1) * batch_point, label_room.size)
            input, target, index = data_room[s_i:e_i, :], label_room[s_i:e_i], index_room[s_i:e_i]
            input = torch.from_numpy(input).float().view(-1, args.num_point, input.shape[1])
            target = torch.from_numpy(target).long().view(-1, args.num_point)
            with torch.no_grad():
                output = model(input.cuda())
            loss = criterion(output, target.cuda())  # for reference
            output = output.transpose(1, 2).contiguous().view(-1, args.classes).data.cpu().numpy()
            pred = np.argmax(output, axis=1)
            intersection, union, target = intersectionAndUnion(pred, target.view(-1).data.cpu().numpy(), args.classes, args.ignore_label)
            accuracy = sum(intersection) / (sum(target) + 1e-10)
            output_room = np.vstack([output_room, output]) if output_room.size else output
            batch_time.update(time.time() - end)
            end = time.time()
            if ((i + 1) % args.print_freq == 0) or (i + 1 == batch_num):
                logger.info('Test: [{}/{}]-[{}/{}] '
                            'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                            'Loss {loss:.4f} '
                            'Accuracy {accuracy:.4f} '
                            'Points {gt.size}.'.format(idx + 1, len(xyz_all), i + 1, batch_num,
                                                       batch_time=batch_time, loss=loss,
                                                       accuracy=accuracy, gt=gt))
        # Scatter block logits back onto scene points; overlapping blocks
        # vote by summing logits per original point index.
        pred = np.zeros((gt.size, args.classes))
        for j in range(len(index_room)):
            pred[index_room[j]] += output_room[j]
        pred = np.argmax(pred, axis=1)
        # calculation 1: add per room predictions
        intersection, union, target = intersectionAndUnion(pred, gt, args.classes, args.ignore_label)
        intersection_meter.update(intersection)
        union_meter.update(union)
        target_meter.update(target)
        # calculation 2
        pred_all = np.hstack([pred_all, pred]) if pred_all.size else pred
        gt_all = np.hstack([gt_all, gt]) if gt_all.size else gt
        pred_save.append(pred), gt_save.append(gt)
        # compute voxel accuracy (follow scannet, pointnet++ and pointcnn)
        res = 0.0484
        coord_min, coord_max = np.min(points, axis=0), np.max(points, axis=0)
        nvox = np.ceil((coord_max - coord_min) / res)
        vidx = np.ceil((points - coord_min) / res)
        # Flattened 3-D voxel index; one representative point per voxel.
        vidx = vidx[:, 0] + vidx[:, 1] * nvox[0] + vidx[:, 2] * nvox[0] * nvox[1]
        uvidx, vpidx = np.unique(vidx, return_index=True)
        # compute voxel label
        uvlabel = np.array(gt)[vpidx]
        uvpred = np.array(pred)[vpidx]
        # compute voxel accuracy (ignore label 0 which is scannet unannotated)
        c_accvox = np.sum(np.equal(uvpred, uvlabel))
        c_ignore = np.sum(np.equal(uvlabel, 255))
        vox_acc.append([c_accvox, len(uvlabel) - c_ignore])
    with open(os.path.join(args.save_folder, "pred_{}.pickle".format(args.split)), 'wb') as handle:
        pickle.dump({'pred': pred_save}, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(args.save_folder, "gt_{}.pickle".format(args.split)), 'wb') as handle:
        pickle.dump({'gt': gt_save}, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # calculation 1
    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
    accuracy_class = intersection_meter.sum / (target_meter.sum + 1e-10)
    mIoU1 = np.mean(iou_class)
    mAcc1 = np.mean(accuracy_class)
    allAcc1 = sum(intersection_meter.sum) / (sum(target_meter.sum) + 1e-10)
    # calculation 2
    intersection, union, target = intersectionAndUnion(pred_all, gt_all, args.classes, args.ignore_label)
    iou_class = intersection / (union + 1e-10)
    accuracy_class = intersection / (target + 1e-10)
    mIoU = np.mean(iou_class)
    mAcc = np.mean(accuracy_class)
    allAcc = sum(intersection) / (sum(target) + 1e-10)
    # compute avg voxel acc
    vox_acc = np.sum(vox_acc, 0)
    voxAcc = vox_acc[0] * 1.0 / vox_acc[1]
    logger.info('Val result: mIoU/mAcc/allAcc/voxAcc {:.4f}/{:.4f}/{:.4f}/{:.4f}.'.format(mIoU, mAcc, allAcc, voxAcc))
    logger.info('Val111 result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}/{:.4f}.'.format(mIoU1, mAcc1, allAcc1, voxAcc))
    for i in range(args.classes):
        logger.info('Class_{} Result: iou/accuracy {:.4f}/{:.4f}, name: {}.'.format(i, iou_class[i], accuracy_class[i], names[i]))
    logger.info('<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<')
    return mIoU, mAcc, allAcc, pred_all
def get_logger():
    """Return the shared "main-logger", set to INFO with a stream handler
    whose format carries timestamp, level, file, line and pid."""
    log = logging.getLogger("main-logger")
    log.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(
        "[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s"))
    log.addHandler(stream_handler)
    return log
def main():
    """Build model/optimizer/dataset from the global config and run the
    epoch loop: train, log to TensorBoard, checkpoint, optionally validate."""
    init()
    # Select the architecture; the import is deferred so only the chosen
    # model package is loaded.
    if args.arch == 'pointnet_seg':
        from model.pointnet.pointnet import PointNetSeg as Model
    elif args.arch == 'pointnet2_seg':
        from model.pointnet2.pointnet2_seg import PointNet2SSGSeg as Model
    elif args.arch == 'pointweb_seg':
        from model.pointweb.pointweb_seg import PointWebSeg as Model
    else:
        # NOTE(review): '...'.format(args.arch) has no {} placeholder, so the
        # arch name never appears in the message.
        raise Exception('architecture not supported yet'.format(args.arch))
    model = Model(c=args.fea_dim, k=args.classes, use_xyz=args.use_xyz)
    if args.sync_bn:
        from util.util import convert_to_syncbn
        convert_to_syncbn(model)
    criterion = nn.CrossEntropyLoss(ignore_index=args.ignore_label).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=args.step_epoch, gamma=args.multiplier)
    logger.info("=> creating model ...")
    logger.info("Classes: {}".format(args.classes))
    logger.info(model)
    model = torch.nn.DataParallel(model.cuda())
    if args.sync_bn:
        from lib.sync_bn import patch_replication_callback
        patch_replication_callback(model)
    if args.weight:
        # Load weights only (no optimizer/scheduler state).
        if os.path.isfile(args.weight):
            logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight)
            model.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))
    if args.resume:
        # Full resume: weights + optimizer + scheduler + start epoch.
        if os.path.isfile(args.resume):
            logger.info("=> loading checkpoint '{}'".format(args.resume))
            # checkpoint = torch.load(args.resume)
            checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda())
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            logger.info("=> no checkpoint found at '{}'".format(args.resume))
    train_transform = transform.Compose([transform.ToTensor()])
    if args.data_name == 's3dis':
        train_data = S3DIS(split='train', data_root=args.train_full_folder, num_point=args.num_point, test_area=args.test_area, block_size=args.block_size, sample_rate=args.sample_rate, transform=train_transform)
        # train_data = dataset.PointData(split='train', data_root=args.data_root, data_list=args.train_list, transform=train_transform)
    elif args.data_name == 'scannet':
        train_data = ScanNet(split='train', data_root=args.data_root, num_point=args.num_point, block_size=args.block_size, sample_rate=args.sample_rate, transform=train_transform)
    elif args.data_name == 'modelnet40':
        train_data = dataset.PointData(split='train', data_root=args.data_root, data_list=args.train_list, transform=train_transform, num_point=args.num_point, random_index=True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.train_batch_size, shuffle=True, num_workers=args.train_workers, pin_memory=True)
    val_loader = None
    if args.evaluate:
        val_transform = transform.Compose([transform.ToTensor()])
        val_data = dataset.PointData(split='val', data_root=args.data_root, data_list=args.val_list, transform=val_transform)
        val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.train_batch_size_val, shuffle=False, num_workers=args.train_workers, pin_memory=True)
    for epoch in range(args.start_epoch, args.epochs):
        # NOTE(review): scheduler stepped before train() — pre-PyTorch-1.1
        # ordering; confirm the intended LR schedule offset.
        scheduler.step()
        loss_train, mIoU_train, mAcc_train, allAcc_train = train(train_loader, model, criterion, optimizer, epoch)
        epoch_log = epoch + 1
        writer.add_scalar('loss_train', loss_train, epoch_log)
        writer.add_scalar('mIoU_train', mIoU_train, epoch_log)
        writer.add_scalar('mAcc_train', mAcc_train, epoch_log)
        writer.add_scalar('allAcc_train', allAcc_train, epoch_log)
        if epoch_log % args.save_freq == 0:
            filename = args.save_path + '/train_epoch_' + str(epoch_log) + '.pth'
            logger.info('Saving checkpoint to: ' + filename)
            torch.save({'epoch': epoch_log, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict()}, filename)
            # Keep only the two most recent checkpoints on disk.
            if epoch_log / args.save_freq > 2:
                deletename = args.save_path + '/train_epoch_' + str(epoch_log - args.save_freq * 2) + '.pth'
                os.remove(deletename)
        if args.evaluate:
            loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion)
            writer.add_scalar('loss_val', loss_val, epoch_log)
            writer.add_scalar('mIoU_val', mIoU_val, epoch_log)
            writer.add_scalar('mAcc_val', mAcc_val, epoch_log)
            writer.add_scalar('allAcc_val', allAcc_val, epoch_log)
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch; returns (avg loss, mIoU, mAcc, allAcc)
    accumulated over the epoch's batches."""
    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    intersection_meter = AverageMeter()
    union_meter = AverageMeter()
    target_meter = AverageMeter()
    model.train()
    end = time.time()
    max_iter = args.epochs * len(train_loader)
    for i, (input, target) in enumerate(train_loader):
        data_time.update(time.time() - end)
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        output = model(input)
        if target.shape[-1] == 1:
            target = target[:, 0]  # for cls
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Hard predictions for the running metrics.
        output = output.max(1)[1]
        intersection, union, target = intersectionAndUnionGPU(output, target, args.classes, args.ignore_label)
        intersection, union, target = intersection.cpu().numpy(), union.cpu().numpy(), target.cpu().numpy()
        intersection_meter.update(intersection), union_meter.update(union), target_meter.update(target)
        accuracy = sum(intersection_meter.val) / (sum(target_meter.val) + 1e-10)
        loss_meter.update(loss.item(), input.size(0))
        batch_time.update(time.time() - end)
        end = time.time()
        # calculate remain time
        current_iter = epoch * len(train_loader) + i + 1
        remain_iter = max_iter - current_iter
        remain_time = remain_iter * batch_time.avg
        t_m, t_s = divmod(remain_time, 60)
        t_h, t_m = divmod(t_m, 60)
        remain_time = '{:02d}:{:02d}:{:02d}'.format(int(t_h), int(t_m), int(t_s))
        if (i + 1) % args.print_freq == 0:
            logger.info('Epoch: [{}/{}][{}/{}] '
                        'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                        'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                        'Remain {remain_time} '
                        'Loss {loss_meter.val:.4f} '
                        'Accuracy {accuracy:.4f}.'.format(epoch+1, args.epochs, i + 1, len(train_loader),
                                                          batch_time=batch_time, data_time=data_time,
                                                          remain_time=remain_time,
                                                          loss_meter=loss_meter, accuracy=accuracy))
        writer.add_scalar('loss_train_batch', loss_meter.val, current_iter)
        writer.add_scalar('mIoU_train_batch', np.mean(intersection / (union + 1e-10)), current_iter)
        writer.add_scalar('mAcc_train_batch', np.mean(intersection / (target + 1e-10)), current_iter)
        writer.add_scalar('allAcc_train_batch', accuracy, current_iter)
    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
    accuracy_class = intersection_meter.sum / (target_meter.sum + 1e-10)
    mIoU = np.mean(iou_class)
    mAcc = np.mean(accuracy_class)
    allAcc = sum(intersection_meter.sum) / (sum(target_meter.sum) + 1e-10)
    logger.info('Train result at epoch [{}/{}]: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(epoch+1, args.epochs, mIoU, mAcc, allAcc))
    return loss_meter.avg, mIoU, mAcc, allAcc
def validate(val_loader, model, criterion):
    """Run one evaluation pass over `val_loader`.

    Returns (avg loss, mIoU, mAcc, allAcc) accumulated over all batches.

    Fix: the whole loop now runs under torch.no_grad() — the original did
    inference with autograd enabled, building an unused graph per batch and
    wasting memory (no backward() is ever called here).
    """
    logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>')
    batch_time = AverageMeter()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    intersection_meter = AverageMeter()
    union_meter = AverageMeter()
    target_meter = AverageMeter()
    model.eval()
    end = time.time()
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for i, (input, target) in enumerate(val_loader):
            data_time.update(time.time() - end)
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            if target.shape[-1] == 1:
                target = target[:, 0]  # for cls
            output = model(input)
            loss = criterion(output, target)
            # Hard predictions for the running metrics.
            output = output.max(1)[1]
            intersection, union, target = intersectionAndUnionGPU(output, target, args.classes, args.ignore_label)
            intersection, union, target = intersection.cpu().numpy(), union.cpu().numpy(), target.cpu().numpy()
            intersection_meter.update(intersection), union_meter.update(union), target_meter.update(target)
            accuracy = sum(intersection_meter.val) / (sum(target_meter.val) + 1e-10)
            loss_meter.update(loss.item(), input.size(0))
            batch_time.update(time.time() - end)
            end = time.time()
            if (i + 1) % args.print_freq == 0:
                logger.info('Test: [{}/{}] '
                            'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                            'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                            'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f}) '
                            'Accuracy {accuracy:.4f}.'.format(i + 1, len(val_loader), data_time=data_time,
                                                              batch_time=batch_time, loss_meter=loss_meter,
                                                              accuracy=accuracy))
    iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
    accuracy_class = intersection_meter.sum / (target_meter.sum + 1e-10)
    mIoU = np.mean(iou_class)
    mAcc = np.mean(accuracy_class)
    allAcc = sum(intersection_meter.sum) / (sum(target_meter.sum) + 1e-10)
    logger.info('Val result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(mIoU, mAcc, allAcc))
    for i in range(args.classes):
        logger.info('Class_{} Result: iou/accuracy {:.4f}/{:.4f}.'.format(i, iou_class[i], accuracy_class[i]))
    logger.info('<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<')
    return loss_meter.avg, mIoU, mAcc, allAcc
class CfgNode(dict):
    """
    CfgNode represents an internal node in the configuration tree. It's a simple
    dict-like container that allows for attribute-based access to keys.
    """

    def __init__(self, init_dict=None, key_list=None, new_allowed=False):
        # Recursively wrap nested plain dicts so attribute access works at
        # every depth.
        if init_dict is None:
            init_dict = {}
        if key_list is None:
            key_list = []
        for key, value in init_dict.items():
            if type(value) is dict:
                init_dict[key] = CfgNode(value, key_list=key_list + [key])
        super(CfgNode, self).__init__(init_dict)

    def __getattr__(self, name):
        # Missing keys surface as AttributeError, matching attribute semantics.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __str__(self):
        def _indent(text, num_spaces):
            # Indent every line after the first by `num_spaces`.
            lines = text.split("\n")
            if len(lines) == 1:
                return text
            head = lines.pop(0)
            return head + "\n" + "\n".join((num_spaces * " ") + line for line in lines)

        pieces = []
        for key, value in sorted(self.items()):
            # Nested nodes render on their own indented lines; scalars inline.
            sep = "\n" if isinstance(value, CfgNode) else " "
            pieces.append(_indent("{}:{}{}".format(str(key), sep, str(value)), 2))
        return "\n".join(pieces)

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, super(CfgNode, self).__repr__())
zip(cfg_list[0::2], cfg_list[1::2]): subkey = full_key.split('.')[-1] assert subkey in cfg, 'Non-existent key: {}'.format(full_key) value = _decode_cfg_value(v) value = _check_and_coerce_cfg_value_type( value, cfg[subkey], subkey, full_key ) setattr(new_cfg, subkey, value) return new_cfg def _decode_cfg_value(v): """Decodes a raw config value (e.g., from a yaml config files or command line argument) into a Python object. """ # All remaining processing is only applied to strings if not isinstance(v, str): return v # Try to interpret `v` as a: # string, number, tuple, list, dict, boolean, or None try: v = literal_eval(v) # The following two excepts allow v to pass through when it represents a # string. # # Longer explanation: # The type of v is always a string (before calling literal_eval), but # sometimes it *represents* a string and other times a data structure, like # a list. In the case that v represents a string, what we got back from the # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is # ok with '"foo"', but will raise a ValueError if given 'foo'. In other # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval # will raise a SyntaxError. except ValueError: pass except SyntaxError: pass return v def _check_and_coerce_cfg_value_type(replacement, original, key, full_key): """Checks that `replacement`, which is intended to replace `original` is of the right type. The type is correct if it matches exactly or is one of a few cases in which the type can be easily coerced. 
""" original_type = type(original) replacement_type = type(replacement) # The types must match (with some exceptions) if replacement_type == original_type: return replacement # Cast replacement from from_type to to_type if the replacement and original # types match from_type and to_type def conditional_cast(from_type, to_type): if replacement_type == from_type and original_type == to_type: return True, to_type(replacement) else: return False, None # Conditionally casts # list <-> tuple casts = [(tuple, list), (list, tuple)] # For py2: allow converting from str (bytes) to a unicode string try: casts.append((str, unicode)) # noqa: F821 except Exception: pass for (from_type, to_type) in casts: converted, converted_value = conditional_cast(from_type, to_type) if converted: return converted_value raise ValueError( "Type mismatch ({} vs. {}) with values ({} vs. {}) for config " "key: {}".format( original_type, replacement_type, original, replacement, full_key ) ) def _assert_with_logging(cond, msg): if not cond: logger.debug(msg) assert cond, msg ================================================ FILE: util/dataset.py ================================================ import os import h5py import numpy as np from torch.utils.data import Dataset def make_dataset(split='train', data_root=None, data_list=None): if not os.path.isfile(data_list): raise (RuntimeError("Point list file do not exist: " + data_list + "\n")) point_list = [] list_read = open(data_list).readlines() print("Totally {} samples in {} set.".format(len(list_read), split)) for line in list_read: point_list.append(os.path.join(data_root, line.strip())) return point_list class PointData(Dataset): def __init__(self, split='train', data_root=None, data_list=None, transform=None, num_point=None, random_index=False): assert split in ['train', 'val', 'test'] self.split = split self.data_list = make_dataset(split, data_root, data_list) self.transform = transform self.num_point = num_point self.random_index = 
random_index def __len__(self): return len(self.data_list) def __getitem__(self, index): data_path = self.data_list[index] f = h5py.File(data_path, 'r') data = f['data'][:] if self.split is 'test': label = 255 # place holder else: label = f['label'][:] f.close() if self.num_point is None: self.num_point = data.shape[0] idxs = np.arange(data.shape[0]) if self.random_index: np.random.shuffle(idxs) idxs = idxs[0:self.num_point] data = data[idxs, :] if label.size != 1: # seg data label = label[idxs] if self.transform is not None: data, label = self.transform(data, label) return data, label if __name__ == '__main__': data_root = '/mnt/sda1/hszhao/dataset/3d/s3dis' data_list = '/mnt/sda1/hszhao/dataset/3d/s3dis/list/train12346.txt' point_data = PointData('train', data_root, data_list) print('point data size:', point_data.__len__()) print('point data 0 shape:', point_data.__getitem__(0)[0].shape) print('point label 0 shape:', point_data.__getitem__(0)[1].shape) ================================================ FILE: util/pt_util.py ================================================ import shutil, os import tqdm from itertools import repeat import numpy as np from typing import List, Tuple # from scipy.stats import t as student_t # import statistics as stats import torch import torch.nn as nn from torch.autograd.function import InplaceFunction BN1d, BN2d, BN3d = nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d class SharedMLP(nn.Sequential): def __init__( self, args: List[int], *, bn: bool = False, activation=nn.ReLU(inplace=True), preact: bool = False, first: bool = False, name: str = "" ): super().__init__() for i in range(len(args) - 1): self.add_module( name + 'layer{}'.format(i), Conv2d( args[i], args[i + 1], bn=(not first or not preact or (i != 0)) and bn, activation=activation if (not first or not preact or (i != 0)) else None, preact=preact ) ) class _BNBase(nn.Sequential): def __init__(self, in_size, batch_norm=None, name=""): super().__init__() self.add_module(name + 
"bn", batch_norm(in_size)) nn.init.constant_(self[0].weight, 1.0) nn.init.constant_(self[0].bias, 0) class BatchNorm1d(_BNBase): def __init__(self, in_size: int, *, name: str = ""): super().__init__(in_size, batch_norm=BN1d, name=name) class BatchNorm2d(_BNBase): def __init__(self, in_size: int, name: str = ""): super().__init__(in_size, batch_norm=BN2d, name=name) class BatchNorm3d(_BNBase): def __init__(self, in_size: int, name: str = ""): super().__init__(in_size, batch_norm=BN3d, name=name) class _ConvBase(nn.Sequential): def __init__( self, in_size, out_size, kernel_size, stride, padding, activation, bn, init, conv=None, batch_norm=None, bias=True, preact=False, name="" ): super().__init__() bias = bias and (not bn) conv_unit = conv( in_size, out_size, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias ) init(conv_unit.weight) if bias: nn.init.constant_(conv_unit.bias, 0) if bn: if not preact: bn_unit = batch_norm(out_size) else: bn_unit = batch_norm(in_size) if preact: if bn: self.add_module(name + 'bn', bn_unit) if activation is not None: self.add_module(name + 'activation', activation) self.add_module(name + 'conv', conv_unit) if not preact: if bn: self.add_module(name + 'bn', bn_unit) if activation is not None: self.add_module(name + 'activation', activation) class Conv1d(_ConvBase): def __init__( self, in_size: int, out_size: int, *, kernel_size: int = 1, stride: int = 1, padding: int = 0, activation=nn.ReLU(inplace=True), bn: bool = False, init=nn.init.kaiming_normal_, bias: bool = True, preact: bool = False, name: str = "" ): super().__init__( in_size, out_size, kernel_size, stride, padding, activation, bn, init, conv=nn.Conv1d, batch_norm=BatchNorm1d, bias=bias, preact=preact, name=name ) class Conv2d(_ConvBase): def __init__( self, in_size: int, out_size: int, *, kernel_size: Tuple[int, int] = (1, 1), stride: Tuple[int, int] = (1, 1), padding: Tuple[int, int] = (0, 0), activation=nn.ReLU(inplace=True), bn: bool = False, 
class FC(nn.Sequential):
    """Fully-connected layer with optional batch norm and activation.

    Module order is ``fc -> bn -> activation`` normally, or
    ``bn -> activation -> fc`` when ``preact`` is set.
    """

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=None,
            preact: bool = False,
            name: str = ""
    ):
        super().__init__()

        # BatchNorm supplies its own learnable shift, so skip the linear bias.
        fc = nn.Linear(in_size, out_size, bias=not bn)
        if init is not None:
            init(fc.weight)
        if not bn:
            nn.init.constant_(fc.bias, 0)

        if preact and bn:
            self.add_module(name + 'bn', BatchNorm1d(in_size))
        if preact and activation is not None:
            self.add_module(name + 'activation', activation)

        self.add_module(name + 'fc', fc)

        if not preact and bn:
            self.add_module(name + 'bn', BatchNorm1d(out_size))
        if not preact and activation is not None:
            self.add_module(name + 'activation', activation)
def group_model_params(model: nn.Module, **kwargs):
    """Split a model's parameters into weight-decay and no-decay groups.

    Parameters whose name mentions ``bn`` or ``bias`` are exempted from
    weight decay; everything else decays. Extra keyword args (lr, etc.) are
    copied into both optimizer parameter-group dicts.
    """
    decay_group = []
    no_decay_group = []
    for param_name, param in model.named_parameters():
        exempt = ("bn" in param_name) or ("bias" in param_name)
        (no_decay_group if exempt else decay_group).append(param)

    # Sanity check: every parameter landed in exactly one group.
    assert len(list(model.parameters())) == len(decay_group) + len(no_decay_group)

    return [
        dict(params=decay_group, **kwargs),
        dict(params=no_decay_group, weight_decay=0.0, **kwargs),
    ]
def load_checkpoint(model=None, optimizer=None, filename='checkpoint'):
    """Restore model/optimizer state from ``<filename>.pth.tar``.

    Returns ``(it, epoch, best_prec)``. When the checkpoint file does not
    exist, returns the defaults ``(0.0, 0, None)``.
    """
    # BUG FIX: the original left it/epoch/best_prec unbound on the
    # missing-file path, raising UnboundLocalError at the final return.
    it, epoch, best_prec = 0.0, 0, None
    filename = "{}.pth.tar".format(filename)
    if os.path.isfile(filename):
        print("==> Loading from checkpoint '{}'".format(filename))
        checkpoint = torch.load(filename)
        epoch = checkpoint['epoch']
        it = checkpoint.get('it', 0.0)
        best_prec = checkpoint['best_prec']
        if model is not None and checkpoint['model_state'] is not None:
            model.load_state_dict(checkpoint['model_state'])
        if optimizer is not None and checkpoint['optimizer_state'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state'])
        print("==> Done")
    else:
        print("==> Checkpoint '{}' not found".format(filename))
    return it, epoch, best_prec
class TrainValSplitter():
    r""" Creates a training and validation split to be used as the sampler in a pytorch DataLoader

    Parameters
    ---------
    numel : int
        Number of elements in the entire training dataset
    percent_train : float
        Percentage of data in the training split
    shuffled : bool
        Whether or not shuffle which data goes to which split
    """

    def __init__(
            self, *, numel: int, percent_train: float, shuffled: bool = False
    ):
        indicies = np.arange(numel)
        if shuffled:
            np.random.shuffle(indicies)

        n_train = int(percent_train * numel)
        self.train = torch.utils.data.sampler.SubsetRandomSampler(
            indicies[0:n_train]
        )
        # BUG FIX: the original sliced `[n_train:-1]`, which silently dropped
        # the last element from every validation split; slice to the end.
        self.val = torch.utils.data.sampler.SubsetRandomSampler(
            indicies[n_train:]
        )
def set_bn_momentum_default(bn_momentum):
    """Return a callable for ``nn.Module.apply`` that writes ``bn_momentum``
    into every BatchNorm1d/2d/3d module it visits; other modules untouched."""
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)

    def fn(m):
        if isinstance(m, bn_types):
            m.momentum = bn_momentum

    return fn
last_epoch def step(self, epoch=None): if epoch is None: epoch = self.last_epoch + 1 self.last_epoch = epoch self.model.apply(self.setter(self.lmbd(epoch))) class Trainer(object): r""" Reasonably generic trainer for pytorch models Parameters ---------- model : pytorch model Model to be trained model_fn : function (model, inputs, labels) -> preds, loss, accuracy optimizer : torch.optim Optimizer for model checkpoint_name : str Name of file to save checkpoints to best_name : str Name of file to save best model to lr_scheduler : torch.optim.lr_scheduler Learning rate scheduler. .step() will be called at the start of every epoch bnm_scheduler : BNMomentumScheduler Batchnorm momentum scheduler. .step() will be called at the start of every epoch eval_frequency : int How often to run an eval log_name : str Name of file to output tensorboard_logger to """ def __init__( self, model, model_fn, optimizer, checkpoint_name="ckpt", best_name="best", lr_scheduler=None, bnm_scheduler=None, eval_frequency=-1, viz=None ): self.model, self.model_fn, self.optimizer, self.lr_scheduler, self.bnm_scheduler = ( model, model_fn, optimizer, lr_scheduler, bnm_scheduler ) self.checkpoint_name, self.best_name = checkpoint_name, best_name self.eval_frequency = eval_frequency self.training_best, self.eval_best = {}, {} self.viz = viz @staticmethod def _decode_value(v): if isinstance(v[0], float): return np.mean(v) elif isinstance(v[0], tuple): if len(v[0]) == 3: num = [l[0] for l in v] denom = [l[1] for l in v] w = v[0][2] else: num = [l[0] for l in v] denom = [l[1] for l in v] w = None return np.average( np.sum(num, axis=0) / (np.sum(denom, axis=0) + 1e-6), weights=w ) else: raise AssertionError("Unknown type: {}".format(type(v))) def _train_it(self, it, batch): self.model.train() if self.lr_scheduler is not None: self.lr_scheduler.step(it) if self.bnm_scheduler is not None: self.bnm_scheduler.step(it) self.optimizer.zero_grad() _, loss, eval_res = self.model_fn(self.model, batch) 
loss.backward() self.optimizer.step() return eval_res def eval_epoch(self, d_loader): self.model.eval() eval_dict = {} total_loss = 0.0 count = 1.0 for i, data in tqdm.tqdm(enumerate(d_loader, 0), total=len(d_loader), leave=False, desc='val'): self.optimizer.zero_grad() _, loss, eval_res = self.model_fn(self.model, data, eval=True) total_loss += loss.item() count += 1 for k, v in eval_res.items(): if v is not None: eval_dict[k] = eval_dict.get(k, []) + [v] return total_loss / count, eval_dict def train( self, start_it, start_epoch, n_epochs, train_loader, test_loader=None, best_loss=0.0 ): r""" Call to begin training the model Parameters ---------- start_epoch : int Epoch to start at n_epochs : int Number of epochs to train for test_loader : torch.utils.data.DataLoader DataLoader of the test_data train_loader : torch.utils.data.DataLoader DataLoader of training data best_loss : float Testing loss of the best model """ eval_frequency = ( self.eval_frequency if self.eval_frequency > 0 else len(train_loader) ) it = start_it with tqdm.trange(start_epoch, n_epochs + 1, desc='epochs') as tbar, \ tqdm.tqdm(total=eval_frequency, leave=False, desc='train') as pbar: for epoch in tbar: for batch in train_loader: res = self._train_it(it, batch) it += 1 pbar.update() pbar.set_postfix(dict(total_it=it)) tbar.refresh() if self.viz is not None: self.viz.update('train', it, res) if (it % eval_frequency) == 0: pbar.close() if test_loader is not None: val_loss, res = self.eval_epoch(test_loader) if self.viz is not None: self.viz.update('val', it, res) is_best = val_loss < best_loss best_loss = min(best_loss, val_loss) save_checkpoint( checkpoint_state( self.model, self.optimizer, val_loss, epoch, it ), is_best, filename=self.checkpoint_name, bestname=self.best_name ) pbar = tqdm.tqdm( total=eval_frequency, leave=False, desc='train' ) pbar.set_postfix(dict(total_it=it)) return best_loss ================================================ FILE: util/s3dis.py 
class S3DIS(Dataset):
    """Block-sampling dataset over pre-processed S3DIS rooms.

    Each room is one ``.npy`` of shape N*7 (xyz, rgb, label). A sample is a
    random ``block_size``-wide xy-column of a randomly chosen room, resampled
    to ``num_point`` points.
    """

    def __init__(self, split='train', data_root='trainval_fullarea', num_point=4096,
                 test_area=5, block_size=1.0, sample_rate=1.0, transform=None):
        super().__init__()
        self.num_point = num_point
        self.block_size = block_size
        self.transform = transform

        # Hold out one area for testing; train on the rest.
        area_tag = 'Area_{}'.format(test_area)
        all_rooms = [r for r in sorted(os.listdir(data_root)) if 'Area_' in r]
        if split == 'train':
            rooms_split = [r for r in all_rooms if area_tag not in r]
        else:
            rooms_split = [r for r in all_rooms if area_tag in r]

        self.room_points, self.room_labels = [], []
        self.room_coord_min, self.room_coord_max = [], []
        num_point_all = []
        for room_name in rooms_split:
            room_data = np.load(os.path.join(data_root, room_name))  # xyzrgbl, N*7
            points, labels = room_data[:, 0:6], room_data[:, 6]  # xyzrgb, N*6; l, N
            self.room_points.append(points)
            self.room_labels.append(labels)
            self.room_coord_min.append(np.amin(points, axis=0)[:3])
            self.room_coord_max.append(np.amax(points, axis=0)[:3])
            num_point_all.append(labels.size)

        # Rooms with more points are sampled proportionally more often.
        sample_prob = num_point_all / np.sum(num_point_all)
        num_iter = int(np.sum(num_point_all) * sample_rate / num_point)
        room_idxs = []
        for index in range(len(rooms_split)):
            room_idxs.extend([index] * int(round(sample_prob[index] * num_iter)))
        self.room_idxs = np.array(room_idxs)
        print("Totally {} samples in {} set.".format(len(self.room_idxs), split))

    def __getitem__(self, idx):
        room_idx = self.room_idxs[idx]
        points = self.room_points[room_idx]  # N * 6
        labels = self.room_labels[room_idx]  # N
        n_points = points.shape[0]

        # Draw random block centers until one contains enough points.
        half = self.block_size / 2.0
        while True:
            center = points[np.random.choice(n_points)][:3]
            block_min = center - [half, half, 0]
            block_max = center + [half, half, 0]
            in_x = (points[:, 0] >= block_min[0]) & (points[:, 0] <= block_max[0])
            in_y = (points[:, 1] >= block_min[1]) & (points[:, 1] <= block_max[1])
            point_idxs = np.where(in_x & in_y)[0]
            if point_idxs.size > 1024:
                break

        # Resample to exactly num_point (with replacement if too few).
        replace = point_idxs.size < self.num_point
        selected_point_idxs = np.random.choice(point_idxs, self.num_point, replace=replace)

        # Columns 6:9 hold coordinates normalized by the room extent; the
        # first six are centered xy, original z, and rgb scaled to [0, 1].
        selected_points = points[selected_point_idxs, :]  # num_point * 6
        current_points = np.zeros((self.num_point, 9))  # num_point * 9
        room_max = self.room_coord_max[room_idx]
        current_points[:, 6] = selected_points[:, 0] / room_max[0]
        current_points[:, 7] = selected_points[:, 1] / room_max[1]
        current_points[:, 8] = selected_points[:, 2] / room_max[2]
        selected_points[:, 0] = selected_points[:, 0] - center[0]
        selected_points[:, 1] = selected_points[:, 1] - center[1]
        selected_points[:, 3:6] /= 255.0
        current_points[:, 0:6] = selected_points
        current_labels = labels[selected_point_idxs]
        if self.transform is not None:
            current_points, current_labels = self.transform(current_points, current_labels)
        return current_points, current_labels

    def __len__(self):
        return len(self.room_idxs)
class ScanNet(Dataset):
    """Block-sampling dataset over pickled ScanNet scenes.

    Labels are shifted from 0..classes (0 = unannotated) to 0..classes-1
    with 255 as the ignore value. A sample is a random ``block_size``-wide
    xy-column of a random scene, resampled to ``num_point`` points.
    """

    def __init__(self, split='train', data_root='scannet', num_point=8192,
                 classes=20, block_size=1.5, sample_rate=1.0, transform=None):
        self.split = split
        self.num_point = num_point
        self.block_size = block_size
        self.transform = transform

        data_file = os.path.join(data_root, 'scannet_{}.pickle'.format(split))
        with open(data_file, 'rb') as file_pickle:
            xyz_all = pickle.load(file_pickle, encoding='latin1')
            label_all = pickle.load(file_pickle, encoding='latin1')

        self.label_all = []  # for change 0-20 to 0-19 + 255
        self.room_coord_min, self.room_coord_max = [], []
        num_point_all = []
        label_weight = np.zeros(classes + 1)
        for xyz, label in zip(xyz_all, label_all):  # xyzrgb, N*6; l, N
            self.room_coord_min.append(np.amin(xyz, axis=0)[:3])
            self.room_coord_max.append(np.amax(xyz, axis=0)[:3])
            num_point_all.append(label.size)
            tmp, _ = np.histogram(label, range(classes + 2))
            label_weight += tmp
            # Shift labels down by one; class 0 (unannotated) -> ignore 255.
            label_new = label - 1
            label_new[label == 0] = 255
            self.label_all.append(label_new.astype(np.uint8))

        # Inverse-log class frequency weights (unannotated bin dropped).
        label_weight = label_weight[1:].astype(np.float32)
        label_weight = label_weight / label_weight.sum()
        label_weight = 1 / np.log(1.2 + label_weight)

        # Scenes with more points are sampled proportionally more often.
        sample_prob = num_point_all / np.sum(num_point_all)
        num_iter = int(np.sum(num_point_all) * sample_rate / num_point)
        room_idxs = []
        for index in range(len(xyz_all)):
            room_idxs.extend([index] * int(round(sample_prob[index] * num_iter)))
        self.room_idxs = np.array(room_idxs)
        self.xyz_all = xyz_all
        self.label_weight = label_weight
        print("Totally {} samples in {} set.".format(len(self.room_idxs), split))

    def __getitem__(self, idx):
        room_idx = self.room_idxs[idx]
        points = self.xyz_all[room_idx]  # N * 3
        labels = self.label_all[room_idx]  # N
        n_points = points.shape[0]

        # Up to 10 tries for a block that is mostly annotated and covers
        # enough of the 31x31x62 voxel grid.
        half = self.block_size / 2.0
        for _ in range(10):
            center = points[np.random.choice(n_points)][:3]
            block_min = center - [half, half, 0]
            block_max = center + [half, half, 0]
            block_min[2] = self.room_coord_min[room_idx][2]
            block_max[2] = self.room_coord_max[room_idx][2]
            in_x = (points[:, 0] >= block_min[0]) & (points[:, 0] <= block_max[0])
            in_y = (points[:, 1] >= block_min[1]) & (points[:, 1] <= block_max[1])
            point_idxs = np.where(in_x & in_y)[0]
            if point_idxs.size == 0:
                continue
            vidx = np.ceil((points[point_idxs, :] - block_min) / (block_max - block_min) * [31.0, 31.0, 62.0])
            vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + vidx[:, 2])
            annotated_frac = (labels[point_idxs] != 255).sum() / point_idxs.size
            coverage = vidx.size / 31.0 / 31.0 / 62.0
            if annotated_frac >= 0.7 and coverage >= 0.02:
                break

        # Resample to exactly num_point (with replacement if too few).
        replace = point_idxs.size < self.num_point
        selected_point_idxs = np.random.choice(point_idxs, self.num_point, replace=replace)

        # Columns 3:6 hold coordinates normalized by the room extent; the
        # first three are xy centered on the block, original z.
        selected_points = points[selected_point_idxs, :]  # num_point * 3
        current_points = np.zeros((self.num_point, 6))  # num_point * 6
        room_max = self.room_coord_max[room_idx]
        current_points[:, 3] = selected_points[:, 0] / room_max[0]
        current_points[:, 4] = selected_points[:, 1] / room_max[1]
        current_points[:, 5] = selected_points[:, 2] / room_max[2]
        selected_points[:, 0] = selected_points[:, 0] - center[0]
        selected_points[:, 1] = selected_points[:, 1] - center[1]
        current_points[:, 0:3] = selected_points
        current_labels = labels[selected_point_idxs]
        if self.transform is not None:
            current_points, current_labels = self.transform(current_points, current_labels)
        return current_points, current_labels

    def __len__(self):
        return len(self.room_idxs)
class RandomRotatePerturbation(object):
    """Apply a small random rotation about all three axes.

    Per-axis angles are drawn from N(0, angle_sigma^2) and clipped to
    [-angle_clip, angle_clip]; normals (columns 3:6) rotate along.
    """

    def __init__(self, angle_sigma=0.06, angle_clip=0.18):
        self.angle_sigma = angle_sigma
        self.angle_clip = angle_clip

    def __call__(self, data, label):
        angles = np.clip(self.angle_sigma * np.random.randn(3),
                         -self.angle_clip, self.angle_clip)
        cx, sx = np.cos(angles[0]), np.sin(angles[0])
        cy, sy = np.cos(angles[1]), np.sin(angles[1])
        cz, sz = np.cos(angles[2]), np.sin(angles[2])
        Rx = np.array([[1, 0, 0],
                       [0, cx, -sx],
                       [0, sx, cx]])
        Ry = np.array([[cy, 0, sy],
                       [0, 1, 0],
                       [-sy, 0, cy]])
        Rz = np.array([[cz, -sz, 0],
                       [sz, cz, 0],
                       [0, 0, 1]])
        R = np.dot(Rz, np.dot(Ry, Rx))
        data[:, 0:3] = np.dot(data[:, 0:3], R)
        if data.shape[1] > 3:  # use normal
            data[:, 3:6] = np.dot(data[:, 3:6], R)
        return data, label
def intersectionAndUnion(output, target, K, ignore_index=255):
    """Per-class intersection/union/target histograms for mIoU computation.

    ``output`` and ``target`` hold class ids in the range [0, K); their
    shapes (N, N*L or N*H*W) must match. Positions where target equals
    ``ignore_index`` are excluded by remapping the prediction there to 255,
    which falls outside the histogram range.
    Returns (area_intersection, area_union, area_target), each of length K.
    """
    assert output.ndim in [1, 2, 3]
    assert output.shape == target.shape

    output = output.reshape(output.size).copy()
    target = target.reshape(target.size)
    output[np.where(target == ignore_index)[0]] = 255

    matched = output[np.where(output == target)[0]]
    bins = np.arange(K + 1)
    area_intersection, _ = np.histogram(matched, bins=bins)
    area_output, _ = np.histogram(output, bins=bins)
    area_target, _ = np.histogram(target, bins=bins)
    area_union = area_output + area_target - area_intersection
    return area_intersection, area_union, area_target
def check_makedirs(dir_name):
    """Create ``dir_name`` (including parents) if it does not already exist.

    Uses ``exist_ok=True`` so the check-then-create race of the original
    (``os.path.exists`` followed by ``os.makedirs``) cannot raise when a
    concurrent process creates the directory first.
    """
    os.makedirs(dir_name, exist_ok=True)
def convert_to_syncbn(model):
    """Replace every BatchNorm1d/2d/3d in ``model`` with its synchronized
    counterpart from ``lib.sync_bn``, copying num_features/eps/momentum/affine.
    Mutates ``model`` in place.
    """
    from lib.sync_bn import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d

    def recursive_set(cur_module, name, module):
        # Walk a dotted module path ("layer1.0.bn") down to the direct parent,
        # then rebind the leaf attribute.
        head, dot, tail = name.partition('.')
        if dot:
            recursive_set(getattr(cur_module, head), tail, module)
        else:
            setattr(cur_module, name, module)

    for name, m in model.named_modules():
        if isinstance(m, nn.BatchNorm1d):
            recursive_set(model, name,
                          SynchronizedBatchNorm1d(m.num_features, m.eps, m.momentum, m.affine))
        elif isinstance(m, nn.BatchNorm2d):
            recursive_set(model, name,
                          SynchronizedBatchNorm2d(m.num_features, m.eps, m.momentum, m.affine))
        elif isinstance(m, nn.BatchNorm3d):
            recursive_set(model, name,
                          SynchronizedBatchNorm3d(m.num_features, m.eps, m.momentum, m.affine))