Repository: castacks/DytanVO Branch: main Commit: ca2fe20029e9 Files: 61 Total size: 462.0 KB Directory structure: gitextract_hty7qque/ ├── .gitignore ├── Datasets/ │ ├── __init__.py │ ├── cowmask.py │ ├── flowlib.py │ ├── segmask_gt.py │ ├── tartanTrajFlowDataset.py │ ├── util_flow.py │ └── utils.py ├── DytanVO.py ├── LICENSE ├── Network/ │ ├── PWC/ │ │ ├── PWCNet.py │ │ ├── __init__.py │ │ └── correlation.py │ ├── VOFlowNet.py │ ├── VONet.py │ ├── __init__.py │ └── rigidmask/ │ ├── .gitignore │ ├── VCNplus.py │ ├── __init__.py │ ├── conv4d.py │ ├── det.py │ ├── det_losses.py │ ├── det_utils.py │ ├── networks/ │ │ ├── DCNv2/ │ │ │ ├── .gitignore │ │ │ ├── DCN/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dcn_v2.py │ │ │ │ ├── src/ │ │ │ │ │ ├── cpu/ │ │ │ │ │ │ ├── dcn_v2_cpu.cpp │ │ │ │ │ │ ├── dcn_v2_im2col_cpu.cpp │ │ │ │ │ │ ├── dcn_v2_im2col_cpu.h │ │ │ │ │ │ ├── dcn_v2_psroi_pooling_cpu.cpp │ │ │ │ │ │ └── vision.h │ │ │ │ │ ├── cuda/ │ │ │ │ │ │ ├── dcn_v2_cuda.cu │ │ │ │ │ │ ├── dcn_v2_im2col_cuda.cu │ │ │ │ │ │ ├── dcn_v2_im2col_cuda.h │ │ │ │ │ │ ├── dcn_v2_psroi_pooling_cuda.cu │ │ │ │ │ │ └── vision.h │ │ │ │ │ ├── dcn_v2.h │ │ │ │ │ └── vision.cpp │ │ │ │ ├── testcpu.py │ │ │ │ └── testcuda.py │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── make.sh │ │ │ └── setup.py │ │ ├── dlav0.py │ │ ├── large_hourglass.py │ │ ├── msra_resnet.py │ │ ├── pose_dla_dcn.py │ │ └── resnet_dcn.py │ └── submodule.py ├── README.md ├── environment.yml ├── evaluator/ │ ├── __init__.py │ ├── evaluate_ate_scale.py │ ├── evaluate_kitti.py │ ├── evaluate_rpe.py │ ├── evaluator_base.py │ ├── tartanair_evaluator.py │ ├── trajectory_transform.py │ └── transformation.py └── vo_trajectory_from_folder.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc models/ data/ __pycache__/ .DS_Store 
================================================
FILE: Datasets/__init__.py
================================================

================================================
FILE: Datasets/cowmask.py
================================================
# pylint: disable=bad-indentation
# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Cow mask generation.
https://github.com/google-research/google-research/blob/master/milking_cowmask/
Adapted from LAX implementation to NumPy due to PyTorch dataloader
being incompatible with JAX
Author: Shihao Shen
Date: 29th Aug 2022
"""
import math
import numpy as np
from scipy import special
from scipy.signal import convolve

_ROOT_2 = math.sqrt(2.0)
_ROOT_2_PI = math.sqrt(2.0 * math.pi)


def gaussian_kernels(sigma, max_sigma):
    """Make Gaussian kernels for Gaussian blur.

    Args:
        sigma: kernel sigma
        max_sigma: sigma upper limit as a float (this is used to determine
          the size of kernel required to fit all kernels)

    Returns:
        a (1, kernel_width) numpy array, where
        kernel_width = 2 * (round(max_sigma * 3) * 2 + 1) + 1
    """
    # `size` is used as the half-width of the sample grid below, so the
    # kernel spans [-size, size].
    size = round(max_sigma * 3) * 2 + 1
    x = np.arange(-size, size + 1)[None, :].astype(np.float32)
    y = np.exp(-0.5 * x ** 2 / sigma ** 2)
    return y / (sigma * _ROOT_2_PI)


def cow_masks(mask_size, log_sigma_range, max_sigma, prop_range):
    """Generate a single Cow Mask.

    Gaussian noise of shape `mask_size` is smoothed with a separable Gaussian
    kernel and thresholded so that (in expectation) a proportion `p` of the
    pixels is True.

    Args:
        mask_size: image size as a `(height, width)` tuple
        log_sigma_range: the range of the sigma (smoothing kernel) parameter
          in log-space `(log(sigma_min), log(sigma_max))`
        max_sigma: smoothing sigma upper limit
        prop_range: range from which to draw the proportion `p` that
          controls the proportion of pixel in a mask that are 1 vs 0

    Returns:
        Cow mask as a boolean numpy array with the same shape as the smoothed
        noise (i.e. `mask_size`); True where the smoothed noise is at or
        below the threshold.
    """
    # NOTE: the three np.random draws below (p, sigma, noise) happen in this
    # order; reordering them changes the mask produced for a given seed.

    # Draw the per-mask proportion p
    p = np.random.uniform(prop_range[0], prop_range[1])
    # Compute threshold factors
    threshold_factor = special.erfinv(2 * p - 1) * _ROOT_2

    sigma = np.exp(np.random.uniform(log_sigma_range[0], log_sigma_range[1]))

    noise = np.random.normal(size=mask_size)

    # Generate a kernel for each sigma
    kernel = gaussian_kernels(sigma, max_sigma)
    kernel = kernel.squeeze()
    # kernels in y and x
    krn_y = kernel[None, :]
    krn_x = kernel[:, None]

    # Apply kernels in y and x separately
    smooth_noise = convolve(noise, krn_y, mode='same')
    smooth_noise = convolve(smooth_noise, krn_x, mode='same')

    # Compute mean and std-dev
    noise_mu = smooth_noise.mean(axis=(0,1))
    noise_sigma = smooth_noise.std(axis=(0,1))
    # Compute thresholds
    threshold = threshold_factor * noise_sigma + noise_mu
    # Apply threshold
    mask = (smooth_noise <= threshold).astype(bool)

    return mask


if __name__=="__main__":
    # Demo: generate one mask, print its fill ratio and save it to mask.png.
    import time
    import matplotlib.pyplot as plt

    cow_sigma_range = (20, 60)
    log_sigma_range = (math.log(cow_sigma_range[0]), math.log(cow_sigma_range[1]))
    cow_prop_range = (0.1, 0.5)
    s = time.time()
    max_iou = 0
    # for _ in range(1000):
    #     mask = cow_masks((240, 360), log_sigma_range, cow_sigma_range[1], cow_prop_range)
    #     max_iou = max(max_iou, np.sum(mask) / (240*360))
    # print(time.time() - s)
    # print(max_iou)

    mask = cow_masks((240, 360), log_sigma_range, cow_sigma_range[1], cow_prop_range)
    print(np.sum(mask) / (240*360))
    plt.imshow(mask * 255)
    plt.savefig('mask.png')

================================================
FILE: Datasets/flowlib.py
================================================ """ # ============================== # flowlib.py # library for optical flow processing # Author: Ruoteng Li # Date: 6th Aug 2016 # ============================== """ import png from util_flow import readPFM import numpy as np import matplotlib.colors as cl import matplotlib.pyplot as plt from PIL import Image import cv2 import pdb UNKNOWN_FLOW_THRESH = 1e7 SMALLFLOW = 0.0 LARGEFLOW = 1e8 """ ============= Flow Section ============= """ def show_flow(filename): """ visualize optical flow map using matplotlib :param filename: optical flow file :return: None """ flow = read_flow(filename) img = flow_to_image(flow) plt.imshow(img) plt.show() def point_vec(img,flow,skip=10): skip=20 maxsize=1000. extendfac=2. resize_factor = max(1,int(max(maxsize/img.shape[0], maxsize/img.shape[1]))) meshgrid = np.meshgrid(range(img.shape[1]),range(img.shape[0])) dispimg = cv2.resize(img[:,:,::-1].copy(), None,fx=resize_factor,fy=resize_factor) colorflow = flow_to_image(flow).astype(int) for i in range(img.shape[1]): # x for j in range(img.shape[0]): # y if flow[j,i,2] != 1: continue if j%skip!=0 or i%skip!=0: continue xend = int((meshgrid[0][j,i]+extendfac*flow[j,i,0])*resize_factor) yend = int((meshgrid[1][j,i]+extendfac*flow[j,i,1])*resize_factor) leng = np.linalg.norm(flow[j,i,:2]*extendfac) if leng<1:continue dispimg = cv2.arrowedLine(dispimg, (meshgrid[0][j,i]*resize_factor,meshgrid[1][j,i]*resize_factor),\ (xend,yend), (int(colorflow[j,i,2]),int(colorflow[j,i,1]),int(colorflow[j,i,0])),4,tipLength=2/leng,line_type=cv2.LINE_AA) return dispimg def visualize_flow(flow, mode='Y'): """ this function visualize the input flow :param flow: input flow in array :param mode: choose which color mode to visualize the flow (Y: Ccbcr, RGB: RGB color) :return: None """ if mode == 'Y': # Ccbcr color wheel img = flow_to_image(flow) elif mode == 'RGB': (h, w) = flow.shape[0:2] du = flow[:, :, 0] dv = flow[:, :, 1] valid = flow[:, :, 2] max_flow = 
np.sqrt(du**2+dv**2).max() img = np.zeros((h, w, 3), dtype=np.float64) # angle layer img[:, :, 0] = np.fmod(np.arctan2(dv, du) / (2 * np.pi)+1.,1.) # magnitude layer, normalized to 1 img[:, :, 1] = np.sqrt(du * du + dv * dv) * 8 / max_flow # phase layer img[:, :, 2] = 8 - img[:, :, 1] # clip to [0,1] small_idx = img[:, :, 0:3] < 0 large_idx = img[:, :, 0:3] > 1 img[small_idx] = 0 img[large_idx] = 1 # convert to rgb img = cl.hsv_to_rgb(img) # remove invalid point img[:, :, 0] = img[:, :, 0] * valid img[:, :, 1] = img[:, :, 1] * valid img[:, :, 2] = img[:, :, 2] * valid return img def read_flow(filename): """ read optical flow data from flow file :param filename: name of the flow file :return: optical flow data in numpy array """ if filename.endswith('.flo'): flow = read_flo_file(filename) elif filename.endswith('.png'): flow = read_png_file(filename) elif filename.endswith('.pfm'): flow = read_pfm_file(filename) else: raise Exception('Invalid flow file format!') return flow import numpy as np import os def write_flo(flow, filename): TAG_STRING = b'PIEH' assert type(filename) is str, "file is not str %r" % str(filename) assert filename[-4:] == '.flo', "file ending is not .flo %r" % file[-4:] height, width, nBands = flow.shape assert nBands == 2, "Number of bands = %r != 2" % nBands u = flow[: , : , 0] v = flow[: , : , 1] assert u.shape == v.shape, "Invalid flow shape" height, width = u.shape f = open(filename,'wb') f.write(TAG_STRING) np.array(width).astype(np.int32).tofile(f) np.array(height).astype(np.int32).tofile(f) tmp = np.zeros((height, width*nBands)) tmp[:,np.arange(width)*2] = u tmp[:,np.arange(width)*2 + 1] = v tmp.astype(np.float32).tofile(f) f.close() def write_flow(flow, filename): """ write optical flow in Middlebury .flo format :param flow: optical flow map :param filename: optical flow file path to be saved :return: None """ f = open(filename, 'wb') magic = np.array([202021.25], dtype=np.float32) (height, width) = flow.shape[0:2] w = np.array([width], 
dtype=np.int32) h = np.array([height], dtype=np.int32) magic.tofile(f) w.tofile(f) h.tofile(f) flow.tofile(f) f.close() def save_flow_image(flow, image_file): """ save flow visualization into image file :param flow: optical flow data :param flow_fil :return: None """ flow_img = flow_to_image(flow) img_out = Image.fromarray(flow_img) img_out.save(image_file) def flowfile_to_imagefile(flow_file, image_file): """ convert flowfile into image file :param flow: optical flow data :param flow_fil :return: None """ flow = read_flow(flow_file) save_flow_image(flow, image_file) def segment_flow(flow): h = flow.shape[0] w = flow.shape[1] u = flow[:, :, 0] v = flow[:, :, 1] idx = ((abs(u) > LARGEFLOW) | (abs(v) > LARGEFLOW)) idx2 = (abs(u) == SMALLFLOW) class0 = (v == 0) & (u == 0) u[idx2] = 0.00001 tan_value = v / u class1 = (tan_value < 1) & (tan_value >= 0) & (u > 0) & (v >= 0) class2 = (tan_value >= 1) & (u >= 0) & (v >= 0) class3 = (tan_value < -1) & (u <= 0) & (v >= 0) class4 = (tan_value < 0) & (tan_value >= -1) & (u < 0) & (v >= 0) class8 = (tan_value >= -1) & (tan_value < 0) & (u > 0) & (v <= 0) class7 = (tan_value < -1) & (u >= 0) & (v <= 0) class6 = (tan_value >= 1) & (u <= 0) & (v <= 0) class5 = (tan_value >= 0) & (tan_value < 1) & (u < 0) & (v <= 0) seg = np.zeros((h, w)) seg[class1] = 1 seg[class2] = 2 seg[class3] = 3 seg[class4] = 4 seg[class5] = 5 seg[class6] = 6 seg[class7] = 7 seg[class8] = 8 seg[class0] = 0 seg[idx] = 0 return seg def flow_error(tu, tv, u, v): """ Calculate average end point error :param tu: ground-truth horizontal flow map :param tv: ground-truth vertical flow map :param u: estimated horizontal flow map :param v: estimated vertical flow map :return: End point error of the estimated flow """ smallflow = 0.0 ''' stu = tu[bord+1:end-bord,bord+1:end-bord] stv = tv[bord+1:end-bord,bord+1:end-bord] su = u[bord+1:end-bord,bord+1:end-bord] sv = v[bord+1:end-bord,bord+1:end-bord] ''' stu = tu[:] stv = tv[:] su = u[:] sv = v[:] idxUnknow = (abs(stu) > 
UNKNOWN_FLOW_THRESH) | (abs(stv) > UNKNOWN_FLOW_THRESH) stu[idxUnknow] = 0 stv[idxUnknow] = 0 su[idxUnknow] = 0 sv[idxUnknow] = 0 ind2 = [(np.absolute(stu) > smallflow) | (np.absolute(stv) > smallflow)] index_su = su[ind2] index_sv = sv[ind2] an = 1.0 / np.sqrt(index_su ** 2 + index_sv ** 2 + 1) un = index_su * an vn = index_sv * an index_stu = stu[ind2] index_stv = stv[ind2] tn = 1.0 / np.sqrt(index_stu ** 2 + index_stv ** 2 + 1) tun = index_stu * tn tvn = index_stv * tn ''' angle = un * tun + vn * tvn + (an * tn) index = [angle == 1.0] angle[index] = 0.999 ang = np.arccos(angle) mang = np.mean(ang) mang = mang * 180 / np.pi ''' epe = np.sqrt((stu - su) ** 2 + (stv - sv) ** 2) epe = epe[ind2] mepe = np.mean(epe) return mepe def flow_to_image(flow): """ Convert flow into middlebury color code image :param flow: optical flow map :return: optical flow image in middlebury color """ u = flow[:, :, 0] v = flow[:, :, 1] maxu = -999. maxv = -999. minu = 999. minv = 999. idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH) u[idxUnknow] = 0 v[idxUnknow] = 0 maxu = max(maxu, np.max(u)) minu = min(minu, np.min(u)) maxv = max(maxv, np.max(v)) minv = min(minv, np.min(v)) rad = np.sqrt(u ** 2 + v ** 2) maxrad = max(-1, np.max(rad)) u = u/(maxrad + np.finfo(float).eps) v = v/(maxrad + np.finfo(float).eps) img = compute_color(u, v) idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) img[idx] = 0 return np.uint8(img) def evaluate_flow_file(gt_file, pred_file): """ evaluate the estimated optical flow end point error according to ground truth provided :param gt_file: ground truth file path :param pred_file: estimated optical flow file path :return: end point error, float32 """ # Read flow files and calculate the errors gt_flow = read_flow(gt_file) # ground truth flow eva_flow = read_flow(pred_file) # predicted flow # Calculate errors average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], eva_flow[:, :, 0], eva_flow[:, :, 1]) return average_pe def 
evaluate_flow(gt_flow, pred_flow): """ gt: ground-truth flow pred: estimated flow """ average_pe = flow_error(gt_flow[:, :, 0], gt_flow[:, :, 1], pred_flow[:, :, 0], pred_flow[:, :, 1]) return average_pe """ ============== Disparity Section ============== """ def read_disp_png(file_name): """ Read optical flow from KITTI .png file :param file_name: name of the flow file :return: optical flow data in matrix """ image_object = png.Reader(filename=file_name) image_direct = image_object.asDirect() image_data = list(image_direct[2]) (w, h) = image_direct[3]['size'] channel = len(image_data[0]) / w flow = np.zeros((h, w, channel), dtype=np.uint16) for i in range(len(image_data)): for j in range(channel): flow[i, :, j] = image_data[i][j::channel] return flow[:, :, 0] / 256 def disp_to_flowfile(disp, filename): """ Read KITTI disparity file in png format :param disp: disparity matrix :param filename: the flow file name to save :return: None """ f = open(filename, 'wb') magic = np.array([202021.25], dtype=np.float32) (height, width) = disp.shape[0:2] w = np.array([width], dtype=np.int32) h = np.array([height], dtype=np.int32) empty_map = np.zeros((height, width), dtype=np.float32) data = np.dstack((disp, empty_map)) magic.tofile(f) w.tofile(f) h.tofile(f) data.tofile(f) f.close() """ ============== Image Section ============== """ def read_image(filename): """ Read normal image of any format :param filename: name of the image file :return: image data in matrix uint8 type """ img = Image.open(filename) im = np.array(img) return im def warp_image(im, flow): """ Use optical flow to warp image to the next :param im: image to warp :param flow: optical flow :return: warped image """ from scipy import interpolate image_height = im.shape[0] image_width = im.shape[1] flow_height = flow.shape[0] flow_width = flow.shape[1] n = image_height * image_width (iy, ix) = np.mgrid[0:image_height, 0:image_width] (fy, fx) = np.mgrid[0:flow_height, 0:flow_width] fx = fx.astype(np.float64) fy = 
fy.astype(np.float64) fx += flow[:,:,0] fy += flow[:,:,1] mask = np.logical_or(fx <0 , fx > flow_width) mask = np.logical_or(mask, fy < 0) mask = np.logical_or(mask, fy > flow_height) fx = np.minimum(np.maximum(fx, 0), flow_width) fy = np.minimum(np.maximum(fy, 0), flow_height) points = np.concatenate((ix.reshape(n,1), iy.reshape(n,1)), axis=1) xi = np.concatenate((fx.reshape(n, 1), fy.reshape(n,1)), axis=1) warp = np.zeros((image_height, image_width, im.shape[2])) for i in range(im.shape[2]): channel = im[:, :, i] plt.imshow(channel, cmap='gray') values = channel.reshape(n, 1) new_channel = interpolate.griddata(points, values, xi, method='cubic') new_channel = np.reshape(new_channel, [flow_height, flow_width]) new_channel[mask] = 1 warp[:, :, i] = new_channel.astype(np.uint8) return warp.astype(np.uint8) """ ============== Others ============== """ def pfm_to_flo(pfm_file): flow_filename = pfm_file[0:pfm_file.find('.pfm')] + '.flo' (data, scale) = readPFM(pfm_file) flow = data[:, :, 0:2] write_flow(flow, flow_filename) def scale_image(image, new_range): """ Linearly scale the image into desired range :param image: input image :param new_range: the new range to be aligned :return: image normalized in new range """ min_val = np.min(image).astype(np.float32) max_val = np.max(image).astype(np.float32) min_val_new = np.array(min(new_range), dtype=np.float32) max_val_new = np.array(max(new_range), dtype=np.float32) scaled_image = (image - min_val) / (max_val - min_val) * (max_val_new - min_val_new) + min_val_new return scaled_image.astype(np.uint8) def compute_color(u, v): """ compute optical flow color map :param u: optical flow horizontal map :param v: optical flow vertical map :return: optical flow in color code """ [h, w] = u.shape img = np.zeros([h, w, 3]) nanIdx = np.isnan(u) | np.isnan(v) u[nanIdx] = 0 v[nanIdx] = 0 colorwheel = make_color_wheel() ncols = np.size(colorwheel, 0) rad = np.sqrt(u**2+v**2) a = np.arctan2(-v, -u) / np.pi fk = (a+1) / 2 * (ncols - 1) + 
1 k0 = np.floor(fk).astype(int) k1 = k0 + 1 k1[k1 == ncols+1] = 1 f = fk - k0 for i in range(0, np.size(colorwheel,1)): tmp = colorwheel[:, i] col0 = tmp[k0-1] / 255 col1 = tmp[k1-1] / 255 col = (1-f) * col0 + f * col1 idx = rad <= 1 col[idx] = 1-rad[idx]*(1-col[idx]) notidx = np.logical_not(idx) col[notidx] *= 0.75 img[:, :, i] = np.uint8(np.floor(255 * col*(1-nanIdx))) return img def make_color_wheel(): """ Generate color wheel according Middlebury color code :return: Color wheel """ RY = 15 YG = 6 GC = 4 CB = 11 BM = 13 MR = 6 ncols = RY + YG + GC + CB + BM + MR colorwheel = np.zeros([ncols, 3]) col = 0 # RY colorwheel[0:RY, 0] = 255 colorwheel[0:RY, 1] = np.transpose(np.floor(255*np.arange(0, RY) / RY)) col += RY # YG colorwheel[col:col+YG, 0] = 255 - np.transpose(np.floor(255*np.arange(0, YG) / YG)) colorwheel[col:col+YG, 1] = 255 col += YG # GC colorwheel[col:col+GC, 1] = 255 colorwheel[col:col+GC, 2] = np.transpose(np.floor(255*np.arange(0, GC) / GC)) col += GC # CB colorwheel[col:col+CB, 1] = 255 - np.transpose(np.floor(255*np.arange(0, CB) / CB)) colorwheel[col:col+CB, 2] = 255 col += CB # BM colorwheel[col:col+BM, 2] = 255 colorwheel[col:col+BM, 0] = np.transpose(np.floor(255*np.arange(0, BM) / BM)) col += + BM # MR colorwheel[col:col+MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR)) colorwheel[col:col+MR, 0] = 255 return colorwheel def read_flo_file(filename): """ Read from Middlebury .flo file :param flow_file: name of the flow file :return: optical flow data in matrix """ f = open(filename, 'rb') magic = np.fromfile(f, np.float32, count=1) data2d = None if 202021.25 != magic: print('Magic number incorrect. 
Invalid .flo file') else: w = np.fromfile(f, np.int32, count=1) h = np.fromfile(f, np.int32, count=1) #print("Reading %d x %d flow file in .flo format" % (h, w)) flow = np.ones((h[0],w[0],3)) data2d = np.fromfile(f, np.float32, count=2 * w[0] * h[0]) # reshape data into 3D array (columns, rows, channels) data2d = np.resize(data2d, (h[0], w[0], 2)) flow[:,:,:2] = data2d f.close() return flow def read_png_file(flow_file): """ Read from KITTI .png file :param flow_file: name of the flow file :return: optical flow data in matrix """ flow = cv2.imread(flow_file,-1)[:,:,::-1].astype(np.float64) # flow_object = png.Reader(filename=flow_file) # flow_direct = flow_object.asDirect() # flow_data = list(flow_direct[2]) # (w, h) = flow_direct[3]['size'] # #print("Reading %d x %d flow file in .png format" % (h, w)) # flow = np.zeros((h, w, 3), dtype=np.float64) # for i in range(len(flow_data)): # flow[i, :, 0] = flow_data[i][0::3] # flow[i, :, 1] = flow_data[i][1::3] # flow[i, :, 2] = flow_data[i][2::3] invalid_idx = (flow[:, :, 2] == 0) flow[:, :, 0:2] = (flow[:, :, 0:2] - 2 ** 15) / 64.0 flow[invalid_idx, 0] = 0 flow[invalid_idx, 1] = 0 return flow def read_pfm_file(flow_file): """ Read from .pfm file :param flow_file: name of the flow file :return: optical flow data in matrix """ (data, scale) = readPFM(flow_file) return data # fast resample layer def resample(img, sz): """ img: flow map to be resampled sz: new flow map size. 
Must be [height,weight] """ original_image_size = img.shape in_height = img.shape[0] in_width = img.shape[1] out_height = sz[0] out_width = sz[1] out_flow = np.zeros((out_height, out_width, 2)) # find scale height_scale = float(in_height) / float(out_height) width_scale = float(in_width) / float(out_width) [x,y] = np.meshgrid(range(out_width), range(out_height)) xx = x * width_scale yy = y * height_scale x0 = np.floor(xx).astype(np.int32) x1 = x0 + 1 y0 = np.floor(yy).astype(np.int32) y1 = y0 + 1 x0 = np.clip(x0,0,in_width-1) x1 = np.clip(x1,0,in_width-1) y0 = np.clip(y0,0,in_height-1) y1 = np.clip(y1,0,in_height-1) Ia = img[y0,x0,:] Ib = img[y1,x0,:] Ic = img[y0,x1,:] Id = img[y1,x1,:] wa = (y1-yy) * (x1-xx) wb = (yy-y0) * (x1-xx) wc = (y1-yy) * (xx-x0) wd = (yy-y0) * (xx-x0) out_flow[:,:,0] = (Ia[:,:,0]*wa + Ib[:,:,0]*wb + Ic[:,:,0]*wc + Id[:,:,0]*wd) * out_width / in_width out_flow[:,:,1] = (Ia[:,:,1]*wa + Ib[:,:,1]*wb + Ic[:,:,1]*wc + Id[:,:,1]*wd) * out_height / in_height return out_flow ================================================ FILE: Datasets/segmask_gt.py ================================================ """ # ============================== # segmask_gt.py # library to generate groundtruth # segmentation mask given flow and # disparity change # (Adapted from code for rigidmask) # Author: Shihao Shen # Date: 14th Sep 2022 # ============================== """ import argparse import os import os.path import glob import numpy as np import cv2 from PIL import Image from flowlib import read_flow, readPFM, flow_to_image def dataloader(filepath, fpass='frames_cleanpass', level=6): iml0 = [] iml1 = [] flowl0 = [] disp0 = [] dispc = [] calib = [] level_stars = '/*'*level candidate_pool = glob.glob('%s/optical_flow%s'%(filepath,level_stars)) for flow_path in sorted(candidate_pool): # if 'TEST' in flow_path: continue if 'flower_storm_x2/into_future/right/OpticalFlowIntoFuture_0023_R.pfm' in flow_path: print('Skipping %s' % flow_path) continue if 
'flower_storm_x2/into_future/left/OpticalFlowIntoFuture_0023_L.pfm' in flow_path:
            print('Skipping %s' % flow_path)
            continue
        if 'flower_storm_augmented0_x2/into_future/right/OpticalFlowIntoFuture_0023_R.pfm' in flow_path:
            print('Skipping %s' % flow_path)
            continue
        if 'flower_storm_augmented0_x2/into_future/left/OpticalFlowIntoFuture_0023_L.pfm' in flow_path:
            print('Skipping %s' % flow_path)
            continue
        # if 'FlyingThings' in flow_path and '_0014_' in flow_path:
        #     print('Skipping %s' % flow_path)
        #     continue
        # if 'FlyingThings' in flow_path and '_0015_' in flow_path:
        #     print('Skipping %s' % flow_path)
        #     continue

        # frame id is the second-to-last '_'-separated token of the file name
        idd = flow_path.split('/')[-1].split('_')[-2]
        if 'into_future' in flow_path:
            idd_p1 = '%04d'%(int(idd)+1)
        else:
            idd_p1 = '%04d'%(int(idd)-1)
        # only keep the sample when the flow file of the neighbouring frame exists
        if os.path.exists(flow_path.replace(idd,idd_p1)):
            d0_path = flow_path.replace('/into_future/','/').replace('/into_past/','/').replace('optical_flow','disparity')
            d0_path = '%s/%s.pfm'%(d0_path.rsplit('/',1)[0],idd)
            dc_path = flow_path.replace('optical_flow','disparity_change')
            dc_path = '%s/%s.pfm'%(dc_path.rsplit('/',1)[0],idd)
            im_path = flow_path.replace('/into_future/','/').replace('/into_past/','/').replace('optical_flow',fpass)
            im0_path = '%s/%s.png'%(im_path.rsplit('/',1)[0],idd)
            im1_path = '%s/%s.png'%(im_path.rsplit('/',1)[0],idd_p1)

            # This will skip any sequence that contains less than 10 poses in camera_data.txt
            # (the check below is: a FlyingThings path whose camera_data.txt does not have exactly 40 lines)
            with open('%s/camera_data.txt'%(im0_path.replace(fpass,'camera_data').rsplit('/',2)[0]),'r') as f:
                if 'FlyingThings' in flow_path and len(f.readlines())!=40:
                    print('Skipping %s' % flow_path)
                    continue

            iml0.append(im0_path)
            iml1.append(im1_path)
            flowl0.append(flow_path)
            disp0.append(d0_path)
            dispc.append(dc_path)
            calib.append('%s/camera_data.txt'%(im0_path.replace(fpass,'camera_data').rsplit('/',2)[0]))
    return iml0, iml1, flowl0, disp0, dispc, calib


def default_loader(path):
    '''
    Open the image at path and convert it to RGB.
    '''
    return Image.open(path).convert('RGB')


def flow_loader(path):
    '''
    Load a flow file. For paths containing '.pfm' the data is read with
    readPFM and its third channel is set to 1; any other path goes through
    read_flow.
    '''
    if '.pfm' in path:
        data = readPFM(path)[0]
        data[:,:,2] = 1
        return data
    else:
        return read_flow(path)


def load_exts(cam_file):
    '''
    Parse a camera file into two lists of 4x4 matrices.
    Lines containing 'L ' go to the first list and lines containing 'R ' to the
    second; the text after the first two characters is split on single spaces,
    converted to float and reshaped to (4, 4).
    '''
    with open(cam_file, 'r') as f:
        lines = f.readlines()

    l_exts = []
    r_exts = []
    for l in lines:
        if 'L ' in l:
            l_exts.append(np.asarray([float(i) for i in l[2:].strip().split(' ')]).reshape(4,4))
        if 'R ' in l:
            r_exts.append(np.asarray([float(i) for i in l[2:].strip().split(' ')]).reshape(4,4))
    return l_exts,r_exts


def disparity_loader(path):
    '''
    Load a disparity-like map: '.png' paths are read with PIL, converted to
    float32 and divided by 256; anything else is read with readPFM.
    '''
    if '.png' in path:
        data = Image.open(path)
        data = np.ascontiguousarray(data,dtype=np.float32)/256
        return data
    else:
        return readPFM(path)[0]


# triangulation
def triangulation(disp, xcoord, ycoord, bl=1, fl = 450, cx = 479.5, cy = 269.5):
    '''
    Back-project pixels to 3D using depth = bl * fl / disp.
    disp: disparity map
    xcoord, ycoord: pixel coordinates, same shape as disp
    bl, fl, cx, cy: baseline, focal length and principal point
    Returns a (4, N) array: rows X, Y, Z flattened over all pixels, plus a row of ones.
    '''
    depth = bl*fl / disp # 450px->15mm focal length
    X = (xcoord - cx) * depth / fl
    Y = (ycoord - cy) * depth / fl
    Z = depth
    P = np.concatenate((X[np.newaxis],Y[np.newaxis],Z[np.newaxis]),0).reshape(3,-1)
    P = np.concatenate((P,np.ones((1,P.shape[-1]))),0)
    return P


def exp_loader(index, iml0s, iml1s, flowl0s, disp0s=None, dispcs=None, calibs=None):
    '''
    index: index of the frame in the file lists below
    iml0s: a file list of the first frames
    iml1s: a file list of the second frames
    flowl0s: a file list of the optical flow w.r.t. iml0s
    disp0s: a file list of the disparity w.r.t. iml0s
    dispcs: a file list of the disparity change w.r.t. disp0s
    calibs: a file list of the camera extrinsics
    NOTE: disp0s, dispcs and calibs default to None but are indexed
    unconditionally below, so they are effectively required.

    Returns (iml0, iml1, flowl0, depthflow, intr, RT01):
    iml0, iml1: the two frames as arrays
    flowl0: float32 flow; channel 2 is 1 only where the flow was valid and both disparities are non-zero
    depthflow: H x W x 8 (depth, rectified scene flow, scene flow, object index)
    intr: list [[fl],[cx],[cy],[bl],[1],[0],[0],[1],[0],[0]]
    RT01: float32 6-vector, Rodrigues rotation followed by translation
    '''
    iml0 = iml0s[index]
    iml1 = iml1s[index]
    flowl0 = flowl0s[index]

    iml0 = default_loader(iml0)
    iml1 = default_loader(iml1)

    flowl0 = flow_loader(flowl0)
    # invalidate infinite flow, then zero the flow wherever it is invalid
    flowl0[:,:,-1][flowl0[:,:,0] == np.inf] = 0
    flowl0[:,:,0][~flowl0[:,:,2].astype(bool)] = 0
    flowl0[:,:,1][~flowl0[:,:,2].astype(bool)] = 0
    flowl0 = np.ascontiguousarray(flowl0, dtype=np.float32)
    flowl0[np.isnan(flowl0)] = 1e6

    bl = 1
    # focal length is chosen from the calibration path
    if '15mm_' in calibs[index]:
        fl = 450
    else:
        fl = 1050
    cx = 479.5
    cy = 269.5
    intr = [[fl],[cx],[cy],[bl],[1],[0],[0],[1],[0],[0]]

    # d1: disparity of the first frame; d2: disparity after adding the disparity change
    d1 = np.abs(disparity_loader(disp0s[index]))
    d2 = np.abs(disparity_loader(dispcs[index]) + d1)

    flowl0[:,:,2] = np.logical_and(np.logical_and(flowl0[:,:,2] == 1, d1 != 0), d2 != 0).astype(float)

    shape = d1.shape
    mesh = np.meshgrid(range(shape[1]), range(shape[0]))
    xcoord = mesh[0].astype(float)
    ycoord = mesh[1].astype(float)

    # triangulation in two frames
    P0 = triangulation(d1, xcoord, ycoord, bl=bl, fl=fl, cx=cx, cy=cy)
    P1 = triangulation(d2, xcoord + flowl0[:,:,0], ycoord + flowl0[:,:,1], bl=bl, fl=fl, cx=cx, cy=cy)
    depth0 = P0[2]
    depth1 = P1[2]

    depth0 = depth0.reshape(shape).astype(np.float32)
    flow3d = (P1-P0)[:3].reshape((3,)+shape).transpose((1,2,0))

    # frame index relative to the number at the end of the first line of the calibration file
    fid = int(flowl0s[index].split('/')[-1].split('_')[1])
    with open(calibs[index], 'r') as f:
        fid = fid - int(f.readline().split(' ')[-1])
    l_exts, r_exts= load_exts(calibs[index])
    if '/right/' in iml0s[index]:
        exts = r_exts
    else:
        exts = l_exts

    if '/into_future/' in flowl0s[index]:
        # the prints only report the offending flow file; an out-of-range index still raises below
        if (fid + 1) > len(exts) - 1: print(flowl0s[index])
        if (fid) > len(exts) - 1: print(flowl0s[index])
        ext1 = exts[fid+1]
        ext0 = exts[fid]
    else:
        if (fid - 1) > len(exts) - 1: print(flowl0s[index])
        if (fid) > len(exts) - 1: print(flowl0s[index])
        ext1 = exts[fid-1]
        ext0 = exts[fid]
    camT = np.eye(4); camT[1,1] = -1; camT[2,2] = -1 # Sceneflow uses Blender's coordinate system
    RT01 = camT.dot(np.linalg.inv(ext0)).dot(ext1).dot(camT) # ext is from camera space to world space

    rect_flow3d = (RT01[:3,:3].dot(P1[:3])-P0[:3]).reshape((3,)+shape).transpose((1,2,0)) # rectified scene flow
    depthflow = np.concatenate((depth0[:,:,np.newaxis], rect_flow3d, flow3d), 2)
    RT01 = np.concatenate((cv2.Rodrigues(RT01[:3,:3])[0][:,0], RT01[:3,-1])).astype(np.float32)

    # object mask
    fnum = int(iml0s[index].split('/')[-1].split('.png')[0])
    obj_fname = '%s/%04d.pfm'%(flowl0s[index].replace('/optical_flow','object_index').replace('into_past/','/').replace('into_future/','/').rsplit('/',1)[0],fnum)
    obj_idx = disparity_loader(obj_fname)
    depthflow = np.concatenate((depthflow, obj_idx[:,:,np.newaxis]), 2)
    # depthflow dimension: H x W x 8 (depth=1 + rectified_flow3d=3 + flow3d=3 + object_segmentation=1)

    iml1 = np.asarray(iml1)
    iml0 = np.asarray(iml0)

    return iml0, iml1, flowl0, depthflow, intr, RT01


def motionmask(flowl0, depthflow, RT01):
    '''
    flowl0: optical flow. [H, W, 3]
    depthflow: a concatenation of depth, rectified scene flow, scene flow, and object segmentation. [H, W, 8]
    RT01: camera motion from the future frame to the current frame. [6, ]
    Returns a boolean [H, W] mask, True where the 3D motion left after
    subtracting the background translation exceeds 0.01; pixels without valid
    flow or with depth outside (0.01, 100) are always False.
    '''
    valid_mask = (flowl0[:,:,2] == 1) & (depthflow[:,:,0] < 100) & (depthflow[:,:,0] > 0.01) # valid flow & valid depth

    Tglobal_gt = -RT01[3:, np.newaxis, np.newaxis] # background translation
    Tlocal_gt = depthflow[:,:,1:4].transpose(2, 0, 1) # point translation (after removing rotation)
    m3d_gt = np.linalg.norm(Tlocal_gt - Tglobal_gt, 2, 0) # abs. motion
    fgmask_gt = m3d_gt * 100 > 1
    fgmask_gt[~valid_mask] = False

    return fgmask_gt


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='segmask_gt_generation')
    parser.add_argument('--database',
                        help='path to the database (required)')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='generate visualization')
    parser.add_argument('--frames_pass', default='frames_cleanpass',
                        help='which pass to use, either clean or final')
    parser.add_argument('--dataset',
                        help='choose from FlyingThings3D, Driving, Monkaa')
    args = parser.parse_args()

    if args.debug:
        os.makedirs('%s/%s/results_viz' % (args.database, args.dataset), exist_ok=True)

    # number of directory levels below optical_flow that are globbed by dataloader
    if args.dataset == 'Monkaa':
        level = 4
    else:
        level = 6

    iml0s, iml1s, flowl0s, disp0s, dispcs, calibs = dataloader('%s/%s/' % (args.database, args.dataset),
                                                                level=level, fpass=args.frames_pass)

    print("Generating %s masks..." % len(flowl0s))
    for i in range(len(iml0s)):
        idd = flowl0s[i].split('/')[-1].split('_')[-2]
        mask_fn = '%s/%s.npy' % (os.path.dirname(flowl0s[i]).replace('optical_flow', 'rigidmask'), idd)
        # masks that already exist are not regenerated
        if os.path.exists(mask_fn):
            print(i)
            continue

        os.makedirs(os.path.dirname(mask_fn), exist_ok=True)

        iml0, iml1, flowl0, depthflow, intr, RT01 = exp_loader(i, iml0s, iml1s, flowl0s, disp0s, dispcs, calibs)
        fgmask = motionmask(flowl0, depthflow, RT01)

        np.save(mask_fn, fgmask)

        if args.debug:
            # only one fixed sub-sequence per dataset is visualized
            if args.dataset == 'Driving' and 'rigidmask/15mm_focallength/scene_forwards/fast/left' not in mask_fn:
                continue
            elif args.dataset == 'Monkaa' and 'rigidmask/eating_camera2_x2/left' not in mask_fn:
                continue
            elif args.dataset == 'FlyingThings3D' and not ('rigidmask/TEST/A' in mask_fn and 'into_future/left' in mask_fn):
                continue

            print("Visualizing %s" % mask_fn)
            flowl0viz = flow_to_image(flowl0)
            maskviz = np.stack((fgmask * 255.0, )*3, axis=-1).astype(np.uint8)
            inputs = np.concatenate([iml0, flowl0viz, maskviz], axis=1)
            cv2.imwrite('%s/%s/results_viz/%s.png' % (args.database, args.dataset, str(i).zfill(5)),
cv2.cvtColor(inputs, cv2.COLOR_RGB2BGR)) ================================================ FILE: Datasets/tartanTrajFlowDataset.py ================================================ """ # ============================== # tartanTrajFlowDataset.py # library for DytanVO data I/O # Author: Wenshan Wang, Shihao Shen # Date: 3rd Jan 2023 # ============================== """ import numpy as np import cv2 from torch.utils.data import Dataset, DataLoader from os import listdir from evaluator.transformation import pos_quats2SEs, pose2motion, SEs2ses from .utils import make_intrinsics_layer class TrajFolderDataset(Dataset): """scene flow synthetic dataset. """ def __init__(self, imgfolder, transform = None, focalx = 320.0, focaly = 320.0, centerx = 320.0, centery = 240.0): files = listdir(imgfolder) self.rgbfiles = [(imgfolder +'/'+ ff) for ff in files if (ff.endswith('.png') or ff.endswith('.jpg'))] self.rgbfiles.sort() self.imgfolder = imgfolder print('Find {} image files in {}'.format(len(self.rgbfiles), imgfolder)) self.N = len(self.rgbfiles) - 1 # self.N = len(self.lines) self.transform = transform self.focalx = focalx self.focaly = focaly self.centerx = centerx self.centery = centery def __len__(self): return self.N def __getitem__(self, idx): imgfile1 = self.rgbfiles[idx].strip() imgfile2 = self.rgbfiles[idx+1].strip() img1 = cv2.imread(imgfile1) img2 = cv2.imread(imgfile2) res = {'img1': img1, 'img2': img2} h, w, _ = img1.shape intrinsicLayer = make_intrinsics_layer(w, h, self.focalx, self.focaly, self.centerx, self.centery) res['intrinsic'] = intrinsicLayer if self.transform: res = self.transform(res) res['img1_raw'] = img1 res['img2_raw'] = img2 return res ================================================ FILE: Datasets/util_flow.py ================================================ """ # ============================== # util_flow.py # library for optical flow processing # Author: Gengshan Yang # Date: 10th Feb 2021 # ============================== """ import math import 
UNKNOWN_FLOW_THRESH = 1e9
UNKNOWN_FLOW = 1e10

# Middlebury checks
TAG_STRING = 'PIEH'    # use this when WRITING the file
TAG_FLOAT = 202021.25  # check for this when READING the file


def readPFM(file):
    """Read a PFM image.

    Args:
        file: path of the PFM file.

    Returns:
        ``(data, scale)`` where ``data`` is a float array of shape (H, W, 3)
        for a ``PF`` header or (H, W) for a ``Pf`` header, flipped vertically
        with ``np.flipud`` after loading, and ``scale`` is the absolute value
        of the scale stored in the header.

    Raises:
        Exception: if the magic line or the dimension line is malformed.
    """
    import re

    # The context manager closes the handle even when parsing fails.
    with open(file, 'rb') as fh:
        header = fh.readline().rstrip()
        if header == b'PF':
            color = True
        elif header == b'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(rb'^(\d+)\s(\d+)\s$', fh.readline())
        if dim_match:
            width, height = map(int, dim_match.groups())
        else:
            raise Exception('Malformed PFM header.')

        scale = float(fh.readline().rstrip())
        if scale < 0:  # little-endian
            endian = '<'
            scale = -scale
        else:
            endian = '>'  # big-endian

        data = np.fromfile(fh, endian + 'f')

    shape = (height, width, 3) if color else (height, width)
    data = np.reshape(data, shape)
    data = np.flipud(data)
    return data, scale


def save_pfm(file, image, scale = 1):
    """Write ``image`` as PFM to an already opened file object.

    The rows are written in the order given; no vertical flip is applied
    here (``readPFM`` flips on load), so callers that need a round trip must
    flip the image themselves.

    Args:
        file: open, writable file object backed by a real file descriptor
            (required by ``ndarray.tofile``).  Text-mode and binary-mode
            handles are both accepted.
        image: float32 array of shape (H, W, 3), (H, W, 1) or (H, W).
        scale: positive scale factor; stored negated for little-endian data.

    Raises:
        Exception: on a non-float32 dtype or an unsupported shape.
    """
    import io
    import sys

    if image.dtype.name != 'float32':
        raise Exception('Image dtype must be float32.')

    ndim = len(image.shape)
    if ndim == 3 and image.shape[2] == 3:  # color image
        color = True
    elif ndim == 2 or (ndim == 3 and image.shape[2] == 1):  # greyscale
        color = False
    else:
        raise Exception('Image must have H x W x 3, H x W x 1 or H x W dimensions.')

    endian = image.dtype.byteorder
    if endian == '<' or (endian == '=' and sys.byteorder == 'little'):
        scale = -scale

    header = '%s\n%d %d\n%f\n' % ('PF' if color else 'Pf',
                                  image.shape[1], image.shape[0], scale)
    # A binary handle only accepts bytes; writing str there raises TypeError.
    if isinstance(file, io.TextIOBase):
        file.write(header)
    else:
        file.write(header.encode('ascii'))

    image.tofile(file)


def WriteMiddleburyFloFile(path, width, height, u, v, mask=None):
    """Write a .FLO file as specified by Middlebury.

    Args:
        path: output file path.
        width, height: image size; ``u`` and ``v`` must each provide
            ``width * height`` values.
        u, v: flat sequences of horizontal / vertical flow values.
        mask: optional flat iterable of validity flags; entries that are
            falsy are written as ``UNKNOWN_FLOW``.
    """
    if mask is not None:
        # Materialize once: a one-shot iterator (e.g. a Python 3 ``map``)
        # would otherwise be drained alternately by u and v and misalign them.
        mask = list(mask)
        u_masked = [x if valid else UNKNOWN_FLOW for x, valid in zip(u, mask)]
        v_masked = [x if valid else UNKNOWN_FLOW for x, valid in zip(v, mask)]
    else:
        u_masked = u
        v_masked = v

    fmt = '%df' % (width * height * 2)
    # Interleave lists: u0, v0, u1, v1, ...
    data = [x for pair in zip(u_masked, v_masked) for x in pair]
    with open(path, 'wb') as fil:
        fil.write(str.encode(TAG_STRING))
        fil.write(struct.pack('i', width))
        fil.write(struct.pack('i', height))
        fil.write(struct.pack(fmt, *data))


def write_flow(path, flow):
    """Encode a flow field as a 16-bit image and write it with OpenCV.

    Channels 0 and 1 are stored as ``value * 64 + 2**15``; pixels whose
    channel 2 equals 0 are treated as invalid and get 0 in both flow
    channels.  The channel order is reversed before writing.

    Args:
        path: output image path.
        flow: array of shape (H, W, 3); it is left unmodified.
    """
    # Work on a copy so the caller's array is not rescaled as a side effect.
    encoded = np.array(flow)
    invalid_idx = (encoded[:, :, 2] == 0)
    encoded[:, :, 0:2] = encoded[:, :, 0:2]*64. + 2 ** 15
    encoded[invalid_idx, 0] = 0
    encoded[invalid_idx, 1] = 0
    encoded = encoded.astype(np.uint16)
    cv2.imwrite(path, encoded[:, :, ::-1])


def WriteKittiPngFile(path, width, height, u, v, mask=None):
    """Write a 16-bit .PNG file as specified by KITTI-2015 (flow).

    Args:
        path: output file path.
        width, height: image size.
        u, v: flat sequences of float flow values.
        mask: flat sequence marking the *valid* pixels; when omitted every
            pixel is written as valid.
    """
    if mask is None:
        # The declared default used to crash inside zip(); treat as all-valid.
        mask = [1] * (width * height)

    data = array.array('H', [0]) * width * height * 3
    for i, (u_, v_, mask_) in enumerate(zip(u, v, mask)):
        data[3*i] = int(u_*64.0 + 2**15)
        data[3*i+1] = int(v_*64.0 + 2**15)
        data[3*i+2] = int(mask_)

    with open(path, 'wb') as png_file:
        png_writer = png.Writer(width=width, height=height, bitdepth=16, compression=3, greyscale=False)
        png_writer.write_array(png_file, data)


def ConvertMiddleburyFloToKittiPng(src_path, dest_path):
    """Convert a Middlebury .FLO file into a KITTI 16-bit flow PNG."""
    width, height, u, v, mask = ReadMiddleburyFloFile(src_path)
    WriteKittiPngFile(dest_path, width, height, u, v, mask=mask)


def ConvertKittiPngToMiddleburyFlo(src_path, dest_path):
    """Convert a KITTI 16-bit flow PNG into a Middlebury .FLO file."""
    width, height, u, v, mask = ReadKittiPngFile(src_path)
    WriteMiddleburyFloFile(dest_path, width, height, u, v, mask=mask)


def ParseFilenameKitti(filename):
    """Split a KITTI filename of the form ``seq_frameno.ext``.

    Returns ``(seq, frameno, ext)`` with ``frameno`` as int and ``ext``
    including the dot.  ``seq`` is everything before the last underscore, so
    it may contain a dataset name used as prefix.
    """
    dot = filename.rfind('.')
    underscore = filename.rfind('_')
    ext = filename[dot:]
    frameno = int(filename[underscore+1:dot])
    seq = filename[:underscore]
    return seq, frameno, ext


def read_calib_file(filepath):
    """Read in a calibration file and parse it into a dictionary.

    Each ``key: v1 v2 ...`` line becomes ``data[key] = np.array([...])``.
    Lines without a colon (e.g. blank lines) and lines whose values are not
    all floats (e.g. dates) are skipped.
    """
    data = {}

    with open(filepath, 'r') as f:
        for line in f:
            if ':' not in line:
                # Previously a blank line made the tuple unpacking raise.
                continue
            key, value = line.split(':', 1)
            # The only non-float values in these files are dates, which
            # we don't care about anyway
            try:
                data[key] = np.array([float(x) for x in value.split()])
            except ValueError:
                pass

    return data
class Compose(object):
    """Chain several transforms into a single callable.

    Args:
        transforms (List[Transform]): callables applied in list order; each
            one receives the output of the previous one.

    Example:
        >>> transforms.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img):
        """Feed ``img`` through every transform and return the final result."""
        result = img
        for transform in self.transforms:
            result = transform(result)
        return result
class CropCenter(object):
    """Crop every entry of a sample dict around its center.

    If the data is smaller than the target along an axis, the whole sample is
    first enlarged with ``ResizeData``; each axis is scaled independently,
    and only when it is too small.
    """

    def __init__(self, size):
        # A single number means a square (size, size) crop; otherwise (h, w).
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, sample):
        kks = list(sample.keys())
        th, tw = self.size
        # The first entry defines the current size of the whole sample.
        hh, ww = sample[kks[0]].shape[0], sample[kks[0]].shape[1]
        if ww == tw and hh == th:
            return sample

        # resize the image if the image size is smaller than the target size
        scale_h = max(1, float(th)/hh)
        scale_w = max(1, float(tw)/ww)

        if scale_h > 1 or scale_w > 1:
            w = int(round(ww * scale_w))  # w after resize
            h = int(round(hh * scale_h))  # h after resize
            sample = ResizeData(size=(h, w))(sample)
        else:
            w, h = ww, hh

        x1 = int((w-tw)/2)
        y1 = int((h-th)/2)

        for kk in kks:
            if sample[kk] is None:
                continue
            sample[kk] = sample[kk][y1:y1+th, x1:x1+tw, ...]

        return sample


class ResizeData(object):
    """Resize every entry of a sample dict to a fixed (h, w) size.

    Uses bilinear interpolation.  If the sample has a ``'flow'`` entry, its
    channels 0 and 1 are additionally multiplied by the width / height scale
    factor.
    """

    def __init__(self, size):
        # A single number means a square (size, size) target; otherwise (h, w).
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, sample):
        kks = list(sample.keys())
        th, tw = self.size
        # The first entry defines the current size of the whole sample.
        h, w = sample[kks[0]].shape[0], sample[kks[0]].shape[1]
        if w == tw and h == th:
            return sample
        scale_w = float(tw)/w
        scale_h = float(th)/h

        for kk in kks:
            if sample[kk] is None:
                continue
            # cv2.resize takes the target size as (width, height).
            sample[kk] = cv2.resize(sample[kk], (tw, th), interpolation=cv2.INTER_LINEAR)

        if 'flow' in sample:
            sample['flow'][..., 0] = sample['flow'][..., 0] * scale_w
            sample['flow'][..., 1] = sample['flow'][..., 1] * scale_h

        return sample


class ToTensor(object):
    """Convert every numpy entry of a sample dict to a float32 torch tensor.

    3-D arrays are transposed from (H, W, C) to (C, H, W), 2-D arrays get a
    leading dummy channel, and any result with exactly three channels is
    divided by 255.
    """

    def __call__(self, sample):
        for kk in list(sample):
            data = sample[kk].astype(np.float32)
            if data.ndim == 3:  # transpose image-like data
                data = data.transpose(2, 0, 1)
            elif data.ndim == 2:
                data = data.reshape((1,) + data.shape)  # add a dummy channel

            if data.ndim == 3 and data.shape[0] == 3:  # normalization of rgb images
                data = data/255.0

            sample[kk] = torch.from_numpy(data.copy())  # copy to make memory continuous

        return sample


def tensor2img(tensImg, mean, std):
    """Convert a normalized (C, H, W) tensor to an (H, W, C) uint8 array.

    Each channel is de-normalized as ``channel * std + mean``, the result is
    multiplied by 255 and cast to ``uint8``.  Intended for visualization.

    Args:
        tensImg: tensor of shape (C, H, W); it is left unmodified.
        mean, std: per-channel sequences used for the de-normalization.

    Returns:
        numpy array of shape (H, W, C) and dtype uint8.
    """
    # Detach, move to CPU and clone: the in-place de-normalization below must
    # not alter the caller's tensor, and .numpy() needs a CPU tensor that
    # does not require grad.
    img = tensImg.detach().cpu().clone()
    # undo normalize (iterating a tensor yields views, so this edits ``img``)
    for channel, m, s in zip(img, mean, std):
        channel.mul_(s).add_(m)
    img = img * float(255)
    # undo transpose
    return img.numpy().transpose(1, 2, 0).astype(np.uint8)
def initialize_segnet_input(self, imgL_o, intrinsics):
    """Size the segmentation network for one input resolution.

    Computes ``self.max_h`` / ``self.max_w`` as the image size times
    ``self.testres`` rounded up to a multiple of 64, rebuilds the entries of
    ``reg_modules`` and ``warp_modules`` of ``self.segnet.module`` for that
    size, and stores the intrinsics as a list of CUDA tensors in
    ``self.intr_list``.

    Args:
        imgL_o: image array; only its ``shape`` is used.
        intrinsics: 4-tuple unpacked as ``(fl, cx, cy, bl)``.
    """
    def _round_up_to_64(value):
        # Floor to a multiple of 64, then bump by one step if that fell short.
        floored = int(value // 64 * 64)
        return floored + 64 if floored < value else floored

    self.max_h = _round_up_to_64(imgL_o.shape[0] * self.testres)
    self.max_w = _round_up_to_64(imgL_o.shape[1] * self.testres)
    self.input_size = imgL_o.shape

    # modify module according to inputs
    net = self.segnet.module
    for i in range(len(net.reg_modules)):
        stride = 2 ** (6 - i)
        # Settings are copied from the matching ``flow_reg<stride>`` attribute.
        template = getattr(net, 'flow_reg%d' % stride)
        net.reg_modules[i] = flow_reg(
            [1, self.max_w // stride, self.max_h // stride],
            ent=template.ent,
            maxdisp=template.md,
            fac=template.fac).cuda()
    for i in range(len(net.warp_modules)):
        stride = 2 ** (6 - i)
        net.warp_modules[i] = WarpModule(
            [1, self.max_w // stride, self.max_h // stride]).cuda()

    # format intrinsics input
    fl, cx, cy, bl = intrinsics
    fl_next = fl  # assuming focal length remains the same across frames
    entries = [
        [fl], [cx], [cy], [bl], [1], [0], [0], [1], [0], [0],
        [self.input_size[1] / self.max_w],  # delta fx
        [self.input_size[0] / self.max_h],  # delta fy
        [fl_next],
    ]
    self.intr_list = [torch.Tensor(entry).cuda() for entry in entries]


def transform_segnet_input(self, imgL_o, imgR_o):
    """Prepare an image pair as input for the segmentation network.

    Both images are resized to ``(self.max_w, self.max_h)``.

    Returns:
        ``(imgL_noaug, imgLR)``: the resized left image scaled by 1/255 with
        a leading batch axis, and the two processed images (channel order
        reversed, scaled by 1/255, mean-subtracted, channel-first)
        concatenated along dim 0.  Both are CUDA tensors.
    """
    target_size = (self.max_w, self.max_h)
    left = cv2.resize(imgL_o, target_size)
    right = cv2.resize(imgR_o, target_size)

    imgL_noaug = torch.Tensor(left / 255.)[np.newaxis].float().cuda()

    def _to_network_input(img, mean):
        # flip channel, subtract mean
        centered = img[:, :, ::-1].copy() / 255. - np.asarray(mean).mean(0)[np.newaxis, np.newaxis, :]
        batched = np.transpose(centered, [2, 0, 1])[np.newaxis]
        return Variable(torch.FloatTensor(batched).cuda())

    imgLR = torch.cat([_to_network_input(left, self.mean_L),
                       _to_network_input(right, self.mean_R)], 0)
    return imgL_noaug, imgLR
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Return a biased ``Conv2d`` followed by ``LeakyReLU(0.1)``."""
    return nn.Sequential(
        nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
                  padding=padding, dilation=dilation, bias=True),
        nn.LeakyReLU(0.1))


def predict_flow(in_planes):
    """Return the 3x3 convolution that maps features to a 2-channel flow."""
    return nn.Conv2d(in_planes, 2, kernel_size=3, stride=1, padding=1, bias=True)


def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
    """Return a biased transposed convolution (2x upsampling by default)."""
    return nn.ConvTranspose2d(in_planes, out_planes, kernel_size, stride, padding, bias=True)


class PWCDCNet(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    """
    def __init__(self, md=4, flow_norm=20.0):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warping
               flow_norm --- stored on the module as ``self.flow_norm``

        """
        super(PWCDCNet, self).__init__()

        self.flow_norm = flow_norm

        # Feature pyramid: every level halves the resolution (stride-2 conv)
        # and is followed by two stride-1 convs.  Attribute names and creation
        # order match the explicit layout so state_dict keys are unchanged.
        channels = [3, 16, 32, 64, 96, 128]
        for level in range(1, 6):
            cin, cout = channels[level - 1], channels[level]
            setattr(self, 'conv%da' % level, conv(cin, cout, kernel_size=3, stride=2))
            setattr(self, 'conv%daa' % level, conv(cout, cout, kernel_size=3, stride=1))
            setattr(self, 'conv%db' % level, conv(cout, cout, kernel_size=3, stride=1))
        # Level 6 uses a different naming order: 'aa' is the stride-2 conv.
        self.conv6aa = conv(128, 196, kernel_size=3, stride=2)
        self.conv6a = conv(196, 196, kernel_size=3, stride=1)
        self.conv6b = conv(196, 196, kernel_size=3, stride=1)

        self.leakyRELU = nn.LeakyReLU(0.1)

        nd = (2*md+1)**2  # number of correlation channels
        # Cumulative channel growth of the densely connected decoder convs.
        dd = [int(d) for d in np.cumsum([128, 128, 96, 64, 32])]

        # Decoder input = correlation + pyramid features + upsampled flow (2)
        # + upsampled decoder features (2); level 6 only sees the correlation.
        self._add_decoder_level(6, nd, dd)
        self._add_decoder_level(5, nd+128+4, dd)
        self._add_decoder_level(4, nd+96+4, dd)
        self._add_decoder_level(3, nd+64+4, dd)
        od = nd+32+4
        self._add_decoder_level(2, od, dd, upfeat=False)

        # Dilated-convolution context network refining the level-2 flow.
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv7 = predict_flow(32)

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                # kaiming_normal (no underscore) is the deprecated spelling.
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()

    def _add_decoder_level(self, level, od, dd, upfeat=True):
        """Register conv<level>_0..4, predict_flow<level>, deconv<level> and,
        if ``upfeat``, upfeat<level> for a decoder with ``od`` input channels."""
        in_channels = [od] + [od + d for d in dd[:4]]
        out_channels = [128, 128, 96, 64, 32]
        for k, (cin, cout) in enumerate(zip(in_channels, out_channels)):
            setattr(self, 'conv%d_%d' % (level, k), conv(cin, cout, kernel_size=3, stride=1))
        setattr(self, 'predict_flow%d' % level, predict_flow(od + dd[4]))
        setattr(self, 'deconv%d' % level, deconv(2, 2, kernel_size=4, stride=2, padding=1))
        if upfeat:
            setattr(self, 'upfeat%d' % level, deconv(od + dd[4], 2, kernel_size=4, stride=2, padding=1))

    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        Returns the warped tensor with all samples that fall (partly) outside
        the image set to zero.
        """
        B, C, H, W = x.size()
        # mesh grid, created directly on the device of ``x``
        xx = torch.arange(0, W, device=x.device).view(1, -1).repeat(H, 1)
        yy = torch.arange(0, H, device=x.device).view(-1, 1).repeat(1, W)
        xx = xx.view(1, 1, H, W).repeat(B, 1, 1, 1)
        yy = yy.view(1, 1, H, W).repeat(B, 1, 1, 1)
        grid = torch.cat((xx, yy), 1).float()

        vgrid = grid + flo

        # scale grid to [-1,1]
        vgrid[:, 0, :, :] = 2.0*vgrid[:, 0, :, :].clone() / max(W-1, 1) - 1.0
        vgrid[:, 1, :, :] = 2.0*vgrid[:, 1, :, :].clone() / max(H-1, 1) - 1.0

        vgrid = vgrid.permute(0, 2, 3, 1)
        output = nn.functional.grid_sample(x, vgrid, align_corners=True)
        # ones_like keeps the mask on the device of ``x``; an unconditional
        # .cuda() failed on CPU and picked the wrong GPU on multi-GPU setups.
        mask = torch.ones_like(x)
        mask = nn.functional.grid_sample(mask, vgrid, align_corners=True)

        # Binarize: anything that sampled zero padding is marked invalid.
        mask[mask < 0.9999] = 0
        mask[mask > 0] = 1

        return output*mask

    def multi_scale_conv(self, conv0_func, conv1_func, conv2_func, conv3_func, conv4_func, input_feat):
        """Apply five convs with dense connections: each conv's output is
        concatenated in front of its own input along the channel dim."""
        x = torch.cat((conv0_func(input_feat), input_feat), 1)
        x = torch.cat((conv1_func(x), x), 1)
        x = torch.cat((conv2_func(x), x), 1)
        x = torch.cat((conv3_func(x), x), 1)
        x = torch.cat((conv4_func(x), x), 1)
        return x

    def concate_two_layers(self, pred_func, decon_func, upfeat_func, feat_high, feat_low1, feat_low2, scale):
        """Predict flow at the coarser level and build the next decoder input.

        Returns ``(x, flow_high)`` where ``x`` concatenates the correlation of
        ``feat_low1`` with the warped ``feat_low2``, ``feat_low1`` itself, the
        upsampled flow and the upsampled decoder features.
        """
        flow_high = pred_func(feat_high)
        up_flow_high = decon_func(flow_high)
        up_feat_high = upfeat_func(feat_high)

        warp_feat = self.warp(feat_low2, up_flow_high*scale)
        corr_low = FunctionCorrelation(tenFirst=feat_low1, tenSecond=warp_feat)
        corr_low = self.leakyRELU(corr_low)
        x = torch.cat((corr_low, feat_low1, up_flow_high, up_feat_high), 1)

        return x, flow_high

    def forward(self, x):
        """Estimate flow from a 6-channel input (channels 0:3 = first image,
        3:6 = second image).

        Returns the level-2 flow plus the context-network refinement; level 2
        sits after two stride-2 stages, i.e. at 1/4 of the input resolution.
        """
        im1 = x[:, 0:3, ...]
        im2 = x[:, 3:6, ...]

        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))
        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))
        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))
        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))
        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))
        c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))
        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))
        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))
        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))
        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))
        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))
        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))

        corr6 = FunctionCorrelation(tenFirst=c16, tenSecond=c26)
        corr6 = self.leakyRELU(corr6)

        # Coarse-to-fine decoding; the scale factor passed to
        # concate_two_layers doubles with every finer level.
        x = self.multi_scale_conv(self.conv6_0, self.conv6_1, self.conv6_2, self.conv6_3, self.conv6_4, corr6)
        x, flow6 = self.concate_two_layers(self.predict_flow6, self.deconv6, self.upfeat6, x, c15, c25, 0.625)

        x = self.multi_scale_conv(self.conv5_0, self.conv5_1, self.conv5_2, self.conv5_3, self.conv5_4, x)
        x, flow5 = self.concate_two_layers(self.predict_flow5, self.deconv5, self.upfeat5, x, c14, c24, 1.25)

        x = self.multi_scale_conv(self.conv4_0, self.conv4_1, self.conv4_2, self.conv4_3, self.conv4_4, x)
        x, flow4 = self.concate_two_layers(self.predict_flow4, self.deconv4, self.upfeat4, x, c13, c23, 2.5)

        x = self.multi_scale_conv(self.conv3_0, self.conv3_1, self.conv3_2, self.conv3_3, self.conv3_4, x)
        x, flow3 = self.concate_two_layers(self.predict_flow3, self.deconv3, self.upfeat3, x, c12, c22, 5.0)

        x = self.multi_scale_conv(self.conv2_0, self.conv2_1, self.conv2_2, self.conv2_3, self.conv2_4, x)
        flow2 = self.predict_flow2(x)

        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        refine = self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))
        flow2 = flow2 + refine

        return flow2


def pwc_dc_net(path=None):
    """Build a ``PWCDCNet`` and optionally load weights from ``path``.

    The checkpoint may be a plain state dict or a dict that holds it under
    the ``'state_dict'`` key.
    """
    model = PWCDCNet()
    if path is not None:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        # map_location='cpu' lets GPU-saved checkpoints load on CPU-only hosts;
        # the values are copied into the (CPU) model parameters either way.
        data = torch.load(path, map_location='cpu')
        if 'state_dict' in data.keys():
            model.load_state_dict(data['state_dict'])
        else:
            model.load_state_dict(data)
    return model
__syncthreads(); __shared__ float sum[32]; // Compute correlation for (int top_channel = 0; top_channel < SIZE_1(top); top_channel++) { sum[ch_off] = 0; int s2o = top_channel % 9 - 4; int s2p = top_channel / 9 - 4; for (int j = 0; j < 1; j++) { // HEIGHT for (int i = 0; i < 1; i++) { // WIDTH int ji_off = (j + i) * SIZE_3(rbot0); for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS int x2 = x1 + s2o; int y2 = y1 + s2p; int idxPatchData = ji_off + ch; int idx2 = ((item * SIZE_1(rbot0) + y2+j) * SIZE_2(rbot0) + x2+i) * SIZE_3(rbot0) + ch; sum[ch_off] += patch_data[idxPatchData] * rbot1[idx2]; } } } __syncthreads(); if (ch_off == 0) { float total_sum = 0; for (int idx = 0; idx < 32; idx++) { total_sum += sum[idx]; } const int sumelems = SIZE_3(rbot0); const int index = ((top_channel*SIZE_2(top) + blockIdx.y)*SIZE_3(top))+blockIdx.x; top[index + item*SIZE_1(top)*SIZE_2(top)*SIZE_3(top)] = total_sum / (float)sumelems; } } } ''' kernel_Correlation_updateGradFirst = ''' #define ROUND_OFF 50000 extern "C" __global__ void kernel_Correlation_updateGradFirst( const int n, const int intSample, const float* rbot0, const float* rbot1, const float* gradOutput, float* gradFirst, float* gradSecond ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { int n = intIndex % SIZE_1(gradFirst); // channels int l = (intIndex / SIZE_1(gradFirst)) % SIZE_3(gradFirst) + 4; // w-pos int m = (intIndex / SIZE_1(gradFirst) / SIZE_3(gradFirst)) % SIZE_2(gradFirst) + 4; // h-pos // round_off is a trick to enable integer division with ceil, even for negative numbers // We use a large offset, for the inner part not to become negative. 
const int round_off = ROUND_OFF; const int round_off_s1 = round_off; // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: int xmin = (l - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4) int ymin = (m - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4) // Same here: int xmax = (l - 4 + round_off_s1) - round_off; // floor (l - 4) int ymax = (m - 4 + round_off_s1) - round_off; // floor (m - 4) float sum = 0; if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) { xmin = max(0,xmin); xmax = min(SIZE_3(gradOutput)-1,xmax); ymin = max(0,ymin); ymax = min(SIZE_2(gradOutput)-1,ymax); for (int p = -4; p <= 4; p++) { for (int o = -4; o <= 4; o++) { // Get rbot1 data: int s2o = o; int s2p = p; int idxbot1 = ((intSample * SIZE_1(rbot0) + (m+s2p)) * SIZE_2(rbot0) + (l+s2o)) * SIZE_3(rbot0) + n; float bot1tmp = rbot1[idxbot1]; // rbot1[l+s2o,m+s2p,n] // Index offset for gradOutput in following loops: int op = (p+4) * 9 + (o+4); // index[o,p] int idxopoffset = (intSample * SIZE_1(gradOutput) + op); for (int y = ymin; y <= ymax; y++) { for (int x = xmin; x <= xmax; x++) { int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p] sum += gradOutput[idxgradOutput] * bot1tmp; } } } } } const int sumelems = SIZE_1(gradFirst); const int bot0index = ((n * SIZE_2(gradFirst)) + (m-4)) * SIZE_3(gradFirst) + (l-4); gradFirst[bot0index + intSample*SIZE_1(gradFirst)*SIZE_2(gradFirst)*SIZE_3(gradFirst)] = sum / (float)sumelems; } } ''' kernel_Correlation_updateGradSecond = ''' #define ROUND_OFF 50000 extern "C" __global__ void kernel_Correlation_updateGradSecond( const int n, const int intSample, const float* rbot0, const float* rbot1, const float* gradOutput, float* gradFirst, float* gradSecond ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { int n = intIndex % 
SIZE_1(gradSecond); // channels int l = (intIndex / SIZE_1(gradSecond)) % SIZE_3(gradSecond) + 4; // w-pos int m = (intIndex / SIZE_1(gradSecond) / SIZE_3(gradSecond)) % SIZE_2(gradSecond) + 4; // h-pos // round_off is a trick to enable integer division with ceil, even for negative numbers // We use a large offset, for the inner part not to become negative. const int round_off = ROUND_OFF; const int round_off_s1 = round_off; float sum = 0; for (int p = -4; p <= 4; p++) { for (int o = -4; o <= 4; o++) { int s2o = o; int s2p = p; //Get X,Y ranges and clamp // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: int xmin = (l - 4 - s2o + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o) int ymin = (m - 4 - s2p + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o) // Same here: int xmax = (l - 4 - s2o + round_off_s1) - round_off; // floor (l - 4 - s2o) int ymax = (m - 4 - s2p + round_off_s1) - round_off; // floor (m - 4 - s2p) if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) { xmin = max(0,xmin); xmax = min(SIZE_3(gradOutput)-1,xmax); ymin = max(0,ymin); ymax = min(SIZE_2(gradOutput)-1,ymax); // Get rbot0 data: int idxbot0 = ((intSample * SIZE_1(rbot0) + (m-s2p)) * SIZE_2(rbot0) + (l-s2o)) * SIZE_3(rbot0) + n; float bot0tmp = rbot0[idxbot0]; // rbot1[l+s2o,m+s2p,n] // Index offset for gradOutput in following loops: int op = (p+4) * 9 + (o+4); // index[o,p] int idxopoffset = (intSample * SIZE_1(gradOutput) + op); for (int y = ymin; y <= ymax; y++) { for (int x = xmin; x <= xmax; x++) { int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p] sum += gradOutput[idxgradOutput] * bot0tmp; } } } } } const int sumelems = SIZE_1(gradSecond); const int bot1index = ((n * SIZE_2(gradSecond)) + (m-4)) * SIZE_3(gradSecond) + (l-4); gradSecond[bot1index + 
def cupy_kernel(strFunction, objVariables):
    """Specialise one of this module's CUDA kernel templates for concrete tensors.

    The template is looked up by name in this module's globals and two kinds
    of placeholders are expanded textually:

    * ``SIZE_k(name)`` becomes the size of dimension ``k`` of
      ``objVariables[name]``.
    * ``VALUE_k(name, i0, ...)`` becomes ``name[...]`` with a flat offset built
      from the first ``k`` strides of ``objVariables[name]``; ``{`` / ``}``
      inside the index expressions are turned into parentheses.

    Args:
        strFunction: name of the module-level string holding the kernel source.
        objVariables: mapping from placeholder names to objects exposing
            ``size()`` and ``stride()``.

    Returns:
        The kernel source with every placeholder substituted.
    """
    strKernel = globals()[strFunction]

    # Raw strings: '\(' and '\)' are regex escapes, not Python string escapes.
    while True:
        objMatch = re.search(r'(SIZE_)([0-4])(\()([^\)]*)(\))', strKernel)

        if objMatch is None:
            break
        # end

        intArg = int(objMatch.group(2))

        strTensor = objMatch.group(4)
        intSizes = objVariables[strTensor].size()

        # Every identical occurrence is replaced at once, then we search again.
        strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg]))
    # end

    while True:
        objMatch = re.search(r'(VALUE_)([0-4])(\()([^\)]+)(\))', strKernel)

        if objMatch is None:
            break
        # end

        intArgs = int(objMatch.group(2))
        strArgs = objMatch.group(4).split(',')

        strTensor = strArgs[0]
        intStrides = objVariables[strTensor].stride()
        strIndex = [
            '((' + strArgs[intArg + 1].replace('{', '(').replace('}', ')').strip() + ')*' + str(intStrides[intArg]) + ')'
            for intArg in range(intArgs)
        ]

        strKernel = strKernel.replace(objMatch.group(0), strTensor + '[' + '+'.join(strIndex) + ']')
    # end

    return strKernel
# end
rbot0.data_ptr() ] ) n = second.shape[2] * second.shape[3] cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', { 'input': second, 'output': rbot1 }))( grid=tuple([ int((n + 16 - 1) / 16), second.shape[1], second.shape[0] ]), block=tuple([ 16, 1, 1 ]), args=[ n, second.data_ptr(), rbot1.data_ptr() ] ) n = output.shape[1] * output.shape[2] * output.shape[3] cupy_launch('kernel_Correlation_updateOutput', cupy_kernel('kernel_Correlation_updateOutput', { 'rbot0': rbot0, 'rbot1': rbot1, 'top': output }))( grid=tuple([ output.shape[3], output.shape[2], output.shape[0] ]), block=tuple([ 32, 1, 1 ]), shared_mem=first.shape[1] * 4, args=[ n, rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr() ] ) elif first.is_cuda == False: raise NotImplementedError() # end return output # end @staticmethod def backward(self, gradOutput): first, second, rbot0, rbot1 = self.saved_tensors assert(gradOutput.is_contiguous() == True) gradFirst = first.new_zeros([ first.shape[0], first.shape[1], first.shape[2], first.shape[3] ]) if self.needs_input_grad[0] == True else None gradSecond = first.new_zeros([ first.shape[0], first.shape[1], first.shape[2], first.shape[3] ]) if self.needs_input_grad[1] == True else None if first.is_cuda == True: if gradFirst is not None: for intSample in range(first.shape[0]): n = first.shape[1] * first.shape[2] * first.shape[3] cupy_launch('kernel_Correlation_updateGradFirst', cupy_kernel('kernel_Correlation_updateGradFirst', { 'rbot0': rbot0, 'rbot1': rbot1, 'gradOutput': gradOutput, 'gradFirst': gradFirst, 'gradSecond': None }))( grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]), block=tuple([ 512, 1, 1 ]), args=[ n, intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), gradFirst.data_ptr(), None ] ) # end # end if gradSecond is not None: for intSample in range(first.shape[0]): n = first.shape[1] * first.shape[2] * first.shape[3] cupy_launch('kernel_Correlation_updateGradSecond', 
cupy_kernel('kernel_Correlation_updateGradSecond', { 'rbot0': rbot0, 'rbot1': rbot1, 'gradOutput': gradOutput, 'gradFirst': None, 'gradSecond': gradSecond }))( grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]), block=tuple([ 512, 1, 1 ]), args=[ n, intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), None, gradSecond.data_ptr() ] ) # end # end elif first.is_cuda == False: raise NotImplementedError() # end return gradFirst, gradSecond # end # end def FunctionCorrelation(tenFirst, tenSecond): return _FunctionCorrelation.apply(tenFirst, tenSecond) # end class ModuleCorrelation(torch.nn.Module): def __init__(self): super(ModuleCorrelation, self).__init__() # end def forward(self, tenFirst, tenSecond): return _FunctionCorrelation.apply(tenFirst, tenSecond) # end # end ================================================ FILE: Network/VOFlowNet.py ================================================ # Software License Agreement (BSD License) # # Copyright (c) 2020, Wenshan Wang, CMU # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of CMU nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
def conv(in_planes, out_planes, kernel_size=3, stride=2, padding=1, dilation=1, bn_layer=False, bias=True):
    """Build a Conv2d -> (optional BatchNorm2d) -> ReLU stack.

    ``bias`` is forwarded to the convolution in both variants (previously it
    was silently dropped when ``bn_layer`` was False).
    """
    layers = [
        nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, padding=padding,
                  stride=stride, dilation=dilation, bias=bias)
    ]
    if bn_layer:
        layers.append(nn.BatchNorm2d(out_planes))
    layers.append(nn.ReLU(inplace=True))
    return nn.Sequential(*layers)


def linear(in_planes, out_planes):
    """Build a Linear -> ReLU stack."""
    return nn.Sequential(
        nn.Linear(in_planes, out_planes),
        nn.ReLU(inplace=True)
    )


class BasicBlock(nn.Module):
    """Two-convolution residual block with an optional projection shortcut."""

    expansion = 1

    def __init__(self, inplanes, planes, stride, downsample, pad, dilation):
        super(BasicBlock, self).__init__()

        self.conv1 = conv(inplanes, planes, 3, stride, pad, dilation)
        self.conv2 = nn.Conv2d(planes, planes, 3, 1, pad, dilation)

        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)

        # Project the shortcut when the main path changed resolution/width.
        if self.downsample is not None:
            x = self.downsample(x)
        out += x

        return F.relu(out, inplace=True)


class VOFlowRes(nn.Module):
    """Residual pose regressor.

    Takes a 5-channel input and returns a (batch, 6) tensor: the 3 outputs of
    the translation head followed by the 3 outputs of the rotation head. The
    fully connected heads expect the last feature map to hold 6 spatial
    positions (``fcnum = 256 * 6``).
    """

    def __init__(self):
        super(VOFlowRes, self).__init__()
        inputnum = 5
        blocknums = [2, 2, 3, 4, 6, 7, 3]
        outputnums = [32, 64, 64, 128, 128, 256, 256]

        self.firstconv = nn.Sequential(conv(inputnum, 32, 3, 2, 1, 1, False),
                                       conv(32, 32, 3, 1, 1, 1),
                                       conv(32, 32, 3, 1, 1, 1))

        self.inplanes = 32

        self.layer1 = self._make_layer(BasicBlock, outputnums[2], blocknums[2], 2, 1, 1)  # 40 x 28
        self.layer2 = self._make_layer(BasicBlock, outputnums[3], blocknums[3], 2, 1, 1)  # 20 x 14
        self.layer3 = self._make_layer(BasicBlock, outputnums[4], blocknums[4], 2, 1, 1)  # 10 x 7
        self.layer4 = self._make_layer(BasicBlock, outputnums[5], blocknums[5], 2, 1, 1)  # 5 x 4
        self.layer5 = self._make_layer(BasicBlock, outputnums[6], blocknums[6], 2, 1, 1)  # 3 x 2
        fcnum = outputnums[6] * 6

        fc1_trans = linear(fcnum, 128)
        fc2_trans = linear(128, 32)
        fc3_trans = nn.Linear(32, 3)

        fc1_rot = linear(fcnum, 128)
        fc2_rot = linear(128, 32)
        fc3_rot = nn.Linear(32, 3)

        self.voflow_trans = nn.Sequential(fc1_trans, fc2_trans, fc3_trans)
        self.voflow_rot = nn.Sequential(fc1_rot, fc2_rot, fc3_rot)

    def _make_layer(self, block, planes, blocks, stride, pad, dilation):
        """Stack ``blocks`` residual blocks; only the first one may downsample."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Conv2d(self.inplanes, planes * block.expansion,
                                   kernel_size=1, stride=stride)

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, pad, dilation))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, 1, None, pad, dilation))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.firstconv(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)

        # Flatten the feature map for the fully connected heads.
        x = x.view(x.shape[0], -1)
        x_trans = self.voflow_trans(x)
        x_rot = self.voflow_rot(x)
        return torch.cat((x_trans, x_rot), dim=1)
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of CMU nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
class VONet(nn.Module):
    """Flow network followed by a flow-to-pose network."""

    def __init__(self):
        super(VONet, self).__init__()

        self.flowNet = FlowNet()
        self.flowPoseNet = FlowPoseNet()

    def forward(self, x, only_flow=False, only_pose=False):
        '''
        x[0]: rgb frame t-1
        x[1]: rgb frame t
        x[2]: intrinsics
        x[3]: flow t-1 -> t (optional)
        x[4]: motion segmentation mask

        Returns (flow_out, pose). With only_flow=True the pose is None; with
        only_pose=True the flow network is skipped, flow_out is None and x[3]
        must hold the flow.

        NOTE: the flow tensor fed to the pose network (x[3] when given,
        otherwise the first flow network output) is zeroed IN PLACE wherever
        x[4] > 0.
        '''
        flow = None
        if not only_pose:
            flow_out = self.flowNet(torch.cat((x[0], x[1]), dim=1))

            if only_flow:
                return flow_out, None

            flow = flow_out[0]

        else:
            assert(len(x) > 3)
            flow_out = None

        if len(x) > 3 and x[3] is not None:
            flow_input = x[3]
        elif flow is not None:
            flow_input = flow
        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError('only_pose=True requires the flow to be given in x[3]')

        # Mask out input flow using the segmentation result
        assert(len(x) > 4)
        mask = torch.gt(x[4], 0)
        # The single-channel mask broadcasts over both flow channels.
        flow_input.masked_fill_(mask, 0)

        flow_input = torch.cat((flow_input, 1 - x[4]), dim=1)  # segmentation layer
        flow_input = torch.cat((flow_input, x[2]), dim=1)  # intrinsics layer

        pose = self.flowPoseNet(flow_input)

        return flow_out, pose
class flow_reg(nn.Module):
    """
    Soft winner-take-all that selects the most likely displacement.
    Set ent=True to enable entropy output.
    Set maxdisp to adjust maximum allowed displacement towards one side.
        maxdisp=4 searches for a 9x9 region.
    Set fac to squeeze search window.
        maxdisp=4 and fac=2 gives search window of 9x5
    """
    def __init__(self, size, ent=False, maxdisp = int(4), fac=1):
        B,W,H = size
        super(flow_reg, self).__init__()
        self.ent = ent
        self.md = maxdisp
        self.fac = fac
        self.truncated = True
        self.wsize = 3  # by default using truncation 7x7

        flowrangey = range(-maxdisp,maxdisp+1)
        flowrangex = range(-int(maxdisp//self.fac),int(maxdisp//self.fac)+1)
        meshgrid = np.meshgrid(flowrangex,flowrangey)
        flowy = np.tile( np.reshape(meshgrid[0],[1,2*maxdisp+1,2*int(maxdisp//self.fac)+1,1,1]), (B,1,1,H,W) )
        flowx = np.tile( np.reshape(meshgrid[1],[1,2*maxdisp+1,2*int(maxdisp//self.fac)+1,1,1]), (B,1,1,H,W) )
        self.register_buffer('flowx',torch.Tensor(flowx))
        self.register_buffer('flowy',torch.Tensor(flowy))

        self.pool3d = nn.MaxPool3d((self.wsize*2+1,self.wsize*2+1,1),stride=1,padding=(self.wsize,self.wsize,0))

    def forward(self, x):
        """Turn a (b, u, v, h, w) cost volume into an expected displacement.

        Returns ``(flow, entropy)`` where ``flow`` is (b, 2, h, w) and
        ``entropy`` is (b, 2, h, w) holding local and global entropy when
        ``self.ent`` is set, otherwise ``None``.
        """
        b,u,v,h,w = x.shape
        oldx = x

        if self.truncated:
            # truncated softmax: keep only a window around the per-pixel argmax
            x = x.view(b,u*v,h,w)

            idx = x.argmax(1)[:,np.newaxis]
            # Same dtypes as before (half on CUDA, float on CPU), but allocated
            # on the input's device rather than the current default device.
            mask_dtype = torch.half if x.is_cuda else torch.float
            mask = torch.zeros((b,u*v,h,w), dtype=mask_dtype, device=x.device)
            mask.scatter_(1,idx,1)
            mask = mask.view(b,1,u,v,-1)
            # Max-pooling the one-hot argmax dilates it into the window.
            mask = self.pool3d(mask)[:,0].view(b,u,v,h,w)

            ninf = x.clone().fill_(-np.inf).view(b,u,v,h,w)
            # torch.where expects a boolean condition; uint8 is deprecated.
            x = torch.where(mask.bool(),oldx,ninf)
        else:
            self.wsize = (np.sqrt(u*v)-1)/2

        b,u,v,h,w = x.shape
        x = F.softmax(x.view(b,-1,h,w),1).view(b,u,v,h,w)
        if torch.isnan(x).any():
            # The boolean index yields a 1-D tensor, so dim=0 is what the
            # implicit-dim softmax used here before.
            x[torch.isnan(x)] = F.softmax(oldx[torch.isnan(x)], dim=0)
        outx = torch.sum(torch.sum(x*self.flowx,1),1,keepdim=True)
        outy = torch.sum(torch.sum(x*self.flowy,1),1,keepdim=True)

        if self.ent:
            # local
            local_entropy = (-x*torch.clamp(x,1e-9,1-1e-9).log()).sum(1).sum(1)[:,np.newaxis]
            if self.wsize == 0:
                local_entropy[:] = 1.
            else:
                local_entropy /= np.log((self.wsize*2+1)**2)

            # global
            x = F.softmax(oldx.view(b,-1,h,w),1).view(b,u,v,h,w)
            global_entropy = (-x*torch.clamp(x,1e-9,1-1e-9).log()).sum(1).sum(1)[:,np.newaxis]
            global_entropy /= np.log(x.shape[1]*x.shape[2])
            return torch.cat([outx,outy],1),torch.cat([local_entropy, global_entropy],1)
        else:
            return torch.cat([outx,outy],1),None
class WarpModule(nn.Module):
    """
    Backward-warping layer,
    taken from https://github.com/NVlabs/PWC-Net/blob/master/PyTorch/models/PWCNet.py
    """
    def __init__(self, size):
        super(WarpModule, self).__init__()
        batch, width, height = size
        # Pixel coordinate grid: channel 0 holds column (x) indices, channel 1 row (y) indices.
        col_idx = torch.arange(0, width).unsqueeze(0).expand(height, width)
        row_idx = torch.arange(0, height).unsqueeze(1).expand(height, width)
        base = torch.stack((col_idx, row_idx), dim=0).unsqueeze(0)
        self.register_buffer('grid', base.repeat(batch, 1, 1, 1).float())

    def forward(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow
        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow
        returns the warped tensor (zeroed outside the valid region) and the
        [B, H, W] validity mask
        """
        height, width = x.size()[2], x.size()[3]
        coords = self.grid + flo

        # scale sampling positions to [-1,1]
        norm_x = 2.0 * coords[:, 0, :, :] / max(width - 1, 1) - 1.0
        norm_y = 2.0 * coords[:, 1, :, :] / max(height - 1, 1) - 1.0
        sample_grid = torch.stack((norm_x, norm_y), dim=3)

        warped = nn.functional.grid_sample(x, sample_grid, align_corners=True)

        # Valid only strictly inside the normalized image square.
        inside_x = sample_grid[:, :, :, 0].abs() < 1
        inside_y = sample_grid[:, :, :, 1].abs() < 1
        valid = (inside_x * inside_y) > 0
        return warped * valid.unsqueeze(1).float(), valid
= 64 # 3/2 fdimb1 = 16 # 6/5/4/3 fdimb2 = 12 # 2 full=False self.f6 = butterfly4D(fdima1, fdimb1,withbn=withbn,full=full) self.p6 = sepConv4d(fdimb1,fdimb1, with_bn=False, full=full) self.f5 = butterfly4D(fdima1, fdimb1,withbn=withbn, full=full) self.p5 = sepConv4d(fdimb1,fdimb1, with_bn=False,full=full) self.f4 = butterfly4D(fdima1, fdimb1,withbn=withbn,full=full) self.p4 = sepConv4d(fdimb1,fdimb1, with_bn=False,full=full) self.f3 = butterfly4D(fdima2, fdimb1,withbn=withbn,full=full) self.p3 = sepConv4d(fdimb1,fdimb1, with_bn=False,full=full) full=True self.f2 = butterfly4D(fdima2, fdimb2,withbn=withbn,full=full) self.p2 = sepConv4d(fdimb2,fdimb2, with_bn=False,full=full) self.flow_reg64 = flow_reg([fdimb1*size[0],size[1]//64,size[2]//64], ent=use_entropy, maxdisp=self.md[0], fac=self.fac) self.flow_reg32 = flow_reg([fdimb1*size[0],size[1]//32,size[2]//32], ent=use_entropy, maxdisp=self.md[1]) self.flow_reg16 = flow_reg([fdimb1*size[0],size[1]//16,size[2]//16], ent=use_entropy, maxdisp=self.md[2]) self.flow_reg8 = flow_reg([fdimb1*size[0],size[1]//8,size[2]//8] , ent=use_entropy, maxdisp=self.md[3]) self.flow_reg4 = flow_reg([fdimb2*size[0],size[1]//4,size[2]//4] , ent=use_entropy, maxdisp=self.md[4]) self.warp5 = WarpModule([size[0],size[1]//32,size[2]//32]) self.warp4 = WarpModule([size[0],size[1]//16,size[2]//16]) self.warp3 = WarpModule([size[0],size[1]//8,size[2]//8]) self.warp2 = WarpModule([size[0],size[1]//4,size[2]//4]) if self.training: self.warpx = WarpModule([size[0],size[1],size[2]]) ## hypotheses fusion modules, adopted from the refinement module of PWCNet # https://github.com/NVlabs/PWC-Net/blob/master/PyTorch/models/PWCNet.py # c6 self.dc6_conv1 = conv(128+4*fdimb1, 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc6_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc6_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc6_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, 
dilation=8) self.dc6_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc6_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) self.dc6_conv7 = nn.Conv2d(32,2*fdimb1,kernel_size=3,stride=1,padding=1,bias=True) # c5 self.dc5_conv1 = conv(128+4*fdimb1*2, 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc5_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc5_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc5_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8) self.dc5_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc5_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) self.dc5_conv7 = nn.Conv2d(32,2*fdimb1*2,kernel_size=3,stride=1,padding=1,bias=True) # c4 self.dc4_conv1 = conv(128+4*fdimb1*3, 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc4_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc4_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc4_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8) self.dc4_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc4_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) self.dc4_conv7 = nn.Conv2d(32,2*fdimb1*3,kernel_size=3,stride=1,padding=1,bias=True) # c3 self.dc3_conv1 = conv(64+16*fdimb1, 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc3_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc3_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc3_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8) self.dc3_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc3_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) self.dc3_conv7 = nn.Conv2d(32,8*fdimb1,kernel_size=3,stride=1,padding=1,bias=True) # c2 
self.dc2_conv1 = conv(64+16*fdimb1+4*fdimb2, 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc2_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc2_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc2_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8) self.dc2_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc2_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) self.dc2_conv7 = nn.Conv2d(32,4*2*fdimb1 + 2*fdimb2,kernel_size=3,stride=1,padding=1,bias=True) self.dc6_conv = nn.Sequential( self.dc6_conv1, self.dc6_conv2, self.dc6_conv3, self.dc6_conv4, self.dc6_conv5, self.dc6_conv6, self.dc6_conv7) self.dc5_conv = nn.Sequential( self.dc5_conv1, self.dc5_conv2, self.dc5_conv3, self.dc5_conv4, self.dc5_conv5, self.dc5_conv6, self.dc5_conv7) self.dc4_conv = nn.Sequential( self.dc4_conv1, self.dc4_conv2, self.dc4_conv3, self.dc4_conv4, self.dc4_conv5, self.dc4_conv6, self.dc4_conv7) self.dc3_conv = nn.Sequential( self.dc3_conv1, self.dc3_conv2, self.dc3_conv3, self.dc3_conv4, self.dc3_conv5, self.dc3_conv6, self.dc3_conv7) self.dc2_conv = nn.Sequential( self.dc2_conv1, self.dc2_conv2, self.dc2_conv3, self.dc2_conv4, self.dc2_conv5, self.dc2_conv6, self.dc2_conv7) ## Out-of-range detection self.dc6_convo = nn.Sequential(conv(128+4*fdimb1, 128, kernel_size=3, stride=1, padding=1, dilation=1), conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2), conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4), conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8), conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16), conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1), nn.Conv2d(32,1,kernel_size=3,stride=1,padding=1,bias=True)) self.dc5_convo = nn.Sequential(conv(128+2*4*fdimb1, 128, kernel_size=3, stride=1, padding=1, dilation=1), conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2), conv(128, 128, 
kernel_size=3, stride=1, padding=4, dilation=4), conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8), conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16), conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1), nn.Conv2d(32,1,kernel_size=3,stride=1,padding=1,bias=True)) self.dc4_convo = nn.Sequential(conv(128+3*4*fdimb1, 128, kernel_size=3, stride=1, padding=1, dilation=1), conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2), conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4), conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8), conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16), conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1), nn.Conv2d(32,1,kernel_size=3,stride=1,padding=1,bias=True)) self.dc3_convo = nn.Sequential(conv(64+16*fdimb1, 128, kernel_size=3, stride=1, padding=1, dilation=1), conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2), conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4), conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8), conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16), conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1), nn.Conv2d(32,1,kernel_size=3,stride=1,padding=1,bias=True)) self.dc2_convo = nn.Sequential(conv(64+16*fdimb1+4*fdimb2, 128, kernel_size=3, stride=1, padding=1, dilation=1), conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2), conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4), conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8), conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16), conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1), nn.Conv2d(32,1,kernel_size=3,stride=1,padding=1,bias=True)) # affine-exp self.f3d2v1 = conv(64, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.f3d2v2 = conv(1, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.f3d2v3 = conv(1, 32, kernel_size=3, stride=1, padding=1,dilation=1) # 
self.f3d2v4 = conv(1, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.f3d2v5 = conv(64, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.f3d2v6 = conv(12*81, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.f3d2 = bfmodule(128-64,1) # depth change net self.dcnetv1 = conv(64, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.dcnetv2 = conv(1, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.dcnetv3 = conv(1, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.dcnetv4 = conv(1, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.dcnetv5 = conv(12*81, 32, kernel_size=3, stride=1, padding=1,dilation=1) # self.dcnetv6 = conv(4, 32, kernel_size=3, stride=1, padding=1,dilation=1) # if exp_unc: self.dcnet = bfmodule(128,2) else: self.dcnet = bfmodule(128,1) # moseg net self.fgnetv1 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv2 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv3 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv4 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv5 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv6 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv7 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv8 = conv(1, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv9 = conv(3, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnetv10 = conv(3, 16, kernel_size=3, stride=1, padding=1,dilation=1) # self.fgnet = bfmodule_feat(208-3*16,7) #from midas.midas_net import MidasNet #self.midas = MidasNet('/data/gengshay/midas.pt', non_negative=True) self.midas = torch.hub.load("intel-isl/MiDaS", "MiDaS") # detection branch self.det = create_model('dla_34', {'hm': 2, 'wh': 36}, 256,num_input=14) for m in self.modules(): if isinstance(m, nn.Conv3d): n = m.kernel_size[0] * m.kernel_size[1]*m.kernel_size[2] * 
def corrf(self, refimg_fea, targetimg_fea, maxdisp, fac=1):
    """Build a 4D correlation volume between two feature maps.

    For every displacement (u, v) in a window around each pixel, the
    reference feature is multiplied channel-wise with the displaced target
    feature; a leaky ReLU (slope 0.1) is applied to the result.

    Args:
        refimg_fea: reference features, indexed as (b, c, h, w).
        targetimg_fea: target features with the same layout.
        maxdisp: displacement radius along the width axis.
        fac: divisor applied to ``maxdisp`` for the height axis.

    Returns:
        Tensor of shape (b, c, U, V, h, w) where U indexes the horizontal
        and V the vertical displacement.
    """
    if self.training:
        # Fast path: unfold gathers all displaced target patches at once
        # (zero padded), then a broadcast multiply builds the volume.
        # NOTE(review): ``2*int(maxdisp)//fac`` parses as ``(2*maxdisp)//fac``
        # while the eval path uses ``2*int(maxdisp//fac)``; the two agree only
        # when ``maxdisp`` is divisible by ``fac`` -- confirm intent.
        b, c, h, w = refimg_fea.shape
        targetimg_fea = F.unfold(
            targetimg_fea,
            (2*int(maxdisp)//fac+1, 2*maxdisp+1),
            padding=(int(maxdisp)//fac, maxdisp),
        ).view(b, c, 2*int(maxdisp)//fac+1, 2*maxdisp+1, h, w).permute(0, 1, 3, 2, 4, 5).contiguous()
        cost = refimg_fea.view(b, c, h, w)[:, :, np.newaxis, np.newaxis]*targetimg_fea
        cost = F.leaky_relu(cost, 0.1, inplace=True)
    else:
        # Slow path: explicit loop over displacements.
        b, c, height, width = refimg_fea.shape
        # Allocate on the same device and with the same dtype as the input
        # (the legacy typed-tensor constructors always used float32 on the
        # *current* CUDA device, which is wrong for multi-GPU / fp16 / fp64).
        cost = refimg_fea.new_zeros(
            b, c, 2*maxdisp+1, 2*int(maxdisp//fac)+1, height, width)  # b,c,u,v,h,w
        for i in range(2*maxdisp+1):
            ind = i-maxdisp
            for j in range(2*int(maxdisp//fac)+1):
                indd = j-int(maxdisp//fac)
                # Overlapping region of the reference and the shifted target.
                feata = refimg_fea[:, :, max(0, -indd):height-indd, max(0, -ind):width-ind]
                featb = targetimg_fea[:, :, max(0, +indd):height+indd, max(0, ind):width+ind]
                diff = (feata*featb)
                cost[:, :, i, j, max(0, -indd):height-indd, max(0, -ind):width-ind] = diff  # standard
        cost = F.leaky_relu(cost, 0.1, inplace=True)
    return cost
def affine(self, pref, flow, pw=1):
    """Fit a local 2x2 linear map to the flow around every pixel.

    For each pixel, the offsets of the points in a (2*pw+1)^2 window relative
    to the window centre are collected before (``pref``) and after
    (``pref + flow``) the motion, and the least-squares 2x2 matrix mapping the
    former to the latter is solved in closed form.

    Args:
        pref: reference coordinates, indexed as (b, 2, h, w).
        flow: flow field with the same layout.
        pw: window radius. Previously this argument was silently overwritten
            with 1 inside the function; it is now honoured (default unchanged).

    Returns:
        exp: (b, 1, h, w) square root of |det(A)|, clamped to [0.5, 2] and
            reset to 1 wherever the fitting error exceeds 0.1.
        Error: (b, 1, h, w) mean L2 residual of the fit over the window.
        mask: (b, h, w) boolean, True where 0.5 < exp < 2 and Error < 0.1.
    """
    b, _, lh, lw = flow.shape
    ksize = pw*2+1
    npts = ksize**2
    ptar = pref + flow
    # Window offsets relative to the centre pixel: b, 2, npts, h, w
    pref = F.unfold(pref, (ksize, ksize), padding=(pw)).view(b, 2, npts, lh, lw)-pref[:, :, np.newaxis]
    ptar = F.unfold(ptar, (ksize, ksize), padding=(pw)).view(b, 2, npts, lh, lw)-ptar[:, :, np.newaxis]
    # One (2, npts) system per pixel.
    pref = pref.permute(0, 3, 4, 1, 2).reshape(b*lh*lw, 2, npts)
    ptar = ptar.permute(0, 3, 4, 1, 2).reshape(b*lh*lw, 2, npts)

    # Closed-form inverse of the 2x2 normal matrix pref @ pref^T.
    prefprefT = pref.matmul(pref.permute(0, 2, 1))
    ppdet = prefprefT[:, 0, 0]*prefprefT[:, 1, 1]-prefprefT[:, 1, 0]*prefprefT[:, 0, 1]
    ppinv = torch.cat((prefprefT[:, 1, 1:], -prefprefT[:, 0, 1:],
                       -prefprefT[:, 1:, 0], prefprefT[:, 0:1, 0]), 1).view(-1, 2, 2) \
        / ppdet.clamp(1e-10, np.inf)[:, np.newaxis, np.newaxis]

    Affine = ptar.matmul(pref.permute(0, 2, 1)).matmul(ppinv)
    Error = (Affine.matmul(pref)-ptar).norm(2, 1).mean(1).view(b, 1, lh, lw)

    # Area change of the fitted map; its square root is the returned scale.
    Avol = (Affine[:, 0, 0]*Affine[:, 1, 1]-Affine[:, 1, 0]*Affine[:, 0, 1]) \
        .view(b, 1, lh, lw).abs().clamp(1e-10, np.inf)
    exp = Avol.sqrt()
    mask = (exp > 0.5) & (exp < 2) & (Error < 0.1)
    mask = mask[:, 0]

    exp = exp.clamp(0.5, 2)
    exp[Error > 0.1] = 1
    return exp, Error, mask
oor3 = self.cost_matching(up_flow4, c13, c23, flow4h, ent4h,level=3) ## matching 2 up_flow3 = F.upsample(flow3, [im.size()[2]//4,im.size()[3]//4], mode='bilinear')*2 flow2, flow2h, ent2h, oor2 = self.cost_matching(up_flow3, c12, c22, flow3h, ent3h,level=4) ### optical expansion b,_,h,w = flow2.shape exp2,err2,_ = self.affine(get_grid(b,h,w)[:,0].permute(0,3,1,2).repeat(b,1,1,1).clone(), flow2.detach(),pw=1) x = torch.cat(( self.f3d2v2(-exp2.log()), self.f3d2v3(err2), ),1) dchange2 = -exp2.log()+1./200*self.f3d2(x)[0] # depth change net iexp2 = F.upsample(dchange2.clone(), [im.size()[2],im.size()[3]], mode='bilinear') x = torch.cat((self.dcnetv1(c12.detach()), self.dcnetv2(dchange2.detach()), self.dcnetv3(-exp2.log()), self.dcnetv4(err2), ),1) dcneto = 1./200*self.dcnet(x)[0] dchange2 = dchange2.detach() + dcneto[:,:1] dchange2 = F.upsample(dchange2, [im.size()[2],im.size()[3]], mode='bilinear') if dcneto.shape[1]>1: dc_unc = dcneto[:,1:2] else: dc_unc = torch.zeros_like(dcneto) dc_unc = F.upsample(dc_unc, [im.size()[2],im.size()[3]], mode='bilinear')[:,0] return flow2, oor2, dchange2, dc_unc def forward(self,im,disc_aux=None,flowdc=None): bs = im.shape[0]//2 flow2, oor2, dchange2, dc_unc = flowdc ### rigid motion segmentation ## pre-processing Kinv, Kinv_n = get_intrinsics(disc_aux[0], noise=False) # get full res flow/expansion inputs H,W = im.size()[2:4] flow = 4*F.upsample(flow2, [H,W], mode='bilinear').detach() oor2 = F.upsample(oor2[:,np.newaxis], [H,W], mode='bilinear').detach()[:,0] tau = (-dchange2[:,0]).exp().detach() # use different number of correspondences for bg, obj segmentation and pose fscale=128./H; fscalex=32./H;fscaled=448./H hp0o = torch.cat( [torch.arange(0, W,out=torch.cuda.FloatTensor()).view(1,-1).repeat(H,1)[np.newaxis], # 1,2,H,W torch.arange(0, H,out=torch.cuda.FloatTensor()).view(-1,1).repeat(1,W)[np.newaxis]], 0)[np.newaxis] hp1o = hp0o + flow # b,2,H,W # to deal with input resizing (TODO: move it inside intrinsics) hp0o[:,0] *= 
disc_aux[0][10] hp0o[:,1] *= disc_aux[0][11] hp1o[:,0] *= disc_aux[0][10] hp1o[:,1] *= disc_aux[0][11] # sample correspondence for object segmentation (fscaled) hp0d = F.interpolate(hp0o,scale_factor=fscaled,mode='nearest') hp1d = F.interpolate(hp1o,scale_factor=fscaled,mode='nearest') _,_,hd,wd=hp0d.shape hp0d = hp0d.view(1,2,-1).permute(0,2,1) hp1d = hp1d.view(bs,2,-1).permute(0,2,1) hp0d = torch.cat((hp0d,torch.ones(1,hp0d.shape[1],1).cuda()),-1) hp1d = torch.cat((hp1d,torch.ones(bs,hp0d.shape[1],1).cuda()),-1) uncd = torch.cat((F.interpolate(oor2[:,np.newaxis],scale_factor=fscaled,mode='nearest'), F.interpolate(dc_unc[:,np.newaxis].detach(),scale_factor=fscaled,mode='nearest')),1) taud = F.interpolate(tau[:,np.newaxis],scale_factor=fscaled,mode='nearest').view(bs,1,-1) # sample correspondence for fg/bg seg (fscale) hp0 = F.interpolate(hp0o,scale_factor=fscale,mode='nearest') hp1 = F.interpolate(hp1o,scale_factor=fscale,mode='nearest') _,_,h,w=hp0.shape hp0 = hp0.view(1,2,-1).permute(0,2,1) hp1 = hp1.view(bs,2,-1).permute(0,2,1) hp0 = torch.cat((hp0,torch.ones(1,hp0.shape[1],1).cuda()),-1) hp1 = torch.cat((hp1,torch.ones(bs,hp0.shape[1],1).cuda()),-1) unc = torch.cat((F.interpolate(oor2[:,np.newaxis],scale_factor=fscale,mode='nearest'), F.interpolate(dc_unc[:,np.newaxis].detach(),scale_factor=fscale,mode='nearest')),1) tau = F.interpolate(tau[:,np.newaxis],scale_factor=fscale,mode='nearest').view(bs,1,-1) # sample correspondence for pose estimation (fscalex) hp0x = F.interpolate(hp0o,scale_factor=fscalex,mode='nearest') hp1x = F.interpolate(hp1o,scale_factor=fscalex,mode='nearest') hp0x = hp0x.view(1,2,-1).permute(0,2,1) hp1x = hp1x.view(bs,2,-1).permute(0,2,1) hp0x = torch.cat((hp0x,torch.ones(1,hp0x.shape[1],1).cuda()),-1) hp1x = torch.cat((hp1x,torch.ones(bs,hp0x.shape[1],1).cuda()),-1) ## camera pose estimation # using input pose from VONet rot = torch.from_numpy(cv2.Rodrigues(disc_aux[2][:,:3])[0][:,0].astype(np.float32)).unsqueeze(0) trans = 
torch.from_numpy(disc_aux[2][:,3:].astype(np.float32)).squeeze().unsqueeze(0) trans = trans / trans.norm(2,1)[:,np.newaxis] rot = rot.cuda().detach() trans = trans.cuda().detach() Ex = get_skew_mat(trans.cpu(), rot.cpu()) ## fg/bg segmentation # rigidity cost maps mcost00, mcost01, mcost1, mcost2, mcost3, mcost4, p3dmag,_ = compute_geo_costs(rot, trans, Ex, Kinv, hp0, hp1, tau, Kinv_n = Kinv_n) # depth contrast cost with torch.no_grad(): self.midas.eval() input_im = (disc_aux[1].permute(0,3,1,2) -\ torch.Tensor([0.485, 0.456, 0.406]).cuda()[np.newaxis,:,np.newaxis,np.newaxis]) /\ torch.Tensor([0.229, 0.224, 0.225]).cuda()[np.newaxis,:,np.newaxis,np.newaxis] wsize = int((input_im.shape[3] * 448./input_im.shape[2])//32*32) input_im = F.interpolate(input_im, (448, wsize), mode='bilinear') dispo = self.midas.forward(input_im)[None].clamp(1e-6,np.inf) disp = F.interpolate(dispo, [h,w], mode='bilinear') med_dgt = torch.median(disp.view(bs,-1),dim=-1)[0] med_dp3d = torch.median(p3dmag.view(bs,-1),dim=-1)[0] med_ratio = (med_dgt/med_dp3d)[:,np.newaxis,np.newaxis,np.newaxis] # disp[disp == float('inf')] = p3dmag.view(bs,1,h,w)[disp == float('inf')] * med_ratio log_dratio = ( med_ratio * p3dmag.view(bs,1,h,w) / disp.view(bs,1,h,w) ).log() #pdb.set_trace() # pseudo 3D point compute depth = (1./ disp).view(bs,1,-1) depth = depth.clamp(depth.median()/10, depth.median()*10) p03d = depth * Kinv.matmul(hp0.permute(0,2,1)) p13d = depth/tau*Kinv_n.matmul(hp1.permute(0,2,1)) p13d = kornia.angle_axis_to_rotation_matrix(rot).matmul(p13d) # remove rotation pts = torch.cat([p03d, p13d],-1) # bs, 3, 2*N # normalize it for i in range(bs): pts[i] = pts[i] - pts[i].mean(-1,keepdims=True) # zero mean pts[i] = pts[i] / pts[i].flatten().std() # unit std p03d = pts[:,:,:p03d.shape[-1]] p13d = pts[:,:,p03d.shape[-1]:] # fg/bg segmentation network # the constants are empirical values multiplied to cost maps to # ensure they have similar scales costs = torch.cat(( self.fgnetv1( 
def conv4d(data, filters, bias=None, permute_filters=True, use_half=False):
    """Apply a 4D convolution by stacking the results of 3D convolutions.

    This is very slow. Taken from https://github.com/ignacio-rocco/ncnet

    Args:
        data: input indexed as (b, c, h, w, d, t).
        filters: weights; when ``permute_filters`` is True they are permuted
            with (2, 0, 1, 3, 4, 5) so that the looped kernel dimension comes
            first, otherwise they are expected to be in that order already.
        bias: optional bias, added once (in the centre-slice convolution).
        permute_filters: whether to permute ``filters`` here.
        use_half: if True the result is returned as a half-precision tensor.

    Returns:
        Tensor indexed as (b, c_out, h, w, d, t); zero padding of
        ``kernel // 2`` is applied in every convolved dimension.
    """
    b, c, h, w, d, t = data.size()

    # Put the looped dimension first so slices are contiguous inside the loop.
    data = data.permute(2, 0, 1, 3, 4, 5).contiguous()
    if permute_filters:
        filters = filters.permute(2, 0, 1, 3, 4, 5).contiguous()

    c_out = filters.size(1)
    padding = filters.size(0)//2

    # Zero-pad the looped dimension on the input's own device/dtype.
    Z = data.new_zeros(padding, b, c, w, d, t)
    data_padded = torch.cat((Z, data, Z), 0)

    # Build each output slice functionally and stack them. Writing in place
    # into a pre-allocated leaf tensor that requires grad is rejected by
    # autograd, which broke CPU runs whenever ``data.requires_grad`` was True.
    slices = []
    for i in range(h):  # loop on first feature dimension
        # convolve with center channel of filter (at position=padding)
        acc = F.conv3d(data_padded[i+padding], filters[padding],
                       bias=bias, stride=1, padding=padding)
        # convolve with upper/lower channels of filter
        for p in range(1, padding+1):
            acc = acc + F.conv3d(data_padded[i+padding-p], filters[padding-p],
                                 bias=None, stride=1, padding=padding)
            acc = acc + F.conv3d(data_padded[i+padding+p], filters[padding+p],
                                 bias=None, stride=1, padding=padding)
        slices.append(acc)

    if slices:
        output = torch.stack(slices, 0)
    else:
        # Degenerate input with an empty looped dimension.
        output = data.new_zeros(0, b, c_out, w, d, t)
    if use_half:
        output = output.half()

    output = output.permute(1, 2, 0, 3, 4, 5).contiguous()
    return output
class fullConv4d(torch.nn.Module):
    """Conv4d layer that is followed by BatchNorm1d when it has no bias."""

    def __init__(self, in_channels, out_channels, kernel_size, bias=True, pre_permuted_filters=True):
        super(fullConv4d, self).__init__()
        self.conv = Conv4d(
            in_channels,
            out_channels,
            kernel_size,
            bias=bias,
            pre_permuted_filters=pre_permuted_filters,
        )
        self.isbias = bias
        # Normalisation replaces the bias term when the bias is disabled.
        if not bias:
            self.bn = torch.nn.BatchNorm1d(out_channels)

    def forward(self, input):
        result = self.conv(input)
        if self.isbias:
            return result
        # Collapse the four trailing dims so BatchNorm1d sees (b, c, L),
        # then restore the 6-D layout.
        n, ch, du, dv, hh, ww = result.shape
        flat = result.view(n, ch, -1)
        return self.bn(flat).view(n, ch, du, dv, hh, ww)
class projfeat4d(torch.nn.Module):
    '''
    1x1 channel projection of a 6-D (b, c, u, v, h, w) tensor, implemented
    as a Conv3d over (u, v, h*w) with an optional BatchNorm3d.
    '''
    def __init__(self, in_planes, out_planes, stride, with_bn=True, groups=1):
        super(projfeat4d, self).__init__()
        self.with_bn = with_bn
        self.stride = stride
        # Stride applies to the u and v axes only; the flattened h*w axis
        # keeps stride 1. The conv carries a bias only when BN is disabled.
        self.conv1 = nn.Conv3d(
            in_planes,
            out_planes,
            kernel_size=1,
            stride=(stride, stride, 1),
            padding=0,
            bias=not with_bn,
            groups=groups,
        )
        self.bn = nn.BatchNorm3d(out_planes)

    def forward(self, x):
        batch, chans, du, dv, height, width = x.size()
        y = self.conv1(x.view(batch, chans, du, dv, height*width))
        if self.with_bn:
            y = self.bn(y)
        # Channel count and (u, v) extent may have changed; h and w did not.
        _, out_c, out_u, out_v, _ = y.shape
        return y.view(batch, out_c, out_u, out_v, height, width)
class sepConv4dBlock(torch.nn.Module):
    '''
    Residual block built from two sepConv4d layers, with an optional
    projection on the skip path when the shape changes.
    '''
    def __init__(self, in_planes, out_planes, stride=(1,1,1), with_bn=True, full=True, groups=1):
        super(sepConv4dBlock, self).__init__()
        shape_preserved = in_planes == out_planes and stride == (1, 1, 1)
        if shape_preserved:
            # Identity skip connection.
            self.downsample = None
        elif full:
            self.downsample = sepConv4d(in_planes, out_planes, stride,
                                        with_bn=with_bn, ksize=1, full=full, groups=groups)
        else:
            self.downsample = projfeat4d(in_planes, out_planes, stride[0],
                                         with_bn=with_bn, groups=groups)
        self.conv1 = sepConv4d(in_planes, out_planes, stride,
                               with_bn=with_bn, full=full, groups=groups)
        self.conv2 = sepConv4d(out_planes, out_planes, (1, 1, 1),
                               with_bn=with_bn, full=full, groups=groups)
        self.relu1 = nn.ReLU(inplace=True)
        self.relu2 = nn.ReLU(inplace=True)

    #@profile
    def forward(self, x):
        branch = self.relu1(self.conv1(x))
        skip = x if self.downsample is None else self.downsample(x)
        return self.relu2(skip + self.conv2(branch))
def create_model(arch, heads, head_conv, num_input):
    """Instantiate a network from an ``'<family>[_<layers>]'`` string.

    The text before the first underscore selects the builder from
    ``_model_factory``; the text after it is parsed as the layer count
    (0 when ``arch`` contains no underscore).
    """
    family, sep, depth = arch.partition('_')
    num_layers = int(depth) if sep else 0
    builder = _model_factory[family]
    return builder(num_layers=num_layers, heads=heads,
                   head_conv=head_conv, num_input=num_input)
def save_model(path, epoch, model, optimizer=None):
    """Write a checkpoint with the epoch, model weights and optional optimizer state.

    A ``DataParallel`` wrapper is unwrapped first, so the stored
    ``state_dict`` is taken from the wrapped module.
    """
    net = model.module if isinstance(model, torch.nn.DataParallel) else model
    checkpoint = {'epoch': epoch, 'state_dict': net.state_dict()}
    if optimizer is not None:
        checkpoint['optimizer'] = optimizer.state_dict()
    torch.save(checkpoint, path)
division from __future__ import print_function import pdb import torch import torch.nn as nn from .det_utils import _transpose_and_gather_feat import torch.nn.functional as F def _slow_neg_loss(pred, gt): '''focal loss from CornerNet''' pos_inds = gt.eq(1) neg_inds = gt.lt(1) neg_weights = torch.pow(1 - gt[neg_inds], 4) loss = 0 pos_pred = pred[pos_inds] neg_pred = pred[neg_inds] pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, 2) neg_loss = torch.log(1 - neg_pred) * torch.pow(neg_pred, 2) * neg_weights num_pos = pos_inds.float().sum() pos_loss = pos_loss.sum() neg_loss = neg_loss.sum() if pos_pred.nelement() == 0: loss = loss - neg_loss else: loss = loss - (pos_loss + neg_loss) / num_pos return loss def _neg_loss(pred, gt, heat_logits): ''' Modified focal loss. Exactly the same as CornerNet. Runs faster and costs a little bit more memory Arguments: pred (batch x c x h x w) gt_regr (batch x c x h x w) ''' pos_inds = gt.eq(1).float() neg_inds = gt.lt(1).float() neg_weights = torch.pow(1 - gt, 4) loss = 0 logpred = torch.nn.functional.log_softmax(heat_logits,1) pos_loss = logpred[:,0:1] * torch.pow(1 - pred, 2) * pos_inds neg_loss = logpred[:,1:2] * torch.pow(pred, 2) * neg_weights * neg_inds num_pos = pos_inds.float().sum() pos_loss = pos_loss.sum() neg_loss = neg_loss.sum() if num_pos == 0: loss = loss - neg_loss else: loss = loss - (pos_loss + neg_loss) / num_pos return loss def _not_faster_neg_loss(pred, gt): pos_inds = gt.eq(1).float() neg_inds = gt.lt(1).float() num_pos = pos_inds.float().sum() neg_weights = torch.pow(1 - gt, 4) loss = 0 trans_pred = pred * neg_inds + (1 - pred) * pos_inds weight = neg_weights * neg_inds + pos_inds all_loss = torch.log(1 - trans_pred) * torch.pow(trans_pred, 2) * weight all_loss = all_loss.sum() if num_pos > 0: all_loss /= num_pos loss -= all_loss return loss def _slow_reg_loss(regr, gt_regr, mask): num = mask.float().sum() mask = mask.unsqueeze(2).expand_as(gt_regr) regr = regr[mask] gt_regr = gt_regr[mask] regr_loss = 
nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False) regr_loss = regr_loss / (num + 1e-4) return regr_loss def _reg_loss(regr, gt_regr, mask): ''' L1 regression loss Arguments: regr (batch x max_objects x dim) gt_regr (batch x max_objects x dim) mask (batch x max_objects) ''' num = mask.float().sum() mask = mask.unsqueeze(2).expand_as(gt_regr).float() regr = regr * mask gt_regr = gt_regr * mask regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False) regr_loss = regr_loss / (num + 1e-4) return regr_loss class FocalLoss(nn.Module): '''nn.Module warpper for focal loss''' def __init__(self): super(FocalLoss, self).__init__() self.neg_loss = _neg_loss def forward(self, out, target, logits): return self.neg_loss(out, target, logits) class RegLoss(nn.Module): '''Regression loss for an output tensor Arguments: output (batch x dim x h x w) mask (batch x max_objects) ind (batch x max_objects) target (batch x max_objects x dim) ''' def __init__(self): super(RegLoss, self).__init__() def forward(self, output, mask, ind, target): pred = _transpose_and_gather_feat(output, ind) loss = _reg_loss(pred, target, mask) return loss class RegL1Loss(nn.Module): def __init__(self): super(RegL1Loss, self).__init__() def forward(self, output, mask, ind, target): pred = _transpose_and_gather_feat(output, ind) mask = mask.unsqueeze(2).expand_as(pred).float() # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') loss = F.l1_loss(pred * mask, target * mask, size_average=False) loss = loss / (mask.sum() + 1e-4) return loss class NormRegL1Loss(nn.Module): def __init__(self): super(NormRegL1Loss, self).__init__() def forward(self, output, mask, ind, target): pred = _transpose_and_gather_feat(output, ind) mask = mask.unsqueeze(2).expand_as(pred).float() # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') pred = pred / (target + 1e-4) target = target * 0 + 1 loss = F.l1_loss(pred * mask, target * mask, size_average=False) 
def compute_bin_loss(output, target, mask):
    """Mean cross-entropy of the bin logits after multiplying them by ``mask``.

    Rows whose mask is 0 have their logits zeroed; they are still included
    in the mean.

    Arguments:
      output: logits, one row per sample
      target: class index per row
      mask: tensor expandable to the shape of ``output``
    """
    mask = mask.expand_as(output)
    output = output * mask.float()
    # 'mean' is the supported name for the deprecated 'elementwise_mean'.
    return F.cross_entropy(output, target, reduction='mean')
0] valid_output1 = torch.index_select(output, 0, idx1.long()) valid_target_res1 = torch.index_select(target_res, 0, idx1.long()) loss_sin1 = compute_res_loss( valid_output1[:, 2], torch.sin(valid_target_res1[:, 0])) loss_cos1 = compute_res_loss( valid_output1[:, 3], torch.cos(valid_target_res1[:, 0])) loss_res += loss_sin1 + loss_cos1 if target_bin[:, 1].nonzero().shape[0] > 0: idx2 = target_bin[:, 1].nonzero()[:, 0] valid_output2 = torch.index_select(output, 0, idx2.long()) valid_target_res2 = torch.index_select(target_res, 0, idx2.long()) loss_sin2 = compute_res_loss( valid_output2[:, 6], torch.sin(valid_target_res2[:, 1])) loss_cos2 = compute_res_loss( valid_output2[:, 7], torch.cos(valid_target_res2[:, 1])) loss_res += loss_sin2 + loss_cos2 return loss_bin1 + loss_bin2 + loss_res ================================================ FILE: Network/rigidmask/det_utils.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import torch import torch.nn as nn def _sigmoid(x): y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) return y def _gather_feat(feat, ind, mask=None): dim = feat.size(2) ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) feat = feat.gather(1, ind) if mask is not None: mask = mask.unsqueeze(2).expand_as(feat) feat = feat[mask] feat = feat.view(-1, dim) return feat def _transpose_and_gather_feat(feat, ind): feat = feat.permute(0, 2, 3, 1).contiguous() feat = feat.view(feat.size(0), -1, feat.size(3)) feat = _gather_feat(feat, ind) return feat def flip_tensor(x): return torch.flip(x, [3]) # tmp = x.detach().cpu().numpy()[..., ::-1].copy() # return torch.from_numpy(tmp).to(x.device) def flip_lr(x, flip_idx): tmp = x.detach().cpu().numpy()[..., ::-1].copy() shape = tmp.shape for e in flip_idx: tmp[:, e[0], ...], tmp[:, e[1], ...] 
= \ tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() return torch.from_numpy(tmp.reshape(shape)).to(x.device) def flip_lr_off(x, flip_idx): tmp = x.detach().cpu().numpy()[..., ::-1].copy() shape = tmp.shape tmp = tmp.reshape(tmp.shape[0], 17, 2, tmp.shape[2], tmp.shape[3]) tmp[:, :, 0, :, :] *= -1 for e in flip_idx: tmp[:, e[0], ...], tmp[:, e[1], ...] = \ tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() return torch.from_numpy(tmp.reshape(shape)).to(x.device) ================================================ FILE: Network/rigidmask/networks/DCNv2/.gitignore ================================================ .vscode .idea *.so *.o *pyc _ext build DCNv2.egg-info dist ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/__init__.py ================================================ from .dcn_v2 import * ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/dcn_v2.py ================================================ #!/usr/bin/env python from __future__ import absolute_import from __future__ import print_function from __future__ import division import math import torch from torch import nn from torch.autograd import Function from torch.nn.modules.utils import _pair from torch.autograd.function import once_differentiable import _ext as _backend class _DCNv2(Function): @staticmethod def forward(ctx, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups): ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.kernel_size = _pair(weight.shape[2:4]) ctx.deformable_groups = deformable_groups output = _backend.dcn_v2_forward(input, weight, bias, offset, mask, ctx.kernel_size[0], ctx.kernel_size[1], ctx.stride[0], ctx.stride[1], ctx.padding[0], ctx.padding[1], ctx.dilation[0], ctx.dilation[1], ctx.deformable_groups) ctx.save_for_backward(input, offset, mask, weight, bias) return output @staticmethod @once_differentiable def backward(ctx, 
grad_output): input, offset, mask, weight, bias = ctx.saved_tensors grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ _backend.dcn_v2_backward(input, weight, bias, offset, mask, grad_output, ctx.kernel_size[0], ctx.kernel_size[1], ctx.stride[0], ctx.stride[1], ctx.padding[0], ctx.padding[1], ctx.dilation[0], ctx.dilation[1], ctx.deformable_groups) return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ None, None, None, None, dcn_v2_conv = _DCNv2.apply class DCNv2(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, deformable_groups=1): super(DCNv2, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) self.padding = _pair(padding) self.dilation = _pair(dilation) self.deformable_groups = deformable_groups self.weight = nn.Parameter(torch.Tensor( out_channels, in_channels, *self.kernel_size)) self.bias = nn.Parameter(torch.Tensor(out_channels)) self.reset_parameters() def reset_parameters(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. 
/ math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) self.bias.data.zero_() def forward(self, input, offset, mask): assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ offset.shape[1] assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ mask.shape[1] return dcn_v2_conv(input, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.deformable_groups) class DCN(DCNv2): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, deformable_groups=1): super(DCN, self).__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, deformable_groups) channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] self.conv_offset_mask = nn.Conv2d(self.in_channels, channels_, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, bias=True) self.init_offset() def init_offset(self): self.conv_offset_mask.weight.data.zero_() self.conv_offset_mask.bias.data.zero_() def forward(self, input): out = self.conv_offset_mask(input) o1, o2, mask = torch.chunk(out, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) return dcn_v2_conv(input, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.deformable_groups) class _DCNv2Pooling(Function): @staticmethod def forward(ctx, input, rois, offset, spatial_scale, pooled_size, output_dim, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0): ctx.spatial_scale = spatial_scale ctx.no_trans = int(no_trans) ctx.output_dim = output_dim ctx.group_size = group_size ctx.pooled_size = pooled_size ctx.part_size = pooled_size if part_size is None else part_size ctx.sample_per_part = sample_per_part ctx.trans_std = trans_std output, output_count = \ _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, ctx.no_trans, ctx.spatial_scale, ctx.output_dim, ctx.group_size, ctx.pooled_size, ctx.part_size, 
ctx.sample_per_part, ctx.trans_std) ctx.save_for_backward(input, rois, offset, output_count) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, rois, offset, output_count = ctx.saved_tensors grad_input, grad_offset = \ _backend.dcn_v2_psroi_pooling_backward(grad_output, input, rois, offset, output_count, ctx.no_trans, ctx.spatial_scale, ctx.output_dim, ctx.group_size, ctx.pooled_size, ctx.part_size, ctx.sample_per_part, ctx.trans_std) return grad_input, None, grad_offset, \ None, None, None, None, None, None, None, None dcn_v2_pooling = _DCNv2Pooling.apply class DCNv2Pooling(nn.Module): def __init__(self, spatial_scale, pooled_size, output_dim, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0): super(DCNv2Pooling, self).__init__() self.spatial_scale = spatial_scale self.pooled_size = pooled_size self.output_dim = output_dim self.no_trans = no_trans self.group_size = group_size self.part_size = pooled_size if part_size is None else part_size self.sample_per_part = sample_per_part self.trans_std = trans_std def forward(self, input, rois, offset): assert input.shape[1] == self.output_dim if self.no_trans: offset = input.new() return dcn_v2_pooling(input, rois, offset, self.spatial_scale, self.pooled_size, self.output_dim, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) class DCNPooling(DCNv2Pooling): def __init__(self, spatial_scale, pooled_size, output_dim, no_trans, group_size=1, part_size=None, sample_per_part=4, trans_std=.0, deform_fc_dim=1024): super(DCNPooling, self).__init__(spatial_scale, pooled_size, output_dim, no_trans, group_size, part_size, sample_per_part, trans_std) self.deform_fc_dim = deform_fc_dim if not no_trans: self.offset_mask_fc = nn.Sequential( nn.Linear(self.pooled_size * self.pooled_size * self.output_dim, self.deform_fc_dim), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_dim, self.deform_fc_dim), nn.ReLU(inplace=True), 
nn.Linear(self.deform_fc_dim, self.pooled_size * self.pooled_size * 3) ) self.offset_mask_fc[4].weight.data.zero_() self.offset_mask_fc[4].bias.data.zero_() def forward(self, input, rois): offset = input.new() if not self.no_trans: # do roi_align first n = rois.shape[0] roi = dcn_v2_pooling(input, rois, offset, self.spatial_scale, self.pooled_size, self.output_dim, True, # no trans self.group_size, self.part_size, self.sample_per_part, self.trans_std) # build mask and offset offset_mask = self.offset_mask_fc(roi.view(n, -1)) offset_mask = offset_mask.view( n, 3, self.pooled_size, self.pooled_size) o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) # do pooling with offset and mask return dcn_v2_pooling(input, rois, offset, self.spatial_scale, self.pooled_size, self.output_dim, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) * mask # only roi_align return dcn_v2_pooling(input, rois, offset, self.spatial_scale, self.pooled_size, self.output_dim, self.no_trans, self.group_size, self.part_size, self.sample_per_part, self.trans_std) ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cpu/dcn_v2_cpu.cpp ================================================ #include #include "cpu/dcn_v2_im2col_cpu.h" #include #include //#include #include //#include //#include //extern THCState *state; // author: Charles Shang // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu // modified from the CUDA version for CPU use by Daniel K. 
Suhendro // edit by: James Bockman and Matthew Howe // modified for torch implementation to remove use of deprecated torch access to Blas at::Tensor dcn_v2_cpu_forward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int deformable_group) { // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_out = weight.size(0); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); // printf("Channels: %d %d\n", channels, channels_kernel); // printf("Channels: %d %d\n", channels_out, channels_kernel); AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); AT_ASSERTM(channels == channels_kernel, "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; // auto ones = at::ones({height_out, width_out}, input.options()); auto ones = at::ones({bias.sizes()[0], 
height_out, width_out}, input.options()); auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); auto output = at::zeros({batch, channels_out, height_out, width_out}, input.options()); using scalar_t = float; for (int b = 0; b < batch; b++) { auto input_n = input.select(0, b); auto offset_n = offset.select(0, b); auto mask_n = mask.select(0, b); auto output_n = output.select(0, b); // std::cout << "output_n: " << output_n << "output.select(0,b): " << output.select(0,b) << "\n"; // Do Bias first: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) // (N x 1) (1 x M) // torch implementation auto ones_T = at::transpose(ones.contiguous(), 2, 0); ones_T = at::mul(ones_T, bias.contiguous()); ones_T = at::transpose(ones_T, 2, 0); output_n = at::add(output_n, ones_T); modulated_deformable_im2col_cpu(input_n.data_ptr(), offset_n.data_ptr(), mask_n.data_ptr(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns.data_ptr()); //(k * m) x (m * n) // Y = WC // torch implementation auto weight_flat = weight.view({channels_out, channels * kernel_h * kernel_w}); auto product = at::matmul(weight_flat, columns); output.select(0, b) = at::add(output_n, product.view({channels_out, height_out, width_out})); } return output; } std::vector dcn_v2_cpu_backward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const at::Tensor &grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int deformable_group) { THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 
AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_out = weight.size(0); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); AT_ASSERTM(channels == channels_kernel, "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; auto ones = at::ones({height_out, width_out}, input.options()); auto columns = at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); auto grad_input = at::zeros_like(input); auto grad_weight = at::zeros_like(weight); auto grad_bias = at::zeros_like(bias); auto grad_offset = at::zeros_like(offset); auto grad_mask = at::zeros_like(mask); using scalar_t = float; for (int b = 0; b < batch; b++) { auto input_n = input.select(0, b); auto offset_n = offset.select(0, b); auto mask_n = mask.select(0, b); auto grad_output_n = grad_output.select(0, b); auto grad_input_n = grad_input.select(0, b); auto grad_offset_n = grad_offset.select(0, b); auto grad_mask_n = grad_mask.select(0, b); // Torch implementation auto weight_flat = weight.view({channels_out, channels*kernel_h*kernel_w}); weight_flat = 
at::transpose(weight_flat, 1, 0); auto grad_output_n_flat = grad_output_n.view({channels_out, height_out*width_out}); columns = at::matmul(weight_flat, grad_output_n_flat); // gradient w.r.t. input coordinate data modulated_deformable_col2im_coord_cpu(columns.data_ptr(), input_n.data_ptr(), offset_n.data_ptr(), mask_n.data_ptr(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_offset_n.data_ptr(), grad_mask_n.data_ptr()); // gradient w.r.t. input data modulated_deformable_col2im_cpu(columns.data_ptr(), offset_n.data_ptr(), mask_n.data_ptr(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_input_n.data_ptr()); // gradient w.r.t. weight, dWeight should accumulate across the batch and group modulated_deformable_im2col_cpu(input_n.data_ptr(), offset_n.data_ptr(), mask_n.data_ptr(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns.data_ptr()); // Torch implementation auto product = at::matmul(grad_output_n_flat, at::transpose(columns, 1, 0)); grad_weight = at::add(grad_weight, product.view({channels_out, channels, kernel_h, kernel_w})); // Torch implementation auto ones_flat = ones.view({height_out*width_out}); product = at::matmul(grad_output_n_flat, ones_flat); grad_bias = at::add(grad_bias, product); } return { grad_input, grad_offset, grad_mask, grad_weight, grad_bias }; } ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cpu/dcn_v2_im2col_cpu.cpp ================================================ #include "dcn_v2_im2col_cpu.h" #include #include #include #include //#include #include //#include //#include // modified from the CUDA version for CPU use by Daniel K. 
Suhendro /*#define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; }*/ float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, const int height, const int width, float h, float w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; float lh = h - h_low; float lw = w - w_low; float hh = 1 - lh, hw = 1 - lw; float v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; float v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; float v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; float v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; float weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w, const int height, const int 
width, const float *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; float weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, 
const int deformable_group, const int height_col, const int width_col, float *data_col) { // launch channels * batch_size * height_col * width_col cores for(int index=0; index(0); const float h_im = h_in + i * dilation_h + offset_h; const float w_im = w_in + j * dilation_w + offset_w; //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { //const float map_h = i * dilation_h + offset_h; //const float map_w = j * dilation_w + offset_w; //const int cur_height = height - h_in; //const int cur_width = width - w_in; //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val * mask; // data_col_ptr += batch_size * height_col * width_col; data_col_ptr += height_col * width_col; } } } } void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, float *grad_im) { for(int index = 0; index < n; index++) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const float 
*data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; const float mask = data_mask_ptr[data_mask_hw_ptr]; const float cur_inv_h_data = h_in + i * dilation_h + offset_h; const float cur_inv_w_data = w_in + j * dilation_w + offset_w; const float cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; } } } } } void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int 
batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, float *grad_offset, float *grad_mask) { for(int index = 0; index < n; index++) { float val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + 
h_out) * width_col + w_out); const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; const float mask = data_mask_ptr[data_mask_hw_ptr]; float inv_h = h_in + i * dilation_h + offset_h; float inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); } const float weight = dmcn_get_coordinate_weight_cpu( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c % 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* data_col) { // num_axes should be smaller than block size const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * batch_size * height_col * width_col; modulated_deformable_im2col_cpu_kernel( num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 
channels, deformable_group, height_col, width_col, data_col); /*cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); }*/ } void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* grad_im){ const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; modulated_deformable_col2im_cpu_kernel( num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, deformable_group, height_col, width_col, grad_im); /*cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); }*/ } void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* grad_offset, float* grad_mask) { const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; modulated_deformable_col2im_coord_cpu_kernel( 
num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, grad_offset, grad_mask); /*cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); }*/ } ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cpu/dcn_v2_im2col_cpu.h ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.h * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. * \ref: https://arxiv.org/abs/1811.11168 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu */ /***************** Adapted by Charles Shang *********************/ // modified from the CUDA version for CPU use by Daniel K. 
Suhendro #ifndef DCN_V2_IM2COL_CPU #define DCN_V2_IM2COL_CPU #ifdef __cplusplus extern "C" { #endif void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *data_col); void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *grad_im); void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *grad_offset, float *grad_mask); #ifdef __cplusplus } #endif #endif ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cpu/dcn_v2_psroi_pooling_cpu.cpp ================================================ /*! * Copyright (c) 2017 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file deformable_psroi_pooling.cu * \brief * \author Yi Li, Guodong Zhang, Jifeng Dai */ /***************** Adapted by Charles Shang *********************/ // modified from the CUDA version for CPU use by Daniel K. 
Suhendro #include #include #include #include //#include #include //#include //#include /*#define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; }*/ template T bilinear_interp_cpu( const T *data, const T x, const T y, const int width, const int height) { int x1 = floor(x); int x2 = ceil(x); int y1 = floor(y); int y2 = ceil(y); T dist_x = static_cast(x - x1); T dist_y = static_cast(y - y1); T value11 = data[y1 * width + x1]; T value12 = data[y2 * width + x1]; T value21 = data[y1 * width + x2]; T value22 = data[y2 * width + x2]; T value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; return value; } template void DeformablePSROIPoolForwardKernelCpu( const int count, const T *bottom_data, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const T *bottom_rois, const T *bottom_trans, const int no_trans, const T trans_std, const int sample_per_part, const int output_dim, const int group_size, const int part_size, const int num_classes, const int channels_each_class, T *top_data, T *top_count) { for(int index = 0; index < count; index++) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const T *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; T roi_end_w = 
static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); // Compute w and h at bottom T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); int part_h = floor(static_cast(ph) / pooled_height * part_size); int part_w = floor(static_cast(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; T wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; T hstart = static_cast(ph) * bin_size_h + roi_start_h; hstart += trans_y * roi_height; T sum = 0; int count = 0; int gw = floor(static_cast(pw) * group_size / pooled_width); int gh = floor(static_cast(ph) * group_size / pooled_height); gw = std::min(std::max(gw, 0), group_size - 1); gh = std::min(std::max(gh, 0), group_size - 1); const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { T w = wstart + iw * sub_bin_size_w; T h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = std::min(std::max(w, T(0.)), width - T(1.)); h = std::min(std::max(h, T(0.)), height - T(1.)); int c = (ctop * group_size + gh) * group_size + gw; T val = 
bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); sum += val; count++; } } top_data[index] = count == 0 ? static_cast(0) : sum / count; top_count[index] = count; } } template void DeformablePSROIPoolBackwardAccKernelCpu( const int count, const T *top_diff, const T *top_count, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int output_dim, T *bottom_data_diff, T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, const T *bottom_trans, const int no_trans, const T trans_std, const int sample_per_part, const int group_size, const int part_size, const int num_classes, const int channels_each_class) { for(int index = 0; index < count; index++) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const T *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; // Force too small ROIs to be 1x1 T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); // Compute w and h at bottom T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); int part_h = floor(static_cast(ph) / pooled_height * part_size); int part_w = floor(static_cast(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; T wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; T hstart = static_cast(ph) * bin_size_h + roi_start_h; hstart += trans_y * roi_height; if (top_count[index] <= 0) { continue; } T diff_val = top_diff[index] / top_count[index]; const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; int gw = floor(static_cast(pw) * group_size / pooled_width); int gh = floor(static_cast(ph) * group_size / pooled_height); gw = std::min(std::max(gw, 0), group_size - 1); gh = std::min(std::max(gh, 0), group_size - 1); for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { T w = wstart + iw * sub_bin_size_w; T h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = std::min(std::max(w, T(0.)), width - T(1.)); h = std::min(std::max(h, T(0.)), height - T(1.)); int c = (ctop * group_size + gh) * group_size + gw; // 
backward on feature int x0 = floor(w); int x1 = ceil(w); int y0 = floor(h); int y1 = ceil(h); T dist_x = w - x0, dist_y = h - y0; T q00 = (1 - dist_x) * (1 - dist_y); T q01 = (1 - dist_x) * dist_y; T q10 = dist_x * (1 - dist_y); T q11 = dist_x * dist_y; int bottom_index_base = c * height * width; /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; if (no_trans) { continue; } T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; diff_x *= roi_width; T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + 
part_w) += diff_y; } } } } std::tuple dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); auto pooled_height = pooled_size; auto pooled_width = pooled_size; auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); long out_size = num_bbox * output_dim * pooled_height * pooled_width; auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (out.numel() == 0) { //THCudaCheck(cudaGetLastError()); return std::make_tuple(out, top_count); } /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); dim3 block(512);*/ AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { DeformablePSROIPoolForwardKernelCpu( out_size, input.contiguous().data(), spatial_scale, channels, height, width, pooled_height, pooled_width, bbox.contiguous().data(), trans.contiguous().data(), no_trans, trans_std, sample_per_part, output_dim, group_size, part_size, num_classes, channels_each_class, out.data(), top_count.data()); }); //THCudaCheck(cudaGetLastError()); return std::make_tuple(out, top_count); } std::tuple dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const at::Tensor &top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); auto pooled_height = pooled_size; auto pooled_width = pooled_size; long out_size = num_bbox * output_dim * pooled_height * pooled_width; const int num_classes = no_trans ? 
1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); auto trans_grad = at::zeros_like(trans); if (input_grad.numel() == 0) { //THCudaCheck(cudaGetLastError()); return std::make_tuple(input_grad, trans_grad); } /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); dim3 block(512); cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { DeformablePSROIPoolBackwardAccKernelCpu( out_size, out_grad.contiguous().data(), top_count.contiguous().data(), num_bbox, spatial_scale, channels, height, width, pooled_height, pooled_width, output_dim, input_grad.contiguous().data(), trans_grad.contiguous().data(), input.contiguous().data(), bbox.contiguous().data(), trans.contiguous().data(), no_trans, trans_std, sample_per_part, group_size, part_size, num_classes, channels_each_class); }); //THCudaCheck(cudaGetLastError()); return std::make_tuple(input_grad, trans_grad); } ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cpu/vision.h ================================================ #pragma once #include at::Tensor dcn_v2_cpu_forward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int deformable_group); std::vector dcn_v2_cpu_backward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const at::Tensor &grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int deformable_group); std::tuple dcn_v2_psroi_pooling_cpu_forward(const 
at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); std::tuple dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const at::Tensor &top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cuda/dcn_v2_cuda.cu ================================================ #include #include "cuda/dcn_v2_im2col_cuda.h" #include #include #include #include #include THCState *state = at::globalContext().lazyInitCUDA(); // author: Charles Shang // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu // [batch gemm] // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, float **columns_b, const float **ones_b, const float **weight_b, const float **bias_b, float *input, float *output, float *columns, float *ones, float *weight, float *bias, const int input_stride, const int output_stride, const int columns_stride, const int ones_stride, const int num_batches) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_batches) { input_b[idx] = input + idx * input_stride; output_b[idx] = output + idx * output_stride; columns_b[idx] = columns + idx * columns_stride; ones_b[idx] = ones + idx * ones_stride; // share weights and bias within a Mini-Batch weight_b[idx] = weight; bias_b[idx] = bias; } } at::Tensor dcn_v2_cuda_forward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor 
&offset,
                    const at::Tensor &mask,
                    const int kernel_h,
                    const int kernel_w,
                    const int stride_h,
                    const int stride_w,
                    const int pad_h,
                    const int pad_w,
                    const int dilation_h,
                    const int dilation_w,
                    const int deformable_group)
{
    using scalar_t = float;
    // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
    // Every tensor argument must already be a CUDA tensor.
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
    AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    // input is read as (batch, channels, height, width).
    const int batch = input.size(0);
    const int channels = input.size(1);
    const int height = input.size(2);
    const int width = input.size(3);

    // weight is read as (channels_out, channels_kernel, kernel_h_, kernel_w_).
    const int channels_out = weight.size(0);
    const int channels_kernel = weight.size(1);
    const int kernel_h_ = weight.size(2);
    const int kernel_w_ = weight.size(3);

    // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h);
    // printf("Channels: %d %d\n", channels, channels_kernel);
    // printf("Channels: %d %d\n", channels_out, channels_kernel);

    // The requested kernel size must agree with the weight tensor; the
    // diagnostic reports requested (kernel_h x kernel_w) vs actual
    // (kernel_h_ x kernel_w_).
    AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
               "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);

    AT_ASSERTM(channels == channels_kernel,
               "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel);

    // Convolution output size for the given padding, dilation and stride.
    const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

    // Work buffers: an all-ones tensor per batch item, the per-batch column
    // buffer, and the (uninitialised) output tensor.
    auto ones = at::ones({batch, height_out, width_out}, input.options());
    auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
    auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());

    // prepare for batch-wise computing, which is significantly faster than
instance-wise computing // when batch size is large. // launch batch threads int matrices_size = batch * sizeof(float *); auto input_b = static_cast(THCudaMalloc(state, matrices_size)); auto output_b = static_cast(THCudaMalloc(state, matrices_size)); auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); const int block = 128; const int grid = (batch + block - 1) / block; createBatchGemmBuffer<<>>( input_b, output_b, columns_b, ones_b, weight_b, bias_b, input.data(), output.data(), columns.data(), ones.data(), weight.data(), bias.data(), channels * width * height, channels_out * width_out * height_out, channels * kernel_h * kernel_w * height_out * width_out, height_out * width_out, batch); long m_ = channels_out; long n_ = height_out * width_out; long k_ = 1; THCudaBlas_SgemmBatched(state, 't', 'n', n_, m_, k_, 1.0f, ones_b, k_, bias_b, k_, 0.0f, output_b, n_, batch); modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), input.data(), offset.data(), mask.data(), batch, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns.data()); long m = channels_out; long n = height_out * width_out; long k = channels * kernel_h * kernel_w; THCudaBlas_SgemmBatched(state, 'n', 'n', n, m, k, 1.0f, (const float **)columns_b, n, weight_b, k, 1.0f, output_b, n, batch); THCudaFree(state, input_b); THCudaFree(state, output_b); THCudaFree(state, columns_b); THCudaFree(state, ones_b); THCudaFree(state, weight_b); THCudaFree(state, bias_b); return output; } __global__ void createBatchGemmBufferBackward( float **grad_output_b, float **columns_b, float **ones_b, float **weight_b, float **grad_weight_b, float **grad_bias_b, float *grad_output, float *columns, float *ones, float 
*weight, float *grad_weight, float *grad_bias, const int grad_output_stride, const int columns_stride, const int ones_stride, const int num_batches) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num_batches) { grad_output_b[idx] = grad_output + idx * grad_output_stride; columns_b[idx] = columns + idx * columns_stride; ones_b[idx] = ones + idx * ones_stride; // share weights and bias within a Mini-Batch weight_b[idx] = weight; grad_weight_b[idx] = grad_weight; grad_bias_b[idx] = grad_bias; } } std::vector dcn_v2_cuda_backward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const at::Tensor &grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int deformable_group) { THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_out = weight.size(0); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); AT_ASSERTM(channels == channels_kernel, "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 
1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; auto ones = at::ones({height_out, width_out}, input.options()); auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); auto grad_input = at::zeros_like(input); auto grad_weight = at::zeros_like(weight); auto grad_bias = at::zeros_like(bias); auto grad_offset = at::zeros_like(offset); auto grad_mask = at::zeros_like(mask); using scalar_t = float; for (int b = 0; b < batch; b++) { auto input_n = input.select(0, b); auto offset_n = offset.select(0, b); auto mask_n = mask.select(0, b); auto grad_output_n = grad_output.select(0, b); auto grad_input_n = grad_input.select(0, b); auto grad_offset_n = grad_offset.select(0, b); auto grad_mask_n = grad_mask.select(0, b); long m = channels * kernel_h * kernel_w; long n = height_out * width_out; long k = channels_out; THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, grad_output_n.data(), n, weight.data(), m, 0.0f, columns.data(), n); // gradient w.r.t. input coordinate data modulated_deformable_col2im_coord_cuda(c10::cuda::getCurrentCUDAStream(), columns.data(), input_n.data(), offset_n.data(), mask_n.data(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_offset_n.data(), grad_mask_n.data()); // gradient w.r.t. input data modulated_deformable_col2im_cuda(c10::cuda::getCurrentCUDAStream(), columns.data(), offset_n.data(), mask_n.data(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_input_n.data()); // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), input_n.data(), offset_n.data(), mask_n.data(), 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns.data()); long m_ = channels_out; long n_ = channels * kernel_h * kernel_w; long k_ = height_out * width_out; THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, columns.data(), k_, grad_output_n.data(), k_, 1.0f, grad_weight.data(), n_); // gradient w.r.t. bias // long m_ = channels_out; // long k__ = height_out * width_out; // THCudaBlas_Sgemm(state, // 't', 'n', // k_, m_, 1, 1.0f, // grad_output_n.data(), k_, // ones.data(), 1, 1.0f, // grad_bias.data(), 1); THCudaBlas_Sgemm(state, 'N', 'N', 1, m_, k_, 1.0f, ones.data(), 1, grad_output_n.data(), k_, 1.0f, grad_bias.data(), 1); } return { grad_input, grad_offset, grad_mask, grad_weight, grad_bias }; } ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cuda/dcn_v2_im2col_cuda.cu ================================================ #include "dcn_v2_im2col_cuda.h" #include #include #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } __device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, const int height, const int width, float h, float w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; float lh = h - h_low; float lw = w - w_low; float hh = 1 - lh, hw = 1 - lw; float v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; float v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = 
bottom_data[h_low * data_width + w_high]; float v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; float v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } /* dmcn_get_gradient_weight_cuda: bilinear weight that integer pixel (h, w) receives from the fractional sample location (argmax_h, argmax_w). Returns 0 when the sample lies outside the open range (-1, height) x (-1, width), or when (h, w) is not one of the four neighbours floor(argmax) / floor(argmax) + 1. */ __device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; float weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } /* dmcn_get_coordinate_weight_cuda: derivative of the bilinearly sampled value of im_data (row stride data_width) with respect to the sample coordinate. bp_dir == 0 differentiates along h, bp_dir == 1 along w; any other bp_dir leaves the result at 0. Neighbours that fall outside the image are skipped, and a sample outside (-1, height) x (-1, width) yields 0. */ __device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, const int height, const int width, const float *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { //empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; float weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 
0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } /* modulated_deformable_im2col_gpu_kernel: forward im2col. Each loop index decodes to (b_col, c_im, h_col, w_col). For every kernel tap (i, j) the kernel reads the tap's h/w offsets and mask, bilinearly samples data_im at the shifted location (0 if the location is outside (-1, height) x (-1, width)), and stores sample * mask into data_col, advancing the output pointer by height_col * width_col per tap. */ __global__ void modulated_deformable_im2col_gpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, float *data_col) { // launch channels * batch_size * height_col * width_col cores CUDA_KERNEL_LOOP(index, n) { // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; // const int b_col = (index / width_col / height_col) % batch_size; const int b_col = (index / width_col / height_col / num_channels) % batch_size; // const int c_im = (index / width_col / height_col) / 
batch_size; const int c_im = (index / width_col / height_col) % num_channels; // const int c_col = c_im * kernel_h * kernel_w; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; const float mask = data_mask_ptr[data_mask_hw_ptr]; float val = static_cast(0); const float h_im = h_in + i * dilation_h + offset_h; const float w_im = w_in + j * dilation_w + offset_w; //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { //const float map_h = i * dilation_h + offset_h; //const float map_w = j * dilation_w + offset_w; //const int 
cur_height = height - h_in; //const int cur_width = width - w_in; //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val * mask; // data_col_ptr += batch_size * height_col * width_col; data_col_ptr += height_col * width_col; } } } } /* modulated_deformable_col2im_gpu_kernel: backward pass to the input image. Each loop index decodes to (c, i, j, b, h_out, w_out). The kernel multiplies data_col[index] by the tap's mask and scatters it with atomicAdd into grad_im at every in-bounds integer pixel within distance 1 of the deformed sample location, weighted by dmcn_get_gradient_weight_cuda. */ __global__ void modulated_deformable_col2im_gpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, float *grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * 
height_col + h_out) * width_col + w_out; const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; const float mask = data_mask_ptr[data_mask_hw_ptr]; const float cur_inv_h_data = h_in + i * dilation_h + offset_h; const float cur_inv_w_data = w_in + j * dilation_w + offset_w; const float cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } /* modulated_deformable_col2im_coord_gpu_kernel: backward pass to the offsets and the mask. Each loop index decodes to (b, c, h, w) with c ranging over offset_channels. The kernel walks the column channels that share this offset channel's kernel tap, accumulating val (gradient for the offset, using dmcn_get_coordinate_weight_cuda) and mval (gradient for the mask, using the bilinear sample). It writes grad_offset[index] and, for even offset channels only, the matching grad_mask entry. */ __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, float *grad_offset, float *grad_mask) { CUDA_KERNEL_LOOP(index, n) { float val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const float 
*data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; const float mask = data_mask_ptr[data_mask_hw_ptr]; float inv_h = h_in + i * dilation_h + offset_h; float inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); } const float weight = dmcn_get_coordinate_weight_cuda(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c % 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } void modulated_deformable_im2col_cuda(cudaStream_t stream, const float* data_im, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* data_col) { // num_axes should be smaller than block size const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * batch_size * height_col * width_col; modulated_deformable_im2col_gpu_kernel <<>>( num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, channels, deformable_group, height_col, width_col, data_col); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_cuda(cudaStream_t stream, const float* data_col, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int 
stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* grad_im){ const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; modulated_deformable_col2im_gpu_kernel <<>>( num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, deformable_group, height_col, width_col, grad_im); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* grad_offset, float* grad_mask) { const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; modulated_deformable_col2im_coord_gpu_kernel <<>>( num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, grad_offset, grad_mask); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); } } 
================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cuda/dcn_v2_im2col_cuda.h ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.h * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. 
* \ref: https://arxiv.org/abs/1811.11168 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu */ /***************** Adapted by Charles Shang *********************/ #ifndef DCN_V2_IM2COL_CUDA #define DCN_V2_IM2COL_CUDA #ifdef __cplusplus extern "C" { #endif void modulated_deformable_im2col_cuda(cudaStream_t stream, const float *data_im, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *data_col); void modulated_deformable_col2im_cuda(cudaStream_t stream, const float *data_col, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *grad_im); void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *grad_offset, float *grad_mask); #ifdef __cplusplus } #endif #endif ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cuda/dcn_v2_psroi_pooling_cuda.cu ================================================ /*! 
* Copyright (c) 2017 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file deformable_psroi_pooling.cu * \brief * \author Yi Li, Guodong Zhang, Jifeng Dai */ /***************** Adapted by Charles Shang *********************/ #include #include #include #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } template __device__ T bilinear_interp_cuda( const T *data, const T x, const T y, const int width, const int height) { int x1 = floor(x); int x2 = ceil(x); int y1 = floor(y); int y2 = ceil(y); T dist_x = static_cast(x - x1); T dist_y = static_cast(y - y1); T value11 = data[y1 * width + x1]; T value12 = data[y2 * width + x1]; T value21 = data[y1 * width + x2]; T value22 = data[y2 * width + x2]; T value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; return value; } template __global__ void DeformablePSROIPoolForwardKernelCuda( const int count, const T *bottom_data, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const T *bottom_rois, const T *bottom_trans, const int no_trans, const T trans_std, const int sample_per_part, const int output_dim, const int group_size, const int part_size, const int num_classes, const int channels_each_class, T *top_data, T *top_count) { CUDA_KERNEL_LOOP(index, count) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const T *offset_bottom_rois = bottom_rois + 
n * 5; int roi_batch_ind = offset_bottom_rois[0]; T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 T roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); int part_h = floor(static_cast(ph) / pooled_height * part_size); int part_w = floor(static_cast(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; T wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; T hstart = static_cast(ph) * bin_size_h + roi_start_h; hstart += trans_y * roi_height; T sum = 0; int count = 0; int gw = floor(static_cast(pw) * group_size / pooled_width); int gh = floor(static_cast(ph) * group_size / pooled_height); gw = min(max(gw, 0), group_size - 1); gh = min(max(gh, 0), group_size - 1); const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { T w = wstart + iw * sub_bin_size_w; T h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); int c = (ctop * group_size + gh) * group_size + gw; T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); sum += val; count++; } } top_data[index] = count == 0 ? 
static_cast(0) : sum / count; top_count[index] = count; } } template __global__ void DeformablePSROIPoolBackwardAccKernelCuda( const int count, const T *top_diff, const T *top_count, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int output_dim, T *bottom_data_diff, T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, const T *bottom_trans, const int no_trans, const T trans_std, const int sample_per_part, const int group_size, const int part_size, const int num_classes, const int channels_each_class) { CUDA_KERNEL_LOOP(index, count) { // The output is in order (n, ctop, ph, pw) int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int ctop = (index / pooled_width / pooled_height) % output_dim; int n = index / pooled_width / pooled_height / output_dim; // [start, end) interval for spatial sampling const T *offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 T roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); int part_h = floor(static_cast(ph) / pooled_height * part_size); int part_w = floor(static_cast(pw) / pooled_width * part_size); int class_id = ctop / channels_each_class; T trans_x = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; T wstart = static_cast(pw) * bin_size_w + roi_start_w; wstart += trans_x * roi_width; T hstart = static_cast(ph) * bin_size_h + roi_start_h; hstart += trans_y * roi_height; if (top_count[index] <= 0) { continue; } T diff_val = top_diff[index] / top_count[index]; const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; int gw = floor(static_cast(pw) * group_size / pooled_width); int gh = floor(static_cast(ph) * group_size / pooled_height); gw = min(max(gw, 0), group_size - 1); gh = min(max(gh, 0), group_size - 1); for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { T w = wstart + iw * sub_bin_size_w; T h = hstart + ih * sub_bin_size_h; // bilinear interpolation if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { continue; } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); int c = (ctop * group_size + gh) * group_size + gw; // backward on feature int x0 = floor(w); int x1 = ceil(w); int y0 = floor(h); int y1 = ceil(h); T dist_x = w - x0, dist_y = h - y0; T q00 = (1 - dist_x) * (1 - dist_y); T q01 = (1 - dist_x) * dist_y; T q10 = dist_x * (1 - dist_y); T q11 = dist_x * dist_y; int bottom_index_base = c * height * width; atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); if (no_trans) { continue; } T U00 = 
offset_bottom_data[bottom_index_base + y0 * width + x0]; T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; diff_x *= roi_width; T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); } } } } std::tuple dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 2 : trans.size(1); const int num_bbox = bbox.size(0); AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); auto pooled_height = pooled_size; auto pooled_width = pooled_size; auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); long out_size = num_bbox * output_dim * pooled_height * pooled_width; auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); const int num_classes = no_trans ? 
1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (out.numel() == 0) { THCudaCheck(cudaGetLastError()); return std::make_tuple(out, top_count); } dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); dim3 block(512); AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { DeformablePSROIPoolForwardKernelCuda<<>>( out_size, input.contiguous().data(), spatial_scale, channels, height, width, pooled_height, pooled_width, bbox.contiguous().data(), trans.contiguous().data(), no_trans, trans_std, sample_per_part, output_dim, group_size, part_size, num_classes, channels_each_class, out.data(), top_count.data()); }); THCudaCheck(cudaGetLastError()); return std::make_tuple(out, top_count); } std::tuple dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const at::Tensor &top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_trans = no_trans ? 
2 : trans.size(1); const int num_bbox = bbox.size(0); AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); auto pooled_height = pooled_size; auto pooled_width = pooled_size; long out_size = num_bbox * output_dim * pooled_height * pooled_width; const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); auto trans_grad = at::zeros_like(trans); if (input_grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return std::make_tuple(input_grad, trans_grad); } dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); dim3 block(512); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { DeformablePSROIPoolBackwardAccKernelCuda<<>>( out_size, out_grad.contiguous().data(), top_count.contiguous().data(), num_bbox, spatial_scale, channels, height, width, pooled_height, pooled_width, output_dim, input_grad.contiguous().data(), trans_grad.contiguous().data(), input.contiguous().data(), bbox.contiguous().data(), trans.contiguous().data(), no_trans, trans_std, sample_per_part, group_size, part_size, num_classes, channels_each_class); }); THCudaCheck(cudaGetLastError()); return std::make_tuple(input_grad, trans_grad); } ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/cuda/vision.h ================================================ #pragma once #include at::Tensor dcn_v2_cuda_forward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int deformable_group); std::vector dcn_v2_cuda_backward(const at::Tensor &input, const 
at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const at::Tensor &grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int deformable_group); std::tuple dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); std::tuple dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const at::Tensor &top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std); ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/src/dcn_v2.h ================================================ #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif at::Tensor dcn_v2_forward(const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int deformable_group) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return dcn_v2_cuda_forward(input, weight, bias, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, deformable_group); #else AT_ERROR("Not compiled with GPU support"); #endif } else{ return dcn_v2_cpu_forward(input, weight, bias, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, deformable_group); } } std::vector dcn_v2_backward(const at::Tensor &input, const 
at::Tensor &weight, const at::Tensor &bias, const at::Tensor &offset, const at::Tensor &mask, const at::Tensor &grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int deformable_group) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return dcn_v2_cuda_backward(input, weight, bias, offset, mask, grad_output, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, deformable_group); #else AT_ERROR("Not compiled with GPU support"); #endif } else{ return dcn_v2_cpu_backward(input, weight, bias, offset, mask, grad_output, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, deformable_group); } } std::tuple dcn_v2_psroi_pooling_forward(const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return dcn_v2_psroi_pooling_cuda_forward(input, bbox, trans, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); #else AT_ERROR("Not compiled with GPU support"); #endif } else{ return dcn_v2_psroi_pooling_cpu_forward(input, bbox, trans, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); } } std::tuple dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, const at::Tensor &input, const at::Tensor &bbox, const at::Tensor &trans, const at::Tensor &top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return dcn_v2_psroi_pooling_cuda_backward(out_grad, input, bbox, trans, top_count, no_trans, spatial_scale, output_dim, group_size, 
def conv_identify(weight, bias):
    """Overwrite ``weight`` and ``bias`` in place with an identity kernel.

    Both tensors are zeroed first. Then, for every channel index shared by
    the output and input dimensions, the centre tap of the kernel that maps
    that channel onto itself is set to 1.

    Args:
        weight: 4-D tensor unpacked as (out_channels, in_channels, kH, kW).
        bias: tensor that is reset to zero.
    """
    bias.data.zero_()
    weight.data.zero_()

    out_ch, in_ch, k_h, k_w = weight.shape
    center_y, center_x = k_h // 2, k_w // 2

    # Only the "diagonal" (output channel == input channel) gets a 1.
    for ch in range(min(out_ch, in_ch)):
        weight.data[ch, ch, center_y, center_x] = 1.0
def check_gradient_dconv():
    """Run ``gradcheck`` on ``dcn_v2_conv`` with random inputs and print the result.

    Tensor sizes come from the module-level settings (``N``, ``inC``,
    ``inH``, ``inW``, ``outC``, ``kH``, ``kW``, ``deformable_groups``).
    Stride, padding and dilation are all fixed to 1.
    """
    kernel_taps = kW * kH

    # Random draws happen in the order: input, offset, mask, weight, bias.
    x = (torch.rand(N, inC, inH, inW) * 0.01).requires_grad_(True)
    offsets = (torch.randn(N, deformable_groups * 2 * kernel_taps, inH, inW) * 2).requires_grad_(True)
    raw_mask = torch.rand(N, deformable_groups * 1 * kernel_taps, inH, inW).requires_grad_(True)
    # The op receives the sigmoid of the raw mask, not the raw leaf tensor.
    modulation = torch.sigmoid(raw_mask)
    kernel = torch.randn(outC, inC, kH, kW).requires_grad_(True)
    shift = torch.rand(outC).requires_grad_(True)

    # Positional tail: stride, padding, dilation, deformable_groups.
    conv_args = (x, offsets, modulation, kernel, shift, 1, 1, 1, deformable_groups)
    passed = gradcheck(dcn_v2_conv, conv_args, eps=1e-3, atol=1e-4, rtol=1e-2)
    print('check_gradient_dconv: ', passed)
def example_dconv():
    """Smoke-test ``DCN``: one forward pass, one backward pass, print the output shape."""
    features = torch.randn(2, 64, 128, 128)

    # DCN bundles the offset and mask prediction together with the
    # deformable convolution itself.
    layer = DCN(64, 64, kernel_size=(3, 3), stride=1, padding=1, deformable_groups=2)
    result = layer(features)

    # Random regression target with the same shape (and type) as the output.
    goal = result.new(*result.size())
    goal.data.uniform_(-0.01, 0.01)

    loss = (goal - result).mean()
    loss.backward()
    print(result.shape)
torch.randint(64, (20, 1)).float() rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) offset = torch.randn(20, 2, 7, 7) input.requires_grad = True offset.requires_grad = True # normal roi_align pooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=32, no_trans=True, group_size=1, trans_std=0.1) # deformable pooling dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=32, no_trans=False, group_size=1, trans_std=0.1) out = pooling(input, rois, offset) dout = dpooling(input, rois, offset) print(out.shape) print(dout.shape) target_out = out.new(*out.size()) target_out.data.uniform_(-0.01, 0.01) target_dout = dout.new(*dout.size()) target_dout.data.uniform_(-0.01, 0.01) e = (target_out - out).mean() e.backward() e = (target_dout - dout).mean() e.backward() def example_mdpooling(): input = torch.randn(2, 32, 64, 64) input.requires_grad = True batch_inds = torch.randint(2, (20, 1)).float() x = torch.randint(256, (20, 1)).float() y = torch.randint(256, (20, 1)).float() w = torch.randint(64, (20, 1)).float() h = torch.randint(64, (20, 1)).float() rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) # mdformable pooling (V2) dpooling = DCNPooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=32, no_trans=False, group_size=1, trans_std=0.1, deform_fc_dim=1024) dout = dpooling(input, rois) target = dout.new(*dout.size()) target.data.uniform_(-0.1, 0.1) error = (target - dout).mean() error.backward() print(dout.shape) if __name__ == '__main__': example_dconv() example_dpooling() example_mdpooling() check_pooling_zero_offset() # zero offset check if inC == outC: check_zero_offset() check_gradient_dpooling() check_gradient_dconv() # """ # ****** Note: backward is not reentrant error may not be a serious problem, # ****** since the max error is less than 1e-7, # ****** Still looking for what trigger this problem # """ ================================================ FILE: Network/rigidmask/networks/DCNv2/DCN/testcuda.py 
================================================ #!/usr/bin/env python from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn from torch.autograd import gradcheck from dcn_v2 import dcn_v2_conv, DCNv2, DCN from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling deformable_groups = 1 N, inC, inH, inW = 2, 2, 4, 4 outC = 2 kH, kW = 3, 3 def conv_identify(weight, bias): weight.data.zero_() bias.data.zero_() o, i, h, w = weight.shape y = h//2 x = w//2 for p in range(i): for q in range(o): if p == q: weight.data[q, p, y, x] = 1.0 def check_zero_offset(): conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, kernel_size=(kH, kW), stride=(1, 1), padding=(1, 1), bias=True).cuda() conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, kernel_size=(kH, kW), stride=(1, 1), padding=(1, 1), bias=True).cuda() dcn_v2 = DCNv2(inC, outC, (kH, kW), stride=1, padding=1, dilation=1, deformable_groups=deformable_groups).cuda() conv_offset.weight.data.zero_() conv_offset.bias.data.zero_() conv_mask.weight.data.zero_() conv_mask.bias.data.zero_() conv_identify(dcn_v2.weight, dcn_v2.bias) input = torch.randn(N, inC, inH, inW).cuda() offset = conv_offset(input) mask = conv_mask(input) mask = torch.sigmoid(mask) output = dcn_v2(input, offset, mask) output *= 2 d = (input - output).abs().max() if d < 1e-10: print('Zero offset passed') else: print('Zero offset failed') print(input) print(output) def check_gradient_dconv(): input = torch.rand(N, inC, inH, inW).cuda() * 0.01 input.requires_grad = True offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 # offset.data.zero_() # offset.data -= 0.5 offset.requires_grad = True mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda() # mask.data.zero_() mask.requires_grad = True mask = torch.sigmoid(mask) weight = torch.randn(outC, inC, kH, kW).cuda() weight.requires_grad = True bias = 
torch.rand(outC).cuda() bias.requires_grad = True stride = 1 padding = 1 dilation = 1 print('check_gradient_dconv: ', gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups), eps=1e-3, atol=1e-4, rtol=1e-2)) def check_pooling_zero_offset(): input = torch.randn(2, 16, 64, 64).cuda().zero_() input[0, :, 16:26, 16:26] = 1. input[1, :, 10:20, 20:30] = 2. rois = torch.tensor([ [0, 65, 65, 103, 103], [1, 81, 41, 119, 79], ]).cuda().float() pooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=16, no_trans=True, group_size=1, trans_std=0.0).cuda() out = pooling(input, rois, input.new()) s = ', '.join(['%f' % out[i, :, :, :].mean().item() for i in range(rois.shape[0])]) print(s) dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=16, no_trans=False, group_size=1, trans_std=0.0).cuda() offset = torch.randn(20, 2, 7, 7).cuda().zero_() dout = dpooling(input, rois, offset) s = ', '.join(['%f' % dout[i, :, :, :].mean().item() for i in range(rois.shape[0])]) print(s) def check_gradient_dpooling(): input = torch.randn(2, 3, 5, 5).cuda().float() * 0.01 N = 4 batch_inds = torch.randint(2, (N, 1)).cuda().float() x = torch.rand((N, 1)).cuda().float() * 15 y = torch.rand((N, 1)).cuda().float() * 15 w = torch.rand((N, 1)).cuda().float() * 10 h = torch.rand((N, 1)).cuda().float() * 10 rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) offset = torch.randn(N, 2, 3, 3).cuda() input.requires_grad = True offset.requires_grad = True spatial_scale = 1.0 / 4 pooled_size = 3 output_dim = 3 no_trans = 0 group_size = 1 trans_std = 0.0 sample_per_part = 4 part_size = pooled_size print('check_gradient_dpooling:', gradcheck(dcn_v2_pooling, (input, rois, offset, spatial_scale, pooled_size, output_dim, no_trans, group_size, part_size, sample_per_part, trans_std), eps=1e-4)) def example_dconv(): input = torch.randn(2, 64, 128, 128).cuda() # wrap all things (offset and mask) in DCN dcn = DCN(64, 64, 
kernel_size=(3, 3), stride=1, padding=1, deformable_groups=2).cuda() # print(dcn.weight.shape, input.shape) output = dcn(input) targert = output.new(*output.size()) targert.data.uniform_(-0.01, 0.01) error = (targert - output).mean() error.backward() print(output.shape) def example_dpooling(): input = torch.randn(2, 32, 64, 64).cuda() batch_inds = torch.randint(2, (20, 1)).cuda().float() x = torch.randint(256, (20, 1)).cuda().float() y = torch.randint(256, (20, 1)).cuda().float() w = torch.randint(64, (20, 1)).cuda().float() h = torch.randint(64, (20, 1)).cuda().float() rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) offset = torch.randn(20, 2, 7, 7).cuda() input.requires_grad = True offset.requires_grad = True # normal roi_align pooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=32, no_trans=True, group_size=1, trans_std=0.1).cuda() # deformable pooling dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=32, no_trans=False, group_size=1, trans_std=0.1).cuda() out = pooling(input, rois, offset) dout = dpooling(input, rois, offset) print(out.shape) print(dout.shape) target_out = out.new(*out.size()) target_out.data.uniform_(-0.01, 0.01) target_dout = dout.new(*dout.size()) target_dout.data.uniform_(-0.01, 0.01) e = (target_out - out).mean() e.backward() e = (target_dout - dout).mean() e.backward() def example_mdpooling(): input = torch.randn(2, 32, 64, 64).cuda() input.requires_grad = True batch_inds = torch.randint(2, (20, 1)).cuda().float() x = torch.randint(256, (20, 1)).cuda().float() y = torch.randint(256, (20, 1)).cuda().float() w = torch.randint(64, (20, 1)).cuda().float() h = torch.randint(64, (20, 1)).cuda().float() rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) # mdformable pooling (V2) dpooling = DCNPooling(spatial_scale=1.0 / 4, pooled_size=7, output_dim=32, no_trans=False, group_size=1, trans_std=0.1, deform_fc_dim=1024).cuda() dout = dpooling(input, rois) target = dout.new(*dout.size()) 
target.data.uniform_(-0.1, 0.1) error = (target - dout).mean() error.backward() print(dout.shape) if __name__ == '__main__': example_dconv() example_dpooling() example_mdpooling() check_pooling_zero_offset() # zero offset check if inC == outC: check_zero_offset() check_gradient_dpooling() check_gradient_dconv() # """ # ****** Note: backward is not reentrant error may not be a serious problem, # ****** since the max error is less than 1e-7, # ****** Still looking for what trigger this problem # """ ================================================ FILE: Network/rigidmask/networks/DCNv2/LICENSE ================================================ BSD 3-Clause License Copyright (c) 2019, Charles Shang All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: Network/rigidmask/networks/DCNv2/README.md ================================================ ## Deformable Convolutional Networks V2 with PyTorch 1.X ### Build ```bash ./make.sh # build python testcpu.py # run examples and gradient check on cpu python testcuda.py # run examples and gradient check on gpu ``` ### Note The master branch now targets PyTorch 1.x; to switch back to PyTorch 0.4, run: ```bash git checkout pytorch_0.4 ``` ### Known Issues: - [x] Gradient check w.r.t. offset (solved) - [ ] Backward is not reentrant (minor) This is an adaptation of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op). Update: all gradient checks pass with **double** precision. Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small (`<1e-7` for float, `<1e-15` for double), so it may not be a serious problem (?) Please post an issue or PR if you have any comments. 
def get_extensions():
    """Describe the ``_ext`` native extension for ``setup()``.

    C++ sources are collected from ``DCN/src`` and ``DCN/src/cpu`` next to
    this file. When ``torch.cuda.is_available()`` is true, the ``.cu``
    sources from ``DCN/src/cuda`` are added, ``WITH_CUDA`` is defined, and
    the extension is built with ``CUDAExtension``; otherwise a plain
    ``CppExtension`` is used.

    Returns:
        A one-element list holding the configured extension object.
    """
    src_root = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "DCN", "src")

    def _find(*pattern):
        # Glob relative to the extension source root.
        return glob.glob(os.path.join(src_root, *pattern))

    sources = _find("*.cpp") + _find("cpu", "*.cpp")
    cuda_sources = _find("cuda", "*.cu")

    build_ext_cls = CppExtension
    macros = []
    compile_args = {'cxx': ['-std=c++14']}

    if torch.cuda.is_available():
        build_ext_cls = CUDAExtension
        sources += cuda_sources
        macros += [("WITH_CUDA", None)]
        compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    # The globbed paths are already rooted at src_root; joining an absolute
    # path onto a prefix leaves it unchanged, so this is kept as a safeguard.
    sources = [os.path.join(src_root, path) for path in sources]

    return [
        build_ext_cls(
            "_ext",
            sources,
            include_dirs=[src_root],
            define_macros=macros,
            extra_compile_args=compile_args,
        )
    ]
def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
    """Return the download URL of a pretrained DLA checkpoint.

    The URL has the form
    ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.

    Args:
        data: Dataset sub-directory on the model server.
        name: Architecture name, e.g. ``'dla34'``.
        hash: Short checkpoint hash appended to the file name.

    Returns:
        The checkpoint URL as a string.
    """
    # URLs always use forward slashes. os.path.join uses the platform
    # separator (a backslash on Windows) and would produce a broken URL
    # there, so join with POSIX rules regardless of the host OS.
    import posixpath

    filename = '{}-{}.pth'.format(name, hash)
    return posixpath.join('http://dl.yf.io/dla/models', data, filename)
class BottleneckX(nn.Module):
    """Grouped-convolution bottleneck block (1x1 -> grouped 3x3 -> 1x1).

    The inner width is ``planes * cardinality // 32`` and the 3x3
    convolution uses ``cardinality`` groups. ``cardinality`` is read from
    the class attribute at construction time.
    """

    expansion = 2
    cardinality = 32

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super(BottleneckX, self).__init__()
        groups = BottleneckX.cardinality
        width = planes * groups // 32

        # 1x1 reduction to the inner width.
        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False)
        self.bn1 = BatchNorm(width)
        # Grouped 3x3; padding equals dilation, and stride is applied here.
        self.conv2 = nn.Conv2d(width, width, kernel_size=3,
                               stride=stride, padding=dilation,
                               bias=False, dilation=dilation,
                               groups=groups)
        self.bn2 = BatchNorm(width)
        # 1x1 expansion back to `planes` channels.
        self.conv3 = nn.Conv2d(width, planes, kernel_size=1, bias=False)
        self.bn3 = BatchNorm(planes)
        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

    def forward(self, x, residual=None):
        """Apply the block; ``residual`` defaults to the input ``x``."""
        shortcut = x if residual is None else residual

        y = self.relu(self.bn1(self.conv1(x)))
        y = self.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))

        # In-place add of the shortcut, followed by the final activation.
        y += shortcut
        return self.relu(y)
class Tree(nn.Module):
    """Recursive aggregation node built from two sub-trees and a ``Root``.

    With ``levels == 1`` both children are plain ``block`` instances and a
    ``Root`` module fuses their outputs. With ``levels > 1`` both children
    are ``Tree`` instances of depth ``levels - 1`` and no ``Root`` is
    created at this level; the second child receives the collected
    ``children`` feature list instead.
    """

    def __init__(self, levels, block, in_channels, out_channels, stride=1,
                 level_root=False, root_dim=0, root_kernel_size=1,
                 dilation=1, root_residual=False):
        """Build the two sub-trees and the optional root/downsample/project.

        Args:
            levels: Recursion depth; 1 creates the leaf blocks and a Root.
            block: Block class called as
                ``block(in_ch, out_ch, stride, dilation=...)``.
            in_channels: Channel count of the input.
            out_channels: Channel count produced by this node.
            stride: Stride handed to the first child; values > 1 also
                create a max-pool ``downsample`` module.
            level_root: When true, ``in_channels`` is added to ``root_dim``
                and ``forward`` appends the (downsampled) input to
                ``children``.
            root_dim: Input channel count for the Root; 0 means
                ``2 * out_channels``.
            root_kernel_size: Kernel size argument forwarded to ``Root``.
            dilation: Dilation forwarded to the blocks / sub-trees.
            root_residual: Residual flag forwarded to ``Root``.
        """
        super(Tree, self).__init__()
        # Default root width: the outputs of both children concatenated.
        if root_dim == 0:
            root_dim = 2 * out_channels
        if level_root:
            root_dim += in_channels
        if levels == 1:
            self.tree1 = block(in_channels, out_channels, stride,
                               dilation=dilation)
            self.tree2 = block(out_channels, out_channels, 1,
                               dilation=dilation)
        else:
            # The first sub-tree resets root_dim to its default; the second
            # one gets a root widened by out_channels.
            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
                              stride, root_dim=0,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual)
            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
                              root_dim=root_dim + out_channels,
                              root_kernel_size=root_kernel_size,
                              dilation=dilation, root_residual=root_residual)
        # Only leaf-level trees own a Root module.
        if levels == 1:
            self.root = Root(root_dim, out_channels, root_kernel_size,
                             root_residual)
        self.level_root = level_root
        self.root_dim = root_dim
        self.downsample = None
        self.project = None
        self.levels = levels
        if stride > 1:
            self.downsample = nn.MaxPool2d(stride, stride=stride)
        # 1x1 conv + norm that changes the channel count of the residual.
        if in_channels != out_channels:
            self.project = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=1, bias=False),
                BatchNorm(out_channels)
            )

    def forward(self, x, residual=None, children=None):
        """Run the node.

        Args:
            x: Input feature map.
            residual: Accepted for interface compatibility; the value passed
                in is overwritten below before it is used.
            children: Optional list of feature maps gathered by the caller.
                It is appended to in place when provided.

        Returns:
            The output of the Root (``levels == 1``) or of the second
            sub-tree (``levels > 1``).
        """
        children = [] if children is None else children
        bottom = self.downsample(x) if self.downsample else x
        # The residual is always recomputed from the (downsampled) input.
        residual = self.project(bottom) if self.project else bottom
        if self.level_root:
            children.append(bottom)
        x1 = self.tree1(x, residual)
        if self.levels == 1:
            x2 = self.tree2(x1)
            # Root receives the second child's output first, then the first
            # child's output, then every collected feature map.
            x = self.root(x2, x1, *children)
        else:
            children.append(x1)
            x = self.tree2(x1, children=children)
        return x
return_levels self.num_classes = num_classes self.base_layer = nn.Sequential( nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3, bias=False), BatchNorm(channels[0]), nn.ReLU(inplace=True)) self.level0 = self._make_conv_level( channels[0], channels[0], levels[0]) self.level1 = self._make_conv_level( channels[0], channels[1], levels[1], stride=2) self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, level_root=False, root_residual=residual_root) self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, level_root=True, root_residual=residual_root) self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, level_root=True, root_residual=residual_root) self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, level_root=True, root_residual=residual_root) self.avgpool = nn.AvgPool2d(pool_size) self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1, stride=1, padding=0, bias=True) for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. 
def dla34(pretrained, **kwargs):
    """Build a DLA-34 network.

    Args:
        pretrained: When truthy, load the ``imagenet`` ``dla34`` checkpoint
            (hash ``ba72cf86``) into the model.
        **kwargs: Extra keyword arguments forwarded to ``DLA``.

    Returns:
        The constructed ``DLA`` model.
    """
    stage_depths = [1, 1, 1, 2, 2, 1]
    stage_widths = [16, 32, 64, 128, 256, 512]
    net = DLA(stage_depths, stage_widths, block=BasicBlock, **kwargs)
    if not pretrained:
        return net
    net.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
    return net
def dla46x_c(pretrained=None, **kwargs):
    """Build a DLA-X-46-C network.

    Sets ``BottleneckX.expansion`` to 2 on the class before constructing
    the model.

    Args:
        pretrained: When not ``None``, passed as the first argument to
            ``load_pretrained_model`` together with the name ``'dla46x_c'``.
        **kwargs: Extra keyword arguments forwarded to ``DLA``.

    Returns:
        The constructed ``DLA`` model.
    """
    BottleneckX.expansion = 2
    stage_depths = [1, 1, 1, 2, 2, 1]
    stage_widths = [16, 32, 64, 64, 128, 256]
    net = DLA(stage_depths, stage_widths, block=BottleneckX, **kwargs)
    if pretrained is None:
        return net
    net.load_pretrained_model(pretrained, 'dla46x_c')
    return net
def set_bn(bn):
    """Rebind the module-level ``BatchNorm`` name to ``bn``.

    Only code that reads ``BatchNorm`` after this call sees the new class.

    The previous implementation also assigned ``dla.BatchNorm`` without any
    guard. No ``dla`` name is bound in the visible part of this module, in
    which case that assignment raises ``NameError``. The mirror write is
    kept, but only when a ``dla`` object actually exists in module globals.
    """
    global BatchNorm
    BatchNorm = bn

    dla_module = globals().get('dla')
    if dla_module is not None:
        dla_module.BatchNorm = bn
class DLASeg(nn.Module):
    """Backbone + ``DLAUp`` decoder with one convolutional head per output.

    ``heads`` maps a head name to its number of output channels. Each head
    is registered on the module under its own name. ``forward`` returns a
    one-element list holding a dict ``{head_name: head_output}``.
    """

    def __init__(self, base_name, heads, pretrained=True,
                 down_ratio=4, head_conv=256):
        super(DLASeg, self).__init__()
        assert down_ratio in [2, 4, 8, 16]
        self.heads = heads
        self.first_level = int(np.log2(down_ratio))

        # The backbone factory is resolved by name from this module's globals.
        backbone_factory = globals()[base_name]
        self.base = backbone_factory(pretrained=pretrained, return_levels=True)

        channels = self.base.channels
        num_up_levels = len(channels[self.first_level:])
        scales = [2 ** level for level in range(num_up_levels)]
        # A fresh slice is handed to DLAUp so the backbone's list is untouched.
        self.dla_up = DLAUp(channels[self.first_level:], scales=scales)

        for head in self.heads:
            classes = self.heads[head]
            if head_conv > 0:
                fc = nn.Sequential(
                    nn.Conv2d(channels[self.first_level], head_conv,
                              kernel_size=3, padding=1, bias=True),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(head_conv, classes,
                              kernel_size=1, stride=1,
                              padding=0, bias=True))
                output_conv = fc[-1]
            else:
                fc = nn.Conv2d(channels[self.first_level], classes,
                               kernel_size=1, stride=1,
                               padding=0, bias=True)
                output_conv = fc

            if 'hm' in head:
                # Heads whose name contains 'hm' get a constant output bias.
                output_conv.bias.data.fill_(-2.19)
            else:
                fill_fc_weights(fc)
            self.__setattr__(head, fc)

    def forward(self, x):
        levels = self.base(x)
        fused = self.dla_up(levels[self.first_level:])
        outputs = {head: self.__getattr__(head)(fused) for head in self.heads}
        return [outputs]
class residual(nn.Module):
    """Two 3x3 conv + batch-norm layers with a skip connection and final ReLU.

    The skip path is a 1x1 conv + batch norm when ``stride != 1`` or the
    channel count changes; otherwise it is an empty ``nn.Sequential``.
    ``k`` and ``with_bn`` are accepted but not used by this block.
    """

    def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
        super(residual, self).__init__()

        self.conv1 = nn.Conv2d(inp_dim, out_dim, (3, 3), padding=(1, 1),
                               stride=(stride, stride), bias=False)
        self.bn1 = nn.BatchNorm2d(out_dim)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(out_dim, out_dim, (3, 3), padding=(1, 1),
                               bias=False)
        self.bn2 = nn.BatchNorm2d(out_dim)

        needs_projection = stride != 1 or inp_dim != out_dim
        if needs_projection:
            shortcut = nn.Sequential(
                nn.Conv2d(inp_dim, out_dim, (1, 1),
                          stride=(stride, stride), bias=False),
                nn.BatchNorm2d(out_dim),
            )
        else:
            shortcut = nn.Sequential()
        self.skip = shortcut

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        main = self.relu1(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        shortcut = self.skip(x)
        return self.relu(main + shortcut)
class MergeUp(nn.Module):
    """Combine two inputs by adding them."""

    def forward(self, up1, up2):
        merged = up1 + up2
        return merged


def make_merge_layer(dim):
    """Return a new ``MergeUp``; ``dim`` is not used."""
    merge = MergeUp()
    return merge


def make_pool_layer(dim):
    """Return an empty ``nn.Sequential``; ``dim`` is not used."""
    passthrough = nn.Sequential()
    return passthrough


def make_unpool_layer(dim):
    """Return an ``nn.Upsample`` with ``scale_factor=2``; ``dim`` is not used."""
    upsample = nn.Upsample(scale_factor=2)
    return upsample


def make_kp_layer(cnv_dim, curr_dim, out_dim):
    """Return a 3x3 ``convolution`` (no batch norm) followed by a 1x1 conv."""
    hidden = convolution(3, cnv_dim, curr_dim, with_bn=False)
    projection = nn.Conv2d(curr_dim, out_dim, (1, 1))
    return nn.Sequential(hidden, projection)


def make_inter_layer(dim):
    """Return a ``residual`` block mapping ``dim`` channels to ``dim``."""
    block = residual(3, dim, dim)
    return block


def make_cnv_layer(inp_dim, out_dim):
    """Return a 3x3 ``convolution`` from ``inp_dim`` to ``out_dim`` channels."""
    layer = convolution(3, inp_dim, out_dim)
    return layer
class exkp(nn.Module):
    """Stack of ``nstack`` ``kp_module`` networks with per-head output layers.

    ``heads`` maps a head name to its output channel count. For every head
    an ``nn.ModuleList`` with one output layer per stack is registered under
    the head's name. ``forward`` returns a list with one
    ``{head_name: output}`` dict per stack.

    ``make_tl_layer``, ``make_br_layer`` and ``make_tag_layer`` are accepted
    but not used in this class.
    """

    def __init__(
        self, n, nstack, dims, modules, heads, pre=None, cnv_dim=256,
        make_tl_layer=None, make_br_layer=None,
        make_cnv_layer=make_cnv_layer, make_heat_layer=make_kp_layer,
        make_tag_layer=make_kp_layer, make_regr_layer=make_kp_layer,
        make_up_layer=make_layer, make_low_layer=make_layer,
        make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
        make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
        make_merge_layer=make_merge_layer, make_inter_layer=make_inter_layer,
        kp_layer=residual
    ):
        super(exkp, self).__init__()

        self.nstack = nstack
        self.heads = heads

        curr_dim = dims[0]

        # Default stem, built only when the caller did not supply one.
        if pre is None:
            pre = nn.Sequential(
                convolution(7, 3, 128, stride=2),
                residual(3, 128, 256, stride=2),
            )
        self.pre = pre

        hourglass_factories = dict(
            layer=kp_layer,
            make_up_layer=make_up_layer,
            make_low_layer=make_low_layer,
            make_hg_layer=make_hg_layer,
            make_hg_layer_revr=make_hg_layer_revr,
            make_pool_layer=make_pool_layer,
            make_unpool_layer=make_unpool_layer,
            make_merge_layer=make_merge_layer,
        )
        hourglasses = []
        for _ in range(nstack):
            hourglasses.append(
                kp_module(n, dims, modules, **hourglass_factories))
        self.kps = nn.ModuleList(hourglasses)

        self.cnvs = nn.ModuleList(
            [make_cnv_layer(curr_dim, cnv_dim) for _ in range(nstack)])

        # The layers below link consecutive stacks, hence ``nstack - 1``.
        self.inters = nn.ModuleList(
            [make_inter_layer(curr_dim) for _ in range(nstack - 1)])

        def pointwise_bn(in_dim):
            # 1x1 conv to ``curr_dim`` channels followed by batch norm.
            return nn.Sequential(
                nn.Conv2d(in_dim, curr_dim, (1, 1), bias=False),
                nn.BatchNorm2d(curr_dim),
            )

        self.inters_ = nn.ModuleList(
            [pointwise_bn(curr_dim) for _ in range(nstack - 1)])
        self.cnvs_ = nn.ModuleList(
            [pointwise_bn(cnv_dim) for _ in range(nstack - 1)])

        for head, out_dim in heads.items():
            is_heatmap = 'hm' in head
            make_out_layer = make_heat_layer if is_heatmap else make_regr_layer
            per_stack = nn.ModuleList(
                [make_out_layer(cnv_dim, curr_dim, out_dim)
                 for _ in range(nstack)])
            self.__setattr__(head, per_stack)
            if is_heatmap:
                # Heads whose name contains 'hm' get a constant output bias.
                for heat in per_stack:
                    heat[-1].bias.data.fill_(-2.19)

        self.relu = nn.ReLU(inplace=True)

    def forward(self, image):
        inter = self.pre(image)
        outs = []
        for ind in range(self.nstack):
            cnv = self.cnvs[ind](self.kps[ind](inter))
            outs.append(
                {head: self.__getattr__(head)[ind](cnv)
                 for head in self.heads})

            if ind >= self.nstack - 1:
                continue
            # Mix the stack input with its features to feed the next stack.
            inter = self.relu(self.inters_[ind](inter) + self.cnvs_[ind](cnv))
            inter = self.inters[ind](inter)
        return outs
class BasicBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a shortcut and final ReLU.

    ``downsample`` (optional) is applied to the input to produce the
    shortcut; without it the input itself is added to the main branch.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Only the first convolution carries the stride.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        shortcut = x if self.downsample is None else self.downsample(x)

        main += shortcut
        return self.relu(main)
class PoseResNet(nn.Module):
    """ResNet trunk, three transposed-conv stages and one conv head per output.

    ``heads`` maps a head name to its number of output channels; each head
    is registered on the module under its own name. ``forward`` returns a
    one-element list holding a dict ``{head_name: head_output}``.
    """

    def __init__(self, block, layers, heads, head_conv, **kwargs):
        super(PoseResNet, self).__init__()
        self.inplanes = 64
        self.deconv_with_bias = False
        self.heads = heads

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # used for deconv layers
        self.deconv_layers = self._make_deconv_layer(
            3,
            [256, 256, 256],
            [4, 4, 4],
        )

        # Heads are created in sorted-name order.
        for head in sorted(self.heads):
            num_output = self.heads[head]
            if head_conv > 0:
                fc = nn.Sequential(
                    nn.Conv2d(256, head_conv,
                              kernel_size=3, padding=1, bias=True),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(head_conv, num_output,
                              kernel_size=1, stride=1, padding=0))
            else:
                fc = nn.Conv2d(
                    in_channels=256,
                    out_channels=num_output,
                    kernel_size=1,
                    stride=1,
                    padding=0
                )
            setattr(self, head, fc)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of ``blocks`` blocks and update ``self.inplanes``."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _get_deconv_cfg(self, deconv_kernel, index):
        """Return ``(kernel, padding, output_padding)`` for a deconv kernel size.

        Supported kernel sizes are 4, 3 and 2. ``index`` is not used.

        Raises:
            ValueError: for any other kernel size. Previously this fell
                through and failed with an unbound local variable.
        """
        if deconv_kernel == 4:
            padding = 1
            output_padding = 0
        elif deconv_kernel == 3:
            padding = 1
            output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError(
                'unsupported deconv kernel size: {!r} '
                '(expected 2, 3 or 4)'.format(deconv_kernel))

        return deconv_kernel, padding, output_padding

    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        """Build ``num_layers`` stride-2 ConvTranspose2d + BatchNorm + ReLU stages.

        Updates ``self.inplanes`` to the filter count of the last stage.
        """
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
        # The message used to name the filters list here as well.
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i], i)

            planes = num_filters[i]
            layers.append(
                nn.ConvTranspose2d(
                    in_channels=self.inplanes,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=2,
                    padding=padding,
                    output_padding=output_padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            self.inplanes = planes

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.deconv_layers(x)
        ret = {head: getattr(self, head)(x) for head in self.heads}
        return [ret]

    def init_weights(self, num_layers, pretrained=True):
        """Initialise deconv and head layers, then load ResNet weights.

        The checkpoint URL comes from ``model_urls['resnet<num_layers>']``
        and is loaded with ``strict=False``.

        Raises:
            ValueError: when ``pretrained`` is falsy.
        """
        if not pretrained:
            print('=> imagenet pretrained model dose not exist')
            print('=> please download it first')
            raise ValueError('imagenet pretrained model does not exist')

        for m in self.deconv_layers.modules():
            if isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight, std=0.001)
                if self.deconv_with_bias:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        for head in self.heads:
            final_layer = getattr(self, head)
            for m in final_layer.modules():
                if not isinstance(m, nn.Conv2d):
                    continue
                # Only convs whose output channels match the head size
                # (the head's output layer) are re-initialised.
                if m.weight.shape[0] == self.heads[head]:
                    if 'hm' in head:
                        nn.init.constant_(m.bias, -2.19)
                    else:
                        nn.init.normal_(m.weight, std=0.001)
                        nn.init.constant_(m.bias, 0)

        url = model_urls['resnet{}'.format(num_layers)]
        pretrained_state_dict = model_zoo.load_url(url)
        print('=> loading pretrained model {}'.format(url))
        self.load_state_dict(pretrained_state_dict, strict=False)
def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
    """Return the download URL ``<root>/<data>/<name>-<hash>.pth``.

    The parts are joined with ``posixpath.join`` so the separator is always
    ``/``. The platform-dependent ``os.path.join`` would insert backslashes
    on Windows and yield an invalid URL.
    """
    import posixpath

    filename = '{}-{}.pth'.format(name, hash)
    return posixpath.join('http://dl.yf.io/dla/models', data, filename)
class Root(nn.Module):
    """Fuse several inputs: concatenate on dim 1, then conv, batch norm, ReLU.

    The convolution always uses a 1x1 kernel; ``kernel_size`` only feeds the
    padding, computed as ``(kernel_size - 1) // 2``. When ``residual`` is
    truthy the first input is added to the normalised output before the ReLU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, residual):
        super(Root, self).__init__()
        pad = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(in_channels, out_channels, 1,
                              stride=1, bias=False, padding=pad)
        self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.residual = residual

    def forward(self, *x):
        inputs = x
        fused = self.bn(self.conv(torch.cat(inputs, 1)))
        if self.residual:
            fused += inputs[0]
        return self.relu(fused)
def forward(self, x, residual=None, children=None):
    """Apply this tree node to ``x``.

    The incoming ``residual`` value is not read; it is recomputed from the
    (optionally downsampled and projected) input. ``children`` collects
    intermediate outputs for the root and is appended to in place when the
    caller passes a list.
    """
    if children is None:
        children = []

    # Truthiness (not ``is None``) decides whether each module is applied.
    if self.downsample:
        bottom = self.downsample(x)
    else:
        bottom = x
    if self.project:
        residual = self.project(bottom)
    else:
        residual = bottom

    if self.level_root:
        children.append(bottom)

    first = self.tree1(x, residual)

    if self.levels != 1:
        # Deeper trees hand the collected children down to the second subtree.
        children.append(first)
        return self.tree2(first, children=children)

    second = self.tree2(first)
    return self.root(second, first, *children)
def forward(self, x):
    """Run the base layer, then ``level0`` .. ``level5``; return all six outputs.

    The returned list holds the output of each level in order.
    """
    current = self.base_layer(x)
    collected = []
    for level in range(6):
        # Each level is fetched by attribute name right before it is applied.
        current = getattr(self, 'level{}'.format(level))(current)
        collected.append(current)
    return collected
def fill_up_weights(up):
    """Overwrite ``up.weight`` in place with a separable tent-shaped kernel.

    The kernel for ``[0, 0]`` is the product of a 1-D profile evaluated at
    the row and column index; it is then copied to ``[c, 0]`` for every
    other index ``c`` along dim 0. Entries with a dim-1 index other than 0
    are left untouched.
    """
    weight = up.weight.data
    rows, cols = weight.size(2), weight.size(3)
    half = math.ceil(rows / 2)
    center = (2 * half - 1 - half % 2) / (2. * half)

    def tent(pos):
        # 1-D profile: 1 minus the distance of the scaled position from center.
        return 1 - math.fabs(pos / half - center)

    for row in range(rows):
        for col in range(cols):
            weight[0, 0, row, col] = tent(row) * tent(col)

    template = weight[0, 0, :, :]
    for channel in range(1, weight.size(0)):
        weight[channel, 0, :, :] = template
class DLAUp(nn.Module):
    """Stack of ``IDAUp`` stages registered as ``ida_0``, ``ida_1``, ...

    Args:
        startp: index of the first pyramid level handled by ``forward``.
        channels: per-level channel counts (one entry per level).
        scales: per-level scale factors, same length as ``channels``.
        in_channels: optional per-level input channel counts; defaults to a
            copy of ``channels``.
    """

    def __init__(self, startp, channels, scales, in_channels=None):
        super(DLAUp, self).__init__()
        self.startp = startp
        self.channels = channels
        # Work on private copies: the loop below rewrites ``in_channels`` in
        # place, and without the copy that rewrite leaked into the caller's
        # list (and into ``self.channels`` when ``in_channels`` was omitted).
        in_channels = list(channels if in_channels is None else in_channels)
        channels = list(channels)
        scales = np.array(scales, dtype=int)
        for i in range(len(channels) - 1):
            j = -i - 2
            setattr(self, 'ida_{}'.format(i),
                    IDAUp(channels[j], in_channels[j:],
                          scales[j:] // scales[j]))
            # Every level after ``j`` now has level j's scale and width.
            scales[j + 1:] = scales[j]
            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]

    def forward(self, layers):
        """Run the stages in order and collect ``layers[-1]`` after each one.

        ``layers`` is updated in place by the ``IDAUp`` stages. The returned
        list starts with the result of the last stage and ends with the
        original ``layers[-1]``.
        """
        out = [layers[-1]]  # start with 32
        n_layers = len(layers)
        for i in range(n_layers - self.startp - 1):
            ida = getattr(self, 'ida_{}'.format(i))
            ida(layers, n_layers - i - 2, n_layers)
            out.insert(0, layers[-1])
        return out


class Interpolate(nn.Module):
    """Module wrapper around ``F.interpolate`` with a fixed scale and mode."""

    # Modes for which ``F.interpolate`` accepts an ``align_corners`` argument.
    _ALIGNABLE_MODES = ('linear', 'bilinear', 'bicubic', 'trilinear')

    def __init__(self, scale, mode):
        super(Interpolate, self).__init__()
        self.scale = scale
        self.mode = mode

    def forward(self, x):
        if self.mode in self._ALIGNABLE_MODES:
            return F.interpolate(x, scale_factor=self.scale, mode=self.mode,
                                 align_corners=False)
        # Passing ``align_corners`` with e.g. 'nearest' or 'area' raises a
        # ValueError, so it is omitted for those modes.
        return F.interpolate(x, scale_factor=self.scale, mode=self.mode)
def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4, num_input=14):
    """Build a ``DLASeg`` network on the ``dla<num_layers>`` backbone.

    The backbone is created without pretrained weights; the head kernel size
    is fixed to 1 and the last aggregated level to 5.
    """
    backbone_name = 'dla{}'.format(num_layers)
    return DLASeg(
        backbone_name,
        heads,
        pretrained=False,
        down_ratio=down_ratio,
        final_kernel=1,
        last_level=5,
        head_conv=head_conv,
        num_input=num_input,
    )
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1."""
    layer = nn.Conv2d(in_planes, out_planes, 3, stride, 1, bias=False)
    return layer


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + BatchNorm stages.

    ``downsample``, when given, is applied to the block input before it is
    added to the output of the second BatchNorm.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Submodules are registered in the same order as before so that
        # parameter initialisation and state-dict ordering do not change.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
class PoseResNet(nn.Module):
    """ResNet trunk, three DCN + transposed-conv upsampling stages, and one
    convolutional head per entry of ``heads``.

    Args:
        block: residual block class (must expose ``expansion``).
        layers: number of blocks in each of the four residual stages.
        heads: mapping ``head name -> number of output channels``; each head
            is registered as an attribute under its name.
        head_conv: width of the intermediate 3x3 conv in each head; a value
            ``<= 0`` makes each head a single 1x1 conv.
    """

    def __init__(self, block, layers, heads, head_conv):
        super(PoseResNet, self).__init__()
        self.inplanes = 64
        self.heads = heads
        self.deconv_with_bias = False

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # used for deconv layers
        self.deconv_layers = self._make_deconv_layer(
            3,
            [256, 128, 64],
            [4, 4, 4],
        )

        for head in self.heads:
            classes = self.heads[head]
            if head_conv > 0:
                fc = nn.Sequential(
                    nn.Conv2d(64, head_conv,
                              kernel_size=3, padding=1, bias=True),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(head_conv, classes,
                              kernel_size=1, stride=1,
                              padding=0, bias=True))
                out_conv = fc[-1]
            else:
                fc = nn.Conv2d(64, classes,
                               kernel_size=1, stride=1,
                               padding=0, bias=True)
                out_conv = fc
            if 'hm' in head:
                # Heads whose name contains 'hm' only get a constant bias on
                # their output conv; all others go through fill_fc_weights.
                out_conv.bias.data.fill_(-2.19)
            else:
                fill_fc_weights(fc)
            setattr(self, head, fc)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of ``blocks`` blocks and update ``self.inplanes``."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection so the shortcut matches the block output.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _get_deconv_cfg(self, deconv_kernel, index):
        """Return ``(kernel, padding, output_padding)`` for a transposed conv.

        ``index`` is accepted for interface compatibility and is not used.

        Raises:
            ValueError: if ``deconv_kernel`` is not 2, 3 or 4 (previously this
                surfaced as an ``UnboundLocalError``).
        """
        if deconv_kernel == 4:
            padding, output_padding = 1, 0
        elif deconv_kernel == 3:
            padding, output_padding = 1, 1
        elif deconv_kernel == 2:
            padding, output_padding = 0, 0
        else:
            raise ValueError(
                'unsupported deconv kernel size: {}'.format(deconv_kernel))

        return deconv_kernel, padding, output_padding

    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        """Build ``num_layers`` stages of DCN -> BN -> ReLU -> deconv -> BN -> ReLU."""
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i], i)

            planes = num_filters[i]
            fc = DCN(self.inplanes, planes,
                     kernel_size=(3, 3), stride=1,
                     padding=1, dilation=1, deformable_groups=1)
            up = nn.ConvTranspose2d(
                in_channels=planes,
                out_channels=planes,
                kernel_size=kernel,
                stride=2,
                padding=padding,
                output_padding=output_padding,
                bias=self.deconv_with_bias)
            fill_up_weights(up)

            layers.append(fc)
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            layers.append(up)
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            self.inplanes = planes

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return a one-element list holding ``{head name: head output}``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.deconv_layers(x)
        ret = {}
        for head in self.heads:
            ret[head] = getattr(self, head)(x)
        return [ret]

    def init_weights(self, num_layers):
        """Load the torchvision ResNet checkpoint and reset deconv BatchNorms.

        The checkpoint is loaded with ``strict=False``, so keys that do not
        match this model are ignored. Afterwards every BatchNorm inside
        ``deconv_layers`` is set to weight 1 / bias 0.
        """
        url = model_urls['resnet{}'.format(num_layers)]
        pretrained_state_dict = model_zoo.load_url(url)
        print('=> loading pretrained model {}'.format(url))
        self.load_state_dict(pretrained_state_dict, strict=False)
        # NOTE(review): the message below mentions a normal distribution, but
        # the loop only applies constant initialisation to BatchNorm layers.
        print('=> init deconv weights from normal distribution')
        for m in self.deconv_layers.modules():
            if isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Return ``Conv2d`` (with bias) -> ``BatchNorm2d`` -> ``LeakyReLU(0.1)``."""
    return nn.Sequential(
        nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size,
                  stride=stride, padding=padding, dilation=dilation,
                  bias=True),
        nn.BatchNorm2d(out_planes),
        nn.LeakyReLU(0.1, inplace=True))


class conv2DBatchNorm(nn.Module):
    """``Conv2d`` optionally followed by ``BatchNorm2d`` (stored in ``cb_unit``).

    The convolution has a bias only when ``with_bn`` is False. A ``dilation``
    of 1 or less is treated as 1.
    """

    def __init__(self, in_channels, n_filters, k_size, stride, padding,
                 dilation=1, with_bn=True):
        super(conv2DBatchNorm, self).__init__()
        conv_mod = nn.Conv2d(int(in_channels), int(n_filters),
                             kernel_size=k_size, padding=padding,
                             stride=stride, bias=not with_bn,
                             dilation=dilation if dilation > 1 else 1)

        if with_bn:
            self.cb_unit = nn.Sequential(conv_mod,
                                         nn.BatchNorm2d(int(n_filters)),)
        else:
            self.cb_unit = nn.Sequential(conv_mod,)

    def forward(self, inputs):
        return self.cb_unit(inputs)


class conv2DBatchNormRelu(nn.Module):
    """``Conv2d`` -> optional ``BatchNorm2d`` -> ``LeakyReLU(0.1)`` (``cbr_unit``).

    The convolution has a bias only when ``with_bn`` is False. A ``dilation``
    of 1 or less is treated as 1.
    """

    def __init__(self, in_channels, n_filters, k_size, stride, padding,
                 dilation=1, with_bn=True):
        super(conv2DBatchNormRelu, self).__init__()
        conv_mod = nn.Conv2d(int(in_channels), int(n_filters),
                             kernel_size=k_size, padding=padding,
                             stride=stride, bias=not with_bn,
                             dilation=dilation if dilation > 1 else 1)

        if with_bn:
            self.cbr_unit = nn.Sequential(conv_mod,
                                          nn.BatchNorm2d(int(n_filters)),
                                          nn.LeakyReLU(0.1, inplace=True),)
        else:
            self.cbr_unit = nn.Sequential(conv_mod,
                                          nn.LeakyReLU(0.1, inplace=True),)

    def forward(self, inputs):
        return self.cbr_unit(inputs)


class pyramidPooling(nn.Module):
    """Average-pool the input at ``levels`` window sizes, pass each pooled map
    through its own 1x1 conv unit, resize back and blend with the input.

    The output has the same shape as the input.
    """

    def __init__(self, in_channels, with_bn=True, levels=4):
        super(pyramidPooling, self).__init__()
        self.levels = levels

        # ``paths`` is kept as a plain list attribute for compatibility; the
        # ModuleList below is what registers the parameters.
        self.paths = []
        for _ in range(levels):
            self.paths.append(conv2DBatchNormRelu(in_channels, in_channels,
                                                  1, 1, 0, with_bn=with_bn))
        self.path_module_list = nn.ModuleList(self.paths)
        self.relu = nn.LeakyReLU(0.1, inplace=True)

    def forward(self, x):
        h, w = x.shape[2:]

        windows = []
        for pool_size in np.linspace(1, min(h, w) // 2, self.levels, dtype=int):
            # When min(h, w) < 2 the linspace reaches 0, which used to crash
            # on the division below; clamp so such levels pool globally.
            pool_size = max(int(pool_size), 1)
            windows.append((h // pool_size, w // pool_size))
        windows.reverse()

        pp_sum = x
        for window, module in zip(windows, self.path_module_list):
            # Kernel and stride are equal, i.e. non-overlapping windows.
            out = F.avg_pool2d(x, window, stride=window, padding=0)
            out = module(out)
            # F.upsample is deprecated; interpolate with align_corners=False
            # is what it resolved to for bilinear mode.
            out = F.interpolate(out, size=(h, w), mode='bilinear',
                                align_corners=False)
            pp_sum = pp_sum + 1. / self.levels * out
        pp_sum = self.relu(pp_sum / 2.)

        return pp_sum
https://github.com/meetshah1995/pytorch-semseg/blob/master/ptsemseg/models/pspnet.py """ def __init__(self, is_proj=True,groups=1): super(pspnet, self).__init__() self.inplanes = 32 self.is_proj = is_proj # Encoder self.convbnrelu1_1 = conv2DBatchNormRelu(in_channels=3, k_size=3, n_filters=16, padding=1, stride=2) self.convbnrelu1_2 = conv2DBatchNormRelu(in_channels=16, k_size=3, n_filters=16, padding=1, stride=1) self.convbnrelu1_3 = conv2DBatchNormRelu(in_channels=16, k_size=3, n_filters=32, padding=1, stride=1) # Vanilla Residual Blocks self.res_block3 = self._make_layer(residualBlock,64,1,stride=2) self.res_block5 = self._make_layer(residualBlock,128,1,stride=2) self.res_block6 = self._make_layer(residualBlock,128,1,stride=2) self.res_block7 = self._make_layer(residualBlock,128,1,stride=2) self.pyramid_pooling = pyramidPooling(128, levels=3) # Iconvs self.upconv6 = nn.Sequential(nn.Upsample(scale_factor=2), conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64, padding=1, stride=1)) self.iconv5 = conv2DBatchNormRelu(in_channels=192, k_size=3, n_filters=128, padding=1, stride=1) self.upconv5 = nn.Sequential(nn.Upsample(scale_factor=2), conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64, padding=1, stride=1)) self.iconv4 = conv2DBatchNormRelu(in_channels=192, k_size=3, n_filters=128, padding=1, stride=1) self.upconv4 = nn.Sequential(nn.Upsample(scale_factor=2), conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64, padding=1, stride=1)) self.iconv3 = conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64, padding=1, stride=1) self.upconv3 = nn.Sequential(nn.Upsample(scale_factor=2), conv2DBatchNormRelu(in_channels=64, k_size=3, n_filters=32, padding=1, stride=1)) self.iconv2 = conv2DBatchNormRelu(in_channels=64, k_size=3, n_filters=64, padding=1, stride=1) if self.is_proj: self.proj6 = conv2DBatchNormRelu(in_channels=128,k_size=1,n_filters=128//groups, padding=0,stride=1) self.proj5 = 
conv2DBatchNormRelu(in_channels=128,k_size=1,n_filters=128//groups, padding=0,stride=1) self.proj4 = conv2DBatchNormRelu(in_channels=128,k_size=1,n_filters=128//groups, padding=0,stride=1) self.proj3 = conv2DBatchNormRelu(in_channels=64, k_size=1,n_filters=64//groups, padding=0,stride=1) self.proj2 = conv2DBatchNormRelu(in_channels=64, k_size=1,n_filters=64//groups, padding=0,stride=1) for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) if hasattr(m.bias,'data'): m.bias.data.zero_() def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion),) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): # H, W -> H/2, W/2 conv1 = self.convbnrelu1_1(x) conv1 = self.convbnrelu1_2(conv1) conv1 = self.convbnrelu1_3(conv1) ## H/2, W/2 -> H/4, W/4 pool1 = F.max_pool2d(conv1, 3, 2, 1) # H/4, W/4 -> H/16, W/16 rconv3 = self.res_block3(pool1) conv4 = self.res_block5(rconv3) conv5 = self.res_block6(conv4) conv6 = self.res_block7(conv5) conv6 = self.pyramid_pooling(conv6) conv6x = F.upsample(conv6, [conv5.size()[2],conv5.size()[3]],mode='bilinear') concat5 = torch.cat((conv5,self.upconv6[1](conv6x)),dim=1) conv5 = self.iconv5(concat5) conv5x = F.upsample(conv5, [conv4.size()[2],conv4.size()[3]],mode='bilinear') concat4 = torch.cat((conv4,self.upconv5[1](conv5x)),dim=1) conv4 = self.iconv4(concat4) conv4x = F.upsample(conv4, [rconv3.size()[2],rconv3.size()[3]],mode='bilinear') concat3 = torch.cat((rconv3,self.upconv4[1](conv4x)),dim=1) conv3 = self.iconv3(concat3) 
class pspnet_s(nn.Module):
    """
    Modified PSPNet (variant without the finest decoder level).
    https://github.com/meetshah1995/pytorch-semseg/blob/master/ptsemseg/models/pspnet.py

    Args:
        is_proj: when True, ``forward`` returns the outputs of the 1x1
            ``proj*`` units instead of the raw decoder features.
        groups: divisor applied to the output width of the ``proj*`` units.
    """

    def __init__(self, is_proj=True, groups=1):
        super(pspnet_s, self).__init__()
        self.inplanes = 32
        self.is_proj = is_proj

        # Encoder
        self.convbnrelu1_1 = conv2DBatchNormRelu(in_channels=3, k_size=3, n_filters=16,
                                                 padding=1, stride=2)
        self.convbnrelu1_2 = conv2DBatchNormRelu(in_channels=16, k_size=3, n_filters=16,
                                                 padding=1, stride=1)
        self.convbnrelu1_3 = conv2DBatchNormRelu(in_channels=16, k_size=3, n_filters=32,
                                                 padding=1, stride=1)
        # Vanilla Residual Blocks
        self.res_block3 = self._make_layer(residualBlock, 64, 1, stride=2)
        self.res_block5 = self._make_layer(residualBlock, 128, 1, stride=2)
        self.res_block6 = self._make_layer(residualBlock, 128, 1, stride=2)
        self.res_block7 = self._make_layer(residualBlock, 128, 1, stride=2)
        self.pyramid_pooling = pyramidPooling(128, levels=3)

        # Iconvs
        # The upconv* units stay wrapped in a Sequential with nn.Upsample so
        # parameter names (``upconvN.1.*``) are unchanged; forward() only
        # calls element [1] and resizes explicitly to the skip tensor's size.
        self.upconv6 = nn.Sequential(nn.Upsample(scale_factor=2),
                                     conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64,
                                                         padding=1, stride=1))
        self.iconv5 = conv2DBatchNormRelu(in_channels=192, k_size=3, n_filters=128,
                                          padding=1, stride=1)
        self.upconv5 = nn.Sequential(nn.Upsample(scale_factor=2),
                                     conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64,
                                                         padding=1, stride=1))
        self.iconv4 = conv2DBatchNormRelu(in_channels=192, k_size=3, n_filters=128,
                                          padding=1, stride=1)
        self.upconv4 = nn.Sequential(nn.Upsample(scale_factor=2),
                                     conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64,
                                                         padding=1, stride=1))
        self.iconv3 = conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64,
                                          padding=1, stride=1)

        if self.is_proj:
            self.proj6 = conv2DBatchNormRelu(in_channels=128, k_size=1, n_filters=128 // groups,
                                             padding=0, stride=1)
            self.proj5 = conv2DBatchNormRelu(in_channels=128, k_size=1, n_filters=128 // groups,
                                             padding=0, stride=1)
            self.proj4 = conv2DBatchNormRelu(in_channels=128, k_size=1, n_filters=128 // groups,
                                             padding=0, stride=1)
            self.proj3 = conv2DBatchNormRelu(in_channels=64, k_size=1, n_filters=64 // groups,
                                             padding=0, stride=1)

        # Re-initialise every conv: normal(0, sqrt(2 / (kh * kw * out_ch))).
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of ``blocks`` blocks and update ``self.inplanes``."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),)
        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    @staticmethod
    def _resize_like(feat, ref):
        """Bilinearly resize ``feat`` to the spatial size of ``ref``."""
        # Replaces the deprecated F.upsample; align_corners=False is what
        # F.upsample resolved to for bilinear mode.
        return F.interpolate(feat, size=[ref.size()[2], ref.size()[3]],
                             mode='bilinear', align_corners=False)

    def forward(self, x):
        """Return four feature maps, coarsest first.

        With ``is_proj`` the tuple is ``(proj6, proj5, proj4, proj3)``,
        otherwise ``(conv6, conv5, conv4, conv3)``.
        """
        # Stem: one stride-2 conv, two stride-1 convs, then a stride-2 max pool.
        conv1 = self.convbnrelu1_1(x)
        conv1 = self.convbnrelu1_2(conv1)
        conv1 = self.convbnrelu1_3(conv1)
        pool1 = F.max_pool2d(conv1, 3, 2, 1)

        # Four residual stages, each built with stride 2.
        rconv3 = self.res_block3(pool1)
        conv4 = self.res_block5(rconv3)
        conv5 = self.res_block6(conv4)
        conv6 = self.res_block7(conv5)
        conv6 = self.pyramid_pooling(conv6)

        # Decoder: resize to the skip tensor, refine, concatenate, fuse.
        conv6x = self._resize_like(conv6, conv5)
        concat5 = torch.cat((conv5, self.upconv6[1](conv6x)), dim=1)
        conv5 = self.iconv5(concat5)

        conv5x = self._resize_like(conv5, conv4)
        concat4 = torch.cat((conv4, self.upconv5[1](conv5x)), dim=1)
        conv4 = self.iconv4(concat4)

        conv4x = self._resize_like(conv4, rconv3)
        concat3 = torch.cat((rconv3, self.upconv4[1](conv4x)), dim=1)
        conv3 = self.iconv3(concat3)

        if self.is_proj:
            proj6 = self.proj6(conv6)
            proj5 = self.proj5(conv5)
            proj4 = self.proj4(conv4)
            proj3 = self.proj3(conv3)
            return proj6, proj5, proj4, proj3
        return conv6, conv5, conv4, conv3
padding=1, stride=1) self.iconv3 = conv2DBatchNormRelu(in_channels=96, k_size=3, n_filters=64, padding=1, stride=1) self.iconv2 = nn.Sequential(conv2DBatchNormRelu(in_channels=96, k_size=3, n_filters=64, padding=1, stride=1), nn.Conv2d(64, outplanes,kernel_size=3, stride=1, padding=1, bias=True)) self.proj6 = nn.Conv2d(128, outplanes,kernel_size=3, stride=1, padding=1, bias=True) self.proj5 = nn.Conv2d(64, outplanes,kernel_size=3, stride=1, padding=1, bias=True) self.proj4 = nn.Conv2d(64, outplanes,kernel_size=3, stride=1, padding=1, bias=True) self.proj3 = nn.Conv2d(64, outplanes,kernel_size=3, stride=1, padding=1, bias=True) for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) if hasattr(m.bias,'data'): m.bias.data.zero_() def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion),) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): proj = self.proj(x) # 4x rconv3 = self.res_block3(proj) #8x conv4 = self.res_block5(rconv3) #16x conv5 = self.res_block6(conv4) #32x conv6 = self.res_block7(conv5) #64x conv6 = self.pyramid_pooling(conv6) #64x pred6 = self.proj6(conv6) conv6u = F.upsample(conv6, [conv5.size()[2],conv5.size()[3]], mode='bilinear') concat5 = torch.cat((conv5,self.upconv6(conv6u)),dim=1) conv5 = self.iconv5(concat5) #32x pred5 = self.proj5(conv5) conv5u = F.upsample(conv5, [conv4.size()[2],conv4.size()[3]], mode='bilinear') concat4 = torch.cat((conv4,self.upconv5(conv5u)),dim=1) conv4 = self.iconv4(concat4) #16x pred4 = 
class bfmodule_feat(nn.Module):
    """Encoder-decoder that returns a prediction and the feature map it was
    computed from, both at the input's spatial size.

    Args:
        inplanes: number of input channels.
        outplanes: number of channels of the prediction.
    """

    def __init__(self, inplanes, outplanes):
        super(bfmodule_feat, self).__init__()
        self.proj = conv2DBatchNormRelu(in_channels=inplanes, k_size=1, n_filters=64,
                                        padding=0, stride=1)
        self.inplanes = 64
        # Vanilla Residual Blocks
        self.res_block3 = self._make_layer(residualBlock, 64, 1, stride=2)
        self.res_block5 = self._make_layer(residualBlock, 64, 1, stride=2)
        self.res_block6 = self._make_layer(residualBlock, 64, 1, stride=2)
        self.res_block7 = self._make_layer(residualBlock, 128, 1, stride=2)
        self.pyramid_pooling = pyramidPooling(128, levels=3)
        # Iconvs
        self.upconv6 = conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64,
                                           padding=1, stride=1)
        self.upconv5 = conv2DBatchNormRelu(in_channels=64, k_size=3, n_filters=32,
                                           padding=1, stride=1)
        self.upconv4 = conv2DBatchNormRelu(in_channels=64, k_size=3, n_filters=32,
                                           padding=1, stride=1)
        self.upconv3 = conv2DBatchNormRelu(in_channels=64, k_size=3, n_filters=32,
                                           padding=1, stride=1)
        self.iconv5 = conv2DBatchNormRelu(in_channels=128, k_size=3, n_filters=64,
                                          padding=1, stride=1)
        self.iconv4 = conv2DBatchNormRelu(in_channels=96, k_size=3, n_filters=64,
                                          padding=1, stride=1)
        self.iconv3 = conv2DBatchNormRelu(in_channels=96, k_size=3, n_filters=64,
                                          padding=1, stride=1)
        self.iconv2 = conv2DBatchNormRelu(in_channels=96, k_size=3, n_filters=64,
                                          padding=1, stride=1)

        # proj6..proj3 are not used by forward(); they are still created so
        # the parameter set (and checkpoint keys) stays the same.
        self.proj6 = nn.Conv2d(128, outplanes, kernel_size=3, stride=1, padding=1, bias=True)
        self.proj5 = nn.Conv2d(64, outplanes, kernel_size=3, stride=1, padding=1, bias=True)
        self.proj4 = nn.Conv2d(64, outplanes, kernel_size=3, stride=1, padding=1, bias=True)
        self.proj3 = nn.Conv2d(64, outplanes, kernel_size=3, stride=1, padding=1, bias=True)
        self.proj2 = nn.Conv2d(64, outplanes, kernel_size=3, stride=1, padding=1, bias=True)

        # Re-initialise every conv: normal(0, sqrt(2 / (kh * kw * out_ch))).
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of ``blocks`` blocks and update ``self.inplanes``."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),)
        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    @staticmethod
    def _resize_to(feat, ref):
        """Bilinearly resize ``feat`` to the spatial size of ``ref``."""
        # Replaces the deprecated F.upsample; align_corners=False is what
        # F.upsample resolved to for bilinear mode.
        return F.interpolate(feat, size=[ref.size()[2], ref.size()[3]],
                             mode='bilinear', align_corners=False)

    def forward(self, x):
        """Return ``(pred2, conv2)``: the ``proj2`` output and its input features."""
        # Encoder: a 1x1 projection followed by four stride-2 residual stages.
        proj = self.proj(x)
        rconv3 = self.res_block3(proj)
        conv4 = self.res_block5(rconv3)
        conv5 = self.res_block6(conv4)
        conv6 = self.res_block7(conv5)
        conv6 = self.pyramid_pooling(conv6)

        # Decoder. The intermediate predictions (proj6..proj3) were computed
        # and discarded before; they never influenced the returned tensors,
        # so those convolutions are no longer evaluated.
        conv6u = self._resize_to(conv6, conv5)
        concat5 = torch.cat((conv5, self.upconv6(conv6u)), dim=1)
        conv5 = self.iconv5(concat5)

        conv5u = self._resize_to(conv5, conv4)
        concat4 = torch.cat((conv4, self.upconv5(conv5u)), dim=1)
        conv4 = self.iconv4(concat4)

        conv4u = self._resize_to(conv4, rconv3)
        concat3 = torch.cat((rconv3, self.upconv4(conv4u)), dim=1)
        conv3 = self.iconv3(concat3)

        # The last level is resized to the size of the input ``x``.
        conv3u = self._resize_to(conv3, x)
        concat2 = torch.cat((proj, self.upconv3(conv3u)), dim=1)
        conv2 = self.iconv2(concat2)
        pred2 = self.proj2(conv2)

        return pred2, conv2
def get_skew_mat(transx, rotx):
    """Return ``R^T @ [t]_x`` for each batch element.

    ``R`` is the rotation matrix obtained from the angle-axis vectors
    ``rotx`` and ``[t]_x`` is the skew-symmetric cross-product matrix built
    from ``transx``.

    Args:
        transx: translation vectors, indexed as ``transx[:, 0..2]``.
        rotx: angle-axis rotations passed to kornia.

    Returns:
        Tensor of shape ``(transx.shape[0], 3, 3)``.
    """
    rot = kornia.angle_axis_to_rotation_matrix(rotx).permute(0, 2, 1)
    # Allocate on rot's device/dtype so the matmul below also works for GPU
    # or non-default-dtype inputs (a bare torch.zeros is CPU/default dtype).
    tx = rot.new_zeros(transx.shape[0], 3, 3)
    tx[:, 0, 1] = -transx[:, 2]
    tx[:, 0, 2] = transx[:, 1]
    tx[:, 1, 0] = transx[:, 2]
    tx[:, 1, 2] = -transx[:, 0]
    tx[:, 2, 0] = -transx[:, 1]
    tx[:, 2, 1] = transx[:, 0]
    return rot.matmul(tx)


def sampson_err(x1h, x2h, F):
    """Squared algebraic epipolar residual normalised Sampson-style.

    Args:
        x1h: homogeneous points, indexed as ``(batch, 3, n)``.
        x2h: homogeneous points, same layout as ``x1h``.
        F: batch of 3x3 matrices. The name shadows the usual
            ``torch.nn.functional`` alias inside this function only.

    Returns:
        Tensor ``(batch, n)``: ``(x1h . F x2h)^2`` divided by the sum of the
        squared first two components of ``F x2h`` and ``F^T x1h`` (plus 1e-9).
    """
    l2 = F.permute(0, 2, 1).matmul(x1h)
    l1 = F.matmul(x2h)
    algdis = (l1 * x1h).sum(1)
    denom = 1e-9 + l1[:, 0]**2 + l1[:, 1]**2 + l2[:, 0]**2 + l2[:, 1]**2
    return algdis**2 / denom
# 4,6 Taug = torch.cat((Taug.view(bs,3,2).permute(0,2,1),Kinv_n[:,2:3]),1) Kinv_n = Kinv_n.matmul(Taug) else: Kinv_n = Kinv return Kinv, Kinv_n def testEss(K0,K1,R,T,p1,p2): import cv2 testP = cv2.triangulatePoints(K0.dot(np.concatenate( (np.eye(3),np.zeros((3,1))), -1)), K1.dot(np.concatenate( (R,T), -1)), p1[:2],p2[:2]) Z1 = testP[2,:]/testP[-1,:] Z2 = (R.dot(Z1*np.linalg.inv(K0).dot(p1))+T)[-1,:] if ((Z1>0).sum() > (Z1<=0).sum()) and ((Z2>0).sum() > (Z2<=0).sum()): #print(Z1) #print(Z2) return True else: return False ================================================ FILE: README.md ================================================ # DytanVO: Joint Refinement of Visual Odometry and Motion Segmentation in Dynamic Environments

DytanVO: Joint Refinement of Visual Odometry and Motion Segmentation in Dynamic Environments (ICRA 2023)
By Shihao Shen, Yilin Cai, Wenshan Wang, and Sebastian Scherer.

### What's new. - 01-17-2023: Our paper has been accepted to ICRA 2023! - 01-05-2023: Clean up and upload the codebase for _DytanVO_. Pretrained weights and datasets are also ready. - 09-20-2022: Remove _Dynamic Dense RGB-D SLAM with Learning-Based Visual Odometry_. The repo will be used to release codebase for the most recent ICRA 2023 submission. ## Introduction DytanVO is a learning-based visual odometry (VO) based on its precursor, [TartanVO](https://github.com/castacks/tartanvo). It is the first supervised learning-based VO method that deals with dynamic environments. It takes two consecutive monocular frames in real-time and predicts camera ego-motion in an iterative fashion. It achieves an average improvement of 27.7% over state-of-the-art VO solutions in real-world dynamic environments, and even performs competitively among dynamic visual SLAM systems which optimize the trajectory on the backend. Experiments on plentiful unseen environments also demonstrate its generalizability. ## Installation We provide an environment file using [anaconda](https://www.anaconda.com/). The code has been tested on an RTX 2080Ti with CUDA 11.4. ```bash conda env create -f environment.yml conda activate dytanvo ``` Compile [DCNv2](https://github.com/MatthewHowe/DCNv2). ``` cd Network/rigidmask/networks/DCNv2/; python setup.py install; cd - ``` ## Models and Data ### Pretrained weights Download [here](https://drive.google.com/file/d/1ujYmKv5FHXYe1KETabTnSs-R2OE0KJV3/view?usp=share_link) and unzip it to the `models` folder. ### KITTI dynamic sequences Original sequences in [KITTI Odometry](https://www.cvlibs.net/datasets/kitti/eval_odometry.php) are trimmed into sub-sequences which contain moving pedestrians, vehicles and cyclists so that VO's robustness to dynamic objects can be explicitly evaluated. Download [DynaKITTI](https://drive.google.com/file/d/1BDnraRWzNf938UsfprWIkcqCSfOUyGt9/view?usp=share_link) and unzip it to the `data` folder. 
Please cite this paper if you find it useful in your work. ### AirDOS-Shibuya Follow [tartanair-shibuya](https://github.com/haleqiu/tartanair-shibuya) and download it to the `data` folder. ### (Optional) Scene Flow One can also test the model on [Scene Flow datasets](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html), which were used to train both the VO and the segmentation networks. Scene Flow datasets have very challenging sequences with large areas of dynamic objects in image frames. You can create symbolic links to wherever the datasets were downloaded in the `data` folder. ```Shell ├── data ├── AirDOS_shibuya ├── RoadCrossing03 ├── image_0 ├── ... ├── gt_pose.txt ├── RoadCrossing04 ├── ... ├── DynaKITTI ├── 00_1 ├── image_2 ├── ... ├── pose_left.txt ├── calib.txt ├── 01_0 ├── ... ├── SceneFlow ├── FlyingThings3D ├── frames_cleanpass ├── frames_finalpass ├── optical_flow ├── camera_data ├── Driving ├── Monkaa ├── ... ``` ## Evaluation Create a folder to save output flow, segmentation, or poses. 
```bash mkdir results ``` ### Dynamic sequences in KITTI (loading the finetuned VO model at once) ```bash traj=00_1 python -W ignore::UserWarning vo_trajectory_from_folder.py --vo-model-name vonet_ft.pkl \ --seg-model-name segnet-kitti.pth \ --kitti --kitti-intrinsics-file data/DynaKITTI/$traj/calib.txt \ --test-dir data/DynaKITTI/$traj/image_2 \ --pose-file data/DynaKITTI/$traj/pose_left.txt ``` ### AirDOS-Shibuya (loading FlowNet and PoseNet separately) ```bash traj=RoadCrossing03 python -W ignore::UserWarning vo_trajectory_from_folder.py --flow-model-name flownet.pkl \ --pose-model-name posenet.pkl \ --seg-model segnet-sf.pth \ --airdos \ --test-dir data/AirDOS_shibuya/$traj/image_0 \ --pose-file data/AirDOS_shibuya/$traj/gt_pose.txt ``` ### Scene Flow ```bash img=Driving/frames_finalpass/15mm_focallength/scene_forwards/fast/left pose=Driving/camera_data/15mm_focallength/scene_forwards/fast/camera_data.txt python -W ignore::UserWarning vo_trajectory_from_folder.py --flow-model-name flownet.pkl \ --pose-model-name posenet.pkl \ --seg-model segnet-sf.pth \ --sceneflow \ --test-dir data/SceneFlow/$img \ --pose-file data/SceneFlow/$pose ``` Add the `--save-flow` flag to save intermediate optical flow outputs into the `results` folder. Adjust the batch size and the worker number with `--batch-size 10`, `--worker-num 5`. ## (Optional) Segmentation Mask Ground Truth If your dataset has ground truth for camera motion, optical flow and disparity change across consecutive frames, we provide an example script that automatically generates ground-truth segmentation masks from these modalities, based purely on geometry, for the [Scene Flow datasets](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html). ```bash python Datasets/segmask_gt.py --database data/SceneFlow --frames_pass clean --dataset FlyingThings3D ``` Add the `--debug` flag to save visualizations of the generated masks. 
## Citation If you find our code, paper or dataset useful, please cite ```bibtex @inproceedings{shen2023dytanvo, title={Dytanvo: Joint refinement of visual odometry and motion segmentation in dynamic environments}, author={Shen, Shihao and Cai, Yilin and Wang, Wenshan and Scherer, Sebastian}, booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)}, pages={4048--4055}, year={2023}, organization={IEEE} } ``` ## Acknowledgement We built DytanVO on top of [TartanVO](https://github.com/castacks/tartanvo). We implemented the segmentation network by adapting [rigidmask](https://github.com/gengshan-y/rigidmask). We thank [Gengshan Yang](https://gengshan-y.github.io/) for his code and suggestions. ## License This software is BSD licensed. Copyright (c) 2020, Carnegie Mellon University All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: environment.yml ================================================ name: dytanvo channels: - pytorch3d - pytorch - conda-forge - defaults dependencies: - _libgcc_mutex=0.1=main - blas=1.0=mkl - bzip2=1.0.8=h516909a_3 - ca-certificates=2022.5.18.1=ha878542_0 - cairo=1.14.12=h8948797_3 - certifi=2022.5.18.1=py38h578d9bd_0 - colorama=0.4.4=pyh9f0ad1d_0 - cudatoolkit=10.2.89=hfd86e86_1 - cudatoolkit-dev=11.4.0=py38h497a2fe_1 - cycler=0.10.0=py38_0 - dbus=1.13.18=hb2f20db_0 - expat=2.2.10=he6710b0_2 - ffmpeg=4.0.2=ha0c5888_2 - fontconfig=2.13.0=h9420a91_0 - freeglut=3.2.1=h58526e2_0 - freetype=2.10.4=h5ab3b9f_0 - glib=2.63.1=h5a9c865_0 - gmp=6.1.2=hf484d3e_1000 - gnutls=3.5.19=h2a4e5f8_1 - graphite2=1.3.13=h58526e2_1001 - gst-plugins-base=1.14.0=hbbd80ab_1 - gstreamer=1.14.0=hb453b48_1 - harfbuzz=1.8.8=hffaf4a1_0 - hdf5=1.10.2=hc401514_3 - icu=58.2=he6710b0_3 - intel-openmp=2020.2=254 - jasper=2.0.14=h07fcdf6_1 - jpeg=9b=h024ee3a_2 - kiwisolver=1.3.0=py38h2531618_0 - kornia=0.5.3=pyhd8ed1ab_0 - lcms2=2.11=h396b838_0 - ld_impl_linux-64=2.33.1=h53a641e_7 - libedit=3.1.20191231=h14c3975_1 - libffi=3.2.1=hf484d3e_1007 - libgcc-ng=9.1.0=hdf63c60_0 - libgfortran=3.0.0=1 - libgfortran-ng=7.3.0=hdf63c60_0 - libglu=9.0.0=he1b5a44_1001 - libiconv=1.16=h516909a_0 - libopencv=3.4.2=hb342d67_1 - libpng=1.6.37=hbc83047_0 - libstdcxx-ng=9.1.0=hdf63c60_0 - libtiff=4.1.0=h2733197_1 - libuuid=1.0.3=h1bed415_2 - 
libuv=1.40.0=h7b6447c_0 - libxcb=1.14=h7b6447c_0 - libxml2=2.9.10=hb55368b_3 - lz4-c=1.9.2=heb0550a_3 - matplotlib=3.3.2=0 - matplotlib-base=3.3.2=py38h817c723_0 - mkl=2020.2=256 - mkl-service=2.3.0=py38he904b0f_0 - mkl_fft=1.2.0=py38h23d657b_0 - mkl_random=1.1.1=py38h0573a6f_0 - ncurses=6.2=he6710b0_1 - nettle=3.3=0 - ninja=1.10.2=py38hff7bd54_0 - olefile=0.46=py_0 - openh264=1.8.0=hdbcaa40_1000 - openssl=1.1.1h=h516909a_0 - pcre=8.44=he6710b0_0 - pillow=8.0.1=py38he98fc37_0 - pip=20.3=py38h06a4308_0 - pixman=0.40.0=h36c2ea0_0 - pyparsing=2.4.7=py_0 - pyqt=5.9.2=py38h05f1152_4 - python=3.8.3=cpython_he5300dc_0 - python-dateutil=2.8.2=pyhd8ed1ab_0 - python_abi=3.8=1_cp38 - pytorch=1.7.0=py3.8_cuda10.2.89_cudnn7.6.5_0 - pytorch3d=0.3.0=py38_cu102_pyt170 - pyyaml=5.3.1=py38h8df0ef7_1 - qt=5.9.7=h5867ecd_1 - readline=8.0=h7b6447c_0 - scipy=1.5.2=py38h0b6359f_0 - setuptools=50.3.2=py38h06a4308_2 - sip=4.19.13=py38he6710b0_0 - six=1.15.0=py38h06a4308_0 - sqlite=3.33.0=h62c20be_0 - termcolor=1.1.0=py_2 - tk=8.6.10=hbc83047_0 - torchaudio=0.7.0=py38 - torchvision=0.8.1=py38_cu102 - tornado=6.1=py38h27cfd23_0 - typing_extensions=4.3.0=pyha770c72_0 - wheel=0.36.0=pyhd3eb1b0_0 - x264=1!152.20180806=h14c3975_0 - xorg-fixesproto=5.0=h14c3975_1002 - xorg-inputproto=2.3.2=h14c3975_1002 - xorg-kbproto=1.0.7=h14c3975_1002 - xorg-libx11=1.6.12=h516909a_0 - xorg-libxau=1.0.9=h14c3975_0 - xorg-libxext=1.3.4=h516909a_0 - xorg-libxfixes=5.0.3=h516909a_1004 - xorg-libxi=1.7.10=h516909a_0 - xorg-xextproto=7.3.0=h14c3975_1002 - xorg-xproto=7.0.31=h14c3975_1007 - xz=5.2.5=h7b6447c_0 - yacs=0.1.8=pyhd8ed1ab_0 - yaml=0.2.5=h516909a_0 - zlib=1.2.11=h7b6447c_3 - zstd=1.4.5=h9ceee32_0 - pip: - absl-py==0.11.0 - antlr4-python3-runtime==4.9.3 - appdirs==1.4.4 - beautifulsoup4==4.11.1 - black==21.4b2 - cachetools==4.1.1 - chardet==3.0.4 - charset-normalizer==2.1.1 - cloudpickle==1.6.0 - cupy-cuda102==11.1.0 - cython==0.29.21 - data==0.4 - dataclasses==0.6 - dcnv2==0.1 - decorator==5.1.1 - 
detectron2==0.5+cu102 - fastrlock==0.8 - filelock==3.8.0 - funcsigs==1.0.2 - future==0.18.2 - fvcore==0.1.2.post20201122 - gdown==4.5.1 - google-auth==1.23.0 - google-auth-oauthlib==0.4.2 - grpcio==1.34.0 - hydra-core==1.2.0 - idna==2.10 - imageio==2.9.0 - importlib-resources==5.9.0 - iopath==0.1.8 - joblib==0.17.0 - jsonpatch==1.32 - jsonpointer==2.3 - latex==0.7.0 - lxml==4.9.1 - markdown==3.3.3 - mypy-extensions==0.4.3 - ngransac==0.0.0 - numpy==1.23.2 - oauthlib==3.1.0 - omegaconf==2.2.3 - opencv-python==4.4.0.46 - packaging==21.3 - pathspec==0.10.1 - portalocker==2.0.0 - protobuf==3.14.0 - pyasn1==0.4.8 - pyasn1-modules==0.2.8 - pycocotools==2.0.4 - pydot==1.4.1 - pypng==0.0.20 - pysocks==1.7.1 - pytransform3d==1.14.0 - pyzmq==23.2.1 - regex==2022.8.17 - requests==2.25.0 - requests-oauthlib==1.3.0 - rsa==4.6 - shutilwhich==1.1.0 - soupsieve==2.3.2.post1 - splines==0.2.0 - tabulate==0.8.7 - tempdir==0.7.1 - tensorboard==2.4.0 - tensorboard-data-server==0.6.1 - tensorboard-plugin-wit==1.7.0 - timm==0.6.7 - toml==0.10.2 - torchfile==0.1.0 - tqdm==4.54.0 - trimesh==3.9.3 - urllib3==1.26.2 - visdom==0.1.8.9 - websocket-client==1.4.0 - werkzeug==1.0.1 - workflow==1.0 - zipp==3.8.1 prefix: /home/shihao/miniconda3/envs/rigidmask_v0 ================================================ FILE: evaluator/__init__.py ================================================ ================================================ FILE: evaluator/evaluate_ate_scale.py ================================================ #!/usr/bin/python # Modified by Wenshan Wang # Modified by Raul Mur-Artal # Automatically compute the optimal scale factor for monocular VO/SLAM. # Software License Agreement (BSD License) # # Copyright (c) 2013, Juergen Sturm, TUM # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of TUM nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # Requirements: # sudo apt-get install python-argparse """ This script computes the absolute trajectory error from the ground truth trajectory and the estimated trajectory. """ import numpy def align(model,data,calc_scale=False): """Align two trajectories using the method of Horn (closed-form). 
Input: model -- first trajectory (3xn) data -- second trajectory (3xn) Output: rot -- rotation matrix (3x3) trans -- translation vector (3x1) trans_error -- translational error per point (1xn) """ numpy.set_printoptions(precision=3,suppress=True) model_zerocentered = model - model.mean(1) data_zerocentered = data - data.mean(1) W = numpy.zeros( (3,3) ) for column in range(model.shape[1]): W += numpy.outer(model_zerocentered[:,column],data_zerocentered[:,column]) U,d,Vh = numpy.linalg.linalg.svd(W.transpose()) S = numpy.matrix(numpy.identity( 3 )) if(numpy.linalg.det(U) * numpy.linalg.det(Vh)<0): S[2,2] = -1 rot = U*S*Vh if calc_scale: rotmodel = rot*model_zerocentered dots = 0.0 norms = 0.0 for column in range(data_zerocentered.shape[1]): dots += numpy.dot(data_zerocentered[:,column].transpose(),rotmodel[:,column]) normi = numpy.linalg.norm(model_zerocentered[:,column]) norms += normi*normi # s = float(dots/norms) s = float(norms/dots) else: s = 1.0 # trans = data.mean(1) - s*rot * model.mean(1) # model_aligned = s*rot * model + trans # alignment_error = model_aligned - data # scale the est to the gt, otherwise the ATE could be very small if the est scale is small trans = s*data.mean(1) - rot * model.mean(1) model_aligned = rot * model + trans data_alingned = s * data alignment_error = model_aligned - data_alingned trans_error = numpy.sqrt(numpy.sum(numpy.multiply(alignment_error,alignment_error),0)).A[0] return rot,trans,trans_error, s def plot_traj(ax,stamps,traj,style,color,label): """ Plot a trajectory using matplotlib. 
Input: ax -- the plot stamps -- time stamps (1xn) traj -- trajectory (3xn) style -- line style color -- line color label -- plot legend """ stamps.sort() interval = numpy.median([s-t for s,t in zip(stamps[1:],stamps[:-1])]) x = [] y = [] last = stamps[0] for i in range(len(stamps)): if stamps[i]-last < 2*interval: x.append(traj[i][0]) y.append(traj[i][1]) elif len(x)>0: ax.plot(x,y,style,color=color,label=label) label="" x=[] y=[] last= stamps[i] if len(x)>0: ax.plot(x,y,style,color=color,label=label) ================================================ FILE: evaluator/evaluate_kitti.py ================================================ # Copyright (c) 2020 Carnegie Mellon University, Wenshan Wang # For License information please see the LICENSE file in the root directory. # This is a python reinplementation of the KITTI metric: http://www.cvlibs.net/datasets/kitti/eval_odometry.php # Cridit: Xiangwei Wang https://github.com/TimingSpace import numpy as np import sys def trajectory_distances(poses): distances = [] distances.append(0) for i in range(1,len(poses)): p1 = poses[i-1] p2 = poses[i] delta = p1[0:3,3] - p2[0:3,3] distances.append(distances[i-1]+np.linalg.norm(delta)) return distances def last_frame_from_segment_length(dist,first_frame,length): for i in range(first_frame,len(dist)): if dist[i]>dist[first_frame]+length: return i return -1 def rotation_error(pose_error): a = pose_error[0,0] b = pose_error[1,1] c = pose_error[2,2] d = 0.5*(a+b+c-1) rot_error = np.arccos(max(min(d,1.0),-1.0)) return rot_error def translation_error(pose_error): dx = pose_error[0,3] dy = pose_error[1,3] dz = pose_error[2,3] return np.sqrt(dx*dx+dy*dy+dz*dz) # def line2matrix(pose_line): # pose_line = np.array(pose_line) # pose_m = np.matrix(np.eye(4)) # pose_m[0:3,:] = pose_line.reshape(3,4) # return pose_m def calculate_sequence_error(poses_gt,poses_result,lengths=[10,20,30,40,50,60,70,80]): # error_vetor errors = [] # paramet step_size = 1 #10; # every second num_lengths = 
len(lengths) # import ipdb;ipdb.set_trace() # pre-compute distances (from ground truth as reference) dist = trajectory_distances(poses_gt) # for all start positions do for first_frame in range(0, len(poses_gt), step_size): # for all segment lengths do for i in range(0,num_lengths): # current length length = lengths[i]; # compute last frame last_frame = last_frame_from_segment_length(dist,first_frame,length); # continue, if sequence not long enough if (last_frame==-1): continue; # compute rotational and translational errors pose_delta_gt = np.linalg.inv(poses_gt[first_frame]).dot(poses_gt[last_frame]) pose_delta_result = np.linalg.inv(poses_result[first_frame]).dot(poses_result[last_frame]) pose_error = np.linalg.inv(pose_delta_result).dot(pose_delta_gt) r_err = rotation_error(pose_error); t_err = translation_error(pose_error); # compute speed num_frames = (float)(last_frame-first_frame+1); speed = length/(0.1*num_frames); # write to file error = [first_frame,r_err/length,t_err/length,length,speed] errors.append(error) # return error vector return errors def calculate_ave_errors(errors,lengths=[10,20,30,40,50,60,70,80]): rot_errors=[] tra_errors=[] for length in lengths: rot_error_each_length =[] tra_error_each_length =[] for error in errors: if abs(error[3]-length)<0.1: rot_error_each_length.append(error[1]) tra_error_each_length.append(error[2]) if len(rot_error_each_length)==0: # import ipdb;ipdb.set_trace() continue else: rot_errors.append(sum(rot_error_each_length)/len(rot_error_each_length)) tra_errors.append(sum(tra_error_each_length)/len(tra_error_each_length)) return np.array(rot_errors)*180/np.pi, tra_errors def evaluate(gt, data,kittitype=True): if kittitype: lens = [100,200,300,400,500,600,700,800] # else: lens = [5,10,15,20,25,30,35,40] #[1,2,3,4,5,6] # errors = calculate_sequence_error(gt, data, lengths=lens) rot,tra = calculate_ave_errors(errors, lengths=lens) return np.mean(rot), np.mean(tra) def main(): # usage: python main.py path_to_ground_truth 
path_to_predict_pose # load and preprocess data ground_truth_data = np.loadtxt(sys.argv[1]) predict_pose__data = np.loadtxt(sys.argv[2]) errors = calculate_sequence_error(ground_truth_data,predict_pose__data) rot,tra = calculate_ave_errors(errors) print(rot,'\n',tra) #print(error) # evaluate the vo result # save and visualization the evaluatation result if __name__ == "__main__": main() ================================================ FILE: evaluator/evaluate_rpe.py ================================================ #!/usr/bin/python # Software License Agreement (BSD License) # # Modified by Wenshan Wang # Copyright (c) 2013, Juergen Sturm, TUM # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of TUM nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. """ This script computes the relative pose error from the ground truth trajectory and the estimated trajectory. """ import random import numpy as np import sys def ominus(a,b): """ Compute the relative 3D transformation between a and b. Input: a -- first pose (homogeneous 4x4 matrix) b -- second pose (homogeneous 4x4 matrix) Output: Relative 3D transformation from a to b. """ return np.dot(np.linalg.inv(a),b) def compute_distance(transform): """ Compute the distance of the translational component of a 4x4 homogeneous matrix. """ return np.linalg.norm(transform[0:3,3]) def compute_angle(transform): """ Compute the rotation angle from a 4x4 homogeneous matrix. """ # an invitation to 3-d vision, p 27 return np.arccos( min(1,max(-1, (np.trace(transform[0:3,0:3]) - 1)/2) )) def distances_along_trajectory(traj): """ Compute the translational distances along a trajectory. """ motion = [ominus(traj[i+1],traj[i]) for i in range(len(traj)-1)] distances = [0] sum = 0 for t in motion: sum += compute_distance(t) distances.append(sum) return distances def evaluate_trajectory(traj_gt, traj_est, param_max_pairs=10000, param_fixed_delta=False, param_delta=1.00): """ Compute the relative pose error between two trajectories. 
Input: traj_gt -- the first trajectory (ground truth) traj_est -- the second trajectory (estimated trajectory) param_max_pairs -- number of relative poses to be evaluated param_fixed_delta -- false: evaluate over all possible pairs true: only evaluate over pairs with a given distance (delta) param_delta -- distance between the evaluated pairs param_delta_unit -- unit for comparison: "s": seconds "m": meters "rad": radians "deg": degrees "f": frames param_offset -- time offset between two trajectories (to model the delay) param_scale -- scale to be applied to the second trajectory Output: list of compared poses and the resulting translation and rotation error """ if not param_fixed_delta: if(param_max_pairs==0 or len(traj_est)param_max_pairs): pairs = random.sample(pairs,param_max_pairs) result = [] for i,j in pairs: error44 = ominus( ominus( traj_est[j], traj_est[i] ), ominus( traj_gt[j], traj_gt[i] ) ) trans = compute_distance(error44) rot = compute_angle(error44) result.append([i,j,trans,rot]) if len(result)<2: raise Exception("Couldn't find pairs between groundtruth and estimated trajectory!") return result ================================================ FILE: evaluator/evaluator_base.py ================================================ # Copyright (c) 2020 Carnegie Mellon University, Wenshan Wang # For License information please see the LICENSE file in the root directory. 
def per_frame_scale_alignment(gt_motions, est_motions):
    """
    Rescale every estimated translation to the length of its GT counterpart.

    gt_motions, est_motions: 2-D arrays with one motion per row; only the
        first three columns (the translation) are read and rescaled, all
        other columns are copied through unchanged.

    Returns a new array shaped like est_motions; est_motions itself is not
    modified.

    Rows whose estimated translation has zero length cannot be rescaled
    (the ratio would be a division by zero), so they are left untouched
    instead of turning into inf/NaN.
    """
    dist_gt = np.linalg.norm(gt_motions[:, :3], axis=1)
    motions_scale = est_motions.copy()
    dist = np.linalg.norm(est_motions[:, :3], axis=1)

    # per-frame ratio gt / est; stays at 1 wherever the estimate has no length
    scale_gt = np.ones(dist.shape, dtype=np.result_type(dist_gt, dist))
    np.divide(dist_gt, dist, out=scale_gt, where=dist > 0)

    motions_scale[:, :3] = est_motions[:, :3] * scale_gt.reshape(-1, 1)
    return motions_scale
class TartanAirEvaluator:
    """
    Runs the ATE, RPE and KITTI-style evaluators on one trajectory pair.
    """

    def __init__(self, scale = False, round=1):
        # NOTE: `scale` and `round` are not read here; they stay in the
        # signature so existing call sites keep working.
        self.ate_eval = ATEEvaluator()
        self.rpe_eval = RPEEvaluator()
        self.kitti_eval = KittiEvaluator()

    @staticmethod
    def _load_traj(traj):
        """Return `traj` as an array, reading it with np.loadtxt if it is a path."""
        import os  # local import: nothing else in this class needs it

        if isinstance(traj, (str, os.PathLike)):
            # let I/O and parse errors propagate; swallowing them only moves
            # the failure to a confusing place further down
            return np.loadtxt(traj)
        return np.asarray(traj)

    def evaluate_one_trajectory(self, gt_traj, est_traj, scale=False, kittitype=True):
        """
        Evaluate an estimated trajectory against the ground truth.

        gt_traj, est_traj: either a path to a text pose file or an
            already-loaded N x 7 array (one pose per row)
        scale: True to calculate a global scale; forwarded to the
            trajectory transform and to the ATE evaluator
        kittitype: forwarded to the KITTI evaluator

        Returns a dict with the keys 'ate_score', 'rpe_score',
        'kitti_score', 'gt_aligned' and 'est_aligned'.

        Raises Exception("POSEFILE_LENGTH_ILLEGAL") when the two
        trajectories differ in length and Exception("POSEFILE_FORMAT_ILLEGAL")
        when either one is not an N x 7 table. Errors from reading a pose
        file (e.g. a missing file) are raised as-is.
        """
        # load trajectories
        gt_traj = self._load_traj(gt_traj)
        est_traj = self._load_traj(est_traj)

        if gt_traj.shape[0] != est_traj.shape[0]:
            raise Exception("POSEFILE_LENGTH_ILLEGAL")
        # the ndim guard turns 1-D input (e.g. a single-line pose file) into
        # the format error instead of an IndexError on shape[1]
        if (gt_traj.ndim != 2 or est_traj.ndim != 2
                or gt_traj.shape[1] != 7 or est_traj.shape[1] != 7):
            raise Exception("POSEFILE_FORMAT_ILLEGAL")

        # transform and scale
        gt_traj_trans, est_traj_trans, s = transform_trajs(gt_traj, est_traj, scale)
        gt_SEs, est_SEs = quats2SEs(gt_traj_trans, est_traj_trans)

        # ATE gets the untransformed trajectories, RPE and KITTI the
        # transformed SE(3) matrices
        ate_score, gt_ate_aligned, est_ate_aligned = self.ate_eval.evaluate(gt_traj, est_traj, scale)
        rpe_score = self.rpe_eval.evaluate(gt_SEs, est_SEs)
        kitti_score = self.kitti_eval.evaluate(gt_SEs, est_SEs, kittitype=kittitype)

        return {'ate_score': ate_score,
                'rpe_score': rpe_score,
                'kitti_score': kitti_score,
                'gt_aligned': gt_ate_aligned,
                'est_aligned': est_ate_aligned}
def timestamp_associate(first_list, second_list, max_difference):
    """
    Associate two trajectory of [stamp,data]. As the time stamps never match exactly, we aim
    to find the closest match for every input tuple.

    Input:
    first_list -- first list of (stamp,data)
    second_list -- second list of (stamp,data)
    max_difference -- search radius for candidate generation

    Output:
    first_res: matched data from the first list
    second_res: matched data from the second list

    Matching is greedy: candidate pairs are visited from the smallest time
    difference upwards and every stamp is used at most once. Entries that
    carry only a stamp (no data) are ignored. The results are ordered by
    the stamp of the first list.
    """
    first_dict = {l[0]: l[1:] for l in first_list if len(l) > 1}
    second_dict = {l[0]: l[1:] for l in second_list if len(l) > 1}

    # Stamps that are still unmatched. These must be real sets: in Python 3
    # dict.keys() is a read-only view without .remove().
    first_free = set(first_dict)
    second_free = set(second_dict)

    potential_matches = sorted(
        (abs(a - b), a, b)
        for a in first_dict
        for b in second_dict
        if abs(a - b) < max_difference
    )

    matches = []
    for _, a, b in potential_matches:
        if a in first_free and b in second_free:
            first_free.remove(a)
            second_free.remove(b)
            matches.append((a, b))
    matches.sort()

    first_res = [first_dict[a] for a, _ in matches]
    second_res = [second_dict[b] for _, b in matches]
    return np.array(first_res), np.array(second_res)
def pose2motion(data, skip=0):
    '''
    data: N x 12 poses, each the flattened top 3 x 4 part of a 4 x 4 pose
    skip: number of poses to jump over when pairing; pose i is paired with
          pose i+1+skip
    all_motion: (N-1-skip) x 12 relative motions, inv(pose_i) * pose_(i+1+skip),
                flattened the same way (empty when there are too few poses)
    '''
    def _to_mat(line):
        # 12 -> 4 x 4, filling in the constant bottom row [0, 0, 0, 1]
        mat = np.eye(4)
        mat[0:3, :] = line.reshape(3, 4)
        return mat

    data_size = data.shape[0]
    # One row per computed motion. Allocating N-1 rows regardless of `skip`
    # would leave all-zero trailing rows that are not valid transforms.
    motion_num = max(data_size - 1 - skip, 0)
    all_motion = np.zeros((motion_num, 12))
    for i in range(motion_num):
        pose_curr = _to_mat(data[i, :])
        pose_next = _to_mat(data[i + 1 + skip, :])
        motion = np.linalg.inv(pose_curr).dot(pose_next)
        all_motion[i, :] = motion[0:3, :].reshape(12)
    return all_motion
def so2quat(so_data):
    '''
    so_data: rotation vector (3 values); its direction is the rotation axis
             and its length the rotation angle in radians
    quat: 4 values, vector part first and scalar part last,
          [sin(angle/2) * axis, cos(angle/2)]
    '''
    so_data = np.array(so_data)
    theta = np.sqrt(np.sum(so_data*so_data))
    quat = np.zeros(4)
    if theta == 0:
        # No rotation: the axis is undefined (0/0), so return the identity
        # quaternion directly instead of propagating NaN.
        quat[3] = 1.0
        return quat
    axis = so_data/theta
    quat[0:3] = np.sin(theta/2)*axis
    quat[3] = np.cos(theta/2)
    return quat
def pos_quats2SE_matrices(quat_datas):
    '''
    quat_datas: N poses of 7 values each, position in [0:3] followed by a
                quaternion in [3:7] in the order scipy's Rotation.from_quat
                expects; an N x 7 array or any sequence of such rows
    SEs: list of N 4 x 4 numpy arrays
    '''
    SEs = []
    for quat in quat_datas:
        SE = np.eye(4)
        SE[0:3,0:3] = R.from_quat(quat[3:7]).as_matrix()
        SE[0:3,3] = quat[0:3]
        SEs.append(SE)
    return SEs
def kitti2tartan(traj):
    '''
    traj: in kitti style, N x 12 numpy array, in camera frame
    output: in TartanAir style, N x 7 numpy array, in NED frame

    Every pose is expanded to 4 x 4, conjugated with a fixed axis
    permutation (perm * pose * inv(perm)) and converted to position +
    quaternion.
    '''
    axis_perm = np.array([[0,0,1,0],
                          [1,0,0,0],
                          [0,1,0,0],
                          [0,0,0,1]], dtype=np.float32)
    axis_perm_inv = np.linalg.inv(axis_perm)

    def _convert(pose_line):
        # 12 -> 4 x 4, then change of axes
        pose_mat = np.eye(4)
        pose_mat[:3,:] = pose_line.reshape(3,4)
        return SE2pos_quat(axis_perm.dot(pose_mat).dot(axis_perm_inv))

    return np.array([_convert(pose_line) for pose_line in traj])
Datasets.tartanTrajFlowDataset import TrajFolderDataset from evaluator.transformation import pose_quats2motion_ses, motion_ses2pose_quats from evaluator.tartanair_evaluator import TartanAirEvaluator from evaluator.evaluator_base import per_frame_scale_alignment from DytanVO import DytanVO import argparse import numpy as np import cv2 from os import mkdir from os.path import isdir def get_args(): parser = argparse.ArgumentParser(description='Inference code of DytanVO') parser.add_argument('--batch-size', type=int, default=1, help='batch size (default: 1)') parser.add_argument('--worker-num', type=int, default=1, help='data loader worker number (default: 1)') parser.add_argument('--image-width', type=int, default=640, help='image width (default: 640)') parser.add_argument('--image-height', type=int, default=448, help='image height (default: 448)') parser.add_argument('--vo-model-name', default='', help='name of pretrained VO model (default: "")') parser.add_argument('--flow-model-name', default='', help='name of pretrained flow model (default: "")') parser.add_argument('--pose-model-name', default='', help='name of pretrained pose model (default: "")') parser.add_argument('--seg-model-name', default='', help='name of pretrained segmentation model (default: "")') parser.add_argument('--airdos', action='store_true', default=False, help='airdos test (default: False)') parser.add_argument('--rs_d435', action='store_true', default=False, help='realsense d435i test (default: False)') parser.add_argument('--sceneflow', action='store_true', default=False, help='sceneflow test (default: False)') parser.add_argument('--kitti', action='store_true', default=False, help='kitti test (default: False)') parser.add_argument('--commaai', action='store_true', default=False, help='commaai test (default: False)') parser.add_argument('--kitti-intrinsics-file', default='', help='kitti intrinsics file calib.txt (default: )') parser.add_argument('--test-dir', default='', help='test 
if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    from os import makedirs

    args = get_args()

    testvo = DytanVO(args.vo_model_name, args.seg_model_name, args.image_height, args.image_width,
                     args.kitti, args.flow_model_name, args.pose_model_name)

    # load trajectory data from a folder
    # the dataset tag comes from the command-line flags; the first one set wins
    if args.kitti:
        datastr = 'kitti'
    elif args.airdos:
        datastr = 'airdos'
    elif args.rs_d435:
        datastr = 'rs_d435'
    elif args.sceneflow:
        datastr = 'sceneflow'
    elif args.commaai:
        datastr = 'commaai'
    else:
        datastr = 'tartanair'
    focalx, focaly, centerx, centery, baseline = dataset_intrinsics(datastr, '15mm' in args.test_dir)
    # an explicit KITTI calibration file overrides the built-in intrinsics
    if args.kitti_intrinsics_file.endswith('.txt') and datastr == 'kitti':
        focalx, focaly, centerx, centery, baseline = load_kiiti_intrinsics(args.kitti_intrinsics_file)

    if datastr == 'kitti':
        transform = Compose([ResizeData((args.image_height, 1226)),
                             CropCenter((args.image_height, args.image_width)),
                             DownscaleFlow(), ToTensor()])
    else:
        transform = Compose([CropCenter((args.image_height, args.image_width)),
                             DownscaleFlow(), ToTensor()])

    testDataset = TrajFolderDataset(args.test_dir, transform=transform,
                                    focalx=focalx, focaly=focaly, centerx=centerx, centery=centery)
    testDataloader = DataLoader(testDataset, batch_size=args.batch_size,
                                shuffle=False, num_workers=args.worker_num)

    motionlist = []
    # rstrip: a trailing '/' in --test-dir would otherwise leave the folder
    # part of the name empty
    testname = datastr + '_' + args.vo_model_name.split('.')[0] + '_' + args.test_dir.rstrip('/').split('/')[-1]
    # every output below is written under results/, so make sure it exists
    makedirs('results', exist_ok=True)
    if args.save_flow:
        flowdir = 'results/'+testname+'_flow'
        makedirs(flowdir, exist_ok=True)
        flowcount = 0
    # Iterate the loader directly: iterator objects have no .next() method
    # under the Python 3 iterator protocol.
    for sample in testDataloader:
        motion, flow = testvo.test_batch(sample, [focalx, centerx, centery, baseline], args.seg_thresh, args.iter_num)
        motionlist.append(motion)

        if args.save_flow:
            for k in range(flow.shape[0]):
                # moves the first axis of each flow to the end before saving
                flowk = flow[k].transpose(1,2,0)
                np.save(flowdir+'/'+str(flowcount).zfill(6)+'.npy',flowk)
                flow_vis = visflow(flowk)
                cv2.imwrite(flowdir+'/'+str(flowcount).zfill(6)+'.png',flow_vis)
                flowcount += 1

    motions = np.array(motionlist)

    # calculate ATE, RPE, KITTI-RPE
    if args.pose_file.endswith('.txt'):
        if datastr == 'sceneflow':
            gtposes = load_sceneflow_extrinsics(args.pose_file)
        else:
            gtposes = np.loadtxt(args.pose_file)
            if datastr == 'airdos':
                gtposes = gtposes[:,1:]  # remove the first column of timestamps

        # align the scale of every estimated motion with the ground truth
        # before integrating the motions into poses
        gtmotions = pose_quats2motion_ses(gtposes)
        estmotion_scale = per_frame_scale_alignment(gtmotions, motions)
        estposes = motion_ses2pose_quats(estmotion_scale)

        evaluator = TartanAirEvaluator()
        results = evaluator.evaluate_one_trajectory(gtposes, estposes, scale=True, kittitype=(datastr=='kitti'))

        print("==> ATE: %.4f,\t KITTI-R/t: %.4f, %.4f" %(results['ate_score'], results['kitti_score'][0], results['kitti_score'][1]))

        # save results and visualization
        plot_traj(results['gt_aligned'], results['est_aligned'], vis=False, savefigname='results/'+testname+'.png', title='ATE %.4f' %(results['ate_score']))
        np.savetxt('results/'+testname+'.txt',results['est_aligned'])
    else:
        # no ground truth given: save the integrated, unscaled estimate
        np.savetxt('results/'+testname+'.txt', motion_ses2pose_quats(motions))