Repository: chaytonmin/Off-Road-Freespace-Detection Branch: main Commit: 50e63d248361 Files: 48 Total size: 129.3 KB Directory structure: gitextract_w4yckm1z/ ├── LICENSE ├── README.md ├── data/ │ ├── ORFD_dataset.py │ ├── __init__.py │ ├── base_data_loader.py │ └── base_dataset.py ├── datasets/ │ └── palette.txt ├── demo.py ├── examples/ │ └── y0613_1242/ │ └── calib/ │ ├── 1623721491895.txt │ ├── 1623721491991.txt │ ├── 1623721492091.txt │ ├── 1623721492191.txt │ ├── 1623721492290.txt │ └── 1623721492790.txt ├── models/ │ ├── __init__.py │ ├── base_model.py │ ├── loss.py │ ├── roadseg_model.py │ ├── sne_model.py │ └── transformer_models/ │ ├── __init__.py │ ├── backbones/ │ │ ├── __init__.py │ │ └── transformer.py │ ├── decode_heads/ │ │ ├── __init__.py │ │ ├── decode_head.py │ │ └── head.py │ └── utils/ │ ├── __init__.py │ ├── drop.py │ ├── inverted_residual.py │ ├── make_divisible.py │ ├── norm.py │ ├── res_layer.py │ ├── se_layer.py │ ├── self_attention_block.py │ └── up_conv_block.py ├── options/ │ ├── __init__.py │ ├── base_options.py │ ├── test_options.py │ └── train_options.py ├── road_hesai40_process.py ├── runs/ │ └── Sep12_23-13-24_minchen-GE66-Raider-10SFS/ │ └── events.out.tfevents.1631502804.minchen-GE66-Raider-10SFS ├── scripts/ │ ├── .train.sh.swp │ ├── demo.sh │ ├── test.sh │ └── train.sh ├── test.py ├── train.py └── util/ ├── __init__.py └── util.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2021 Chen Min Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # ORFD: A Dataset and Benchmark for Off-Road Freespace Detection Repository for our ICRA 2022 paper ["ORFD: A Dataset and Benchmark for Off-Road Freespace Detection"](https://arxiv.org/abs/2206.09907). Email:mincheng@ict.ac.cn ## 📰 News * **[2025.12.29]** Our new dataset [ORAD-3D](https://github.com/chaytonmin/ORAD-3D-Dataset-For-Off-Road-Autonomous-Driving) for off-road autonomous driving is publicly released. ## Introduction Freespace detection is an essential component of autonomous driving technology and plays an important role in trajectory planning. In the last decade, deep learning based freespace detection methods have been proved feasible. 
However, these efforts focused on urban road environments, and few deep learning based methods were specifically designed for off-road freespace detection due to the lack of an off-road dataset and benchmark. In this paper, we present the ORFD dataset, which, to our knowledge, is the first off-road freespace detection dataset. The dataset was collected in different scenes (woodland, farmland, grassland and countryside), different weather conditions (sunny, rainy, foggy and snowy) and different light conditions (bright light, daylight, twilight, darkness), and contains 12,198 LiDAR point cloud and RGB image pairs in total, with the traversable area, non-traversable area and unreachable area annotated in detail. We propose a novel network named OFF-Net, which unifies a Transformer architecture to aggregate local and global information and thus meets the large-receptive-field requirement of the freespace detection task. We also propose a cross-attention module to dynamically fuse LiDAR and RGB image information for accurate off-road freespace detection.
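
The cross-attention fusion described above is implemented by `crossAttentionModule` and the stage-1 fusion in `models/transformer_models/backbones/transformer.py`. The snippet below is a minimal, self-contained sketch of that gating; the class name `CrossAttentionFusion`, the token count and the embedding size are illustrative placeholders, not values taken from the repository.

```python
# Minimal sketch of OFF-Net's cross-attention gating between RGB tokens and
# surface-normal (LiDAR-derived) tokens, mirroring crossAttentionModule and the
# stage-1 fusion in models/transformer_models/backbones/transformer.py.
import torch
import torch.nn as nn


class CrossAttentionFusion(nn.Module):
    """Illustrative stand-in for the repository's crossAttentionModule."""

    def __init__(self, dim):
        super().__init__()
        self.fc = nn.Linear(dim, dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, rgb_tokens, normal_tokens):
        # One shared gate in (0, 1), computed from the sum of both modalities.
        gate = self.sigmoid(self.fc(rgb_tokens + normal_tokens))
        # Residual re-weighting: RGB features are emphasized where the gate is
        # high, normal features where it is low; the branches are then summed
        # before entering the next encoder stage.
        rgb_tokens = gate * rgb_tokens + rgb_tokens
        normal_tokens = (1 - gate) * normal_tokens + normal_tokens
        return rgb_tokens + normal_tokens


if __name__ == "__main__":
    fuse = CrossAttentionFusion(dim=64)
    rgb = torch.randn(2, 176 * 320, 64)   # (batch, tokens, channels) from the RGB branch
    nrm = torch.randn(2, 176 * 320, 64)   # tokens from the surface-normal branch
    print(fuse(rgb, nrm).shape)           # torch.Size([2, 56320, 64])
```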

demo 1

demo 2

demo 3

## Requirements
- python 3.6
- pytorch 1.7
- pip install mmcv, mmsegmentation
- pip install mmcv-full==1.3.16 -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html

## Pretrained models
The pretrained models of our OFF-Net trained on the ORFD dataset can be downloaded [here](https://drive.google.com/drive/folders/1lnm2M1HEkVs9W3-FSEX3ddE9GYz4rqCU).

## Prepare data
The proposed off-road freespace detection dataset ORFD is available via [torrent](https://academictorrents.com/collection/traversability-analysis-datasets), [BaiduYun](https://pan.baidu.com/s/1GCy2unPzs4K7_PBgZLPO4w?pwd=1234) (code: 1234, about 30 GB), and [Google Drive](https://github.com/chaytonmin/Off-Road-Freespace-Detection/issues/6#issuecomment-1347324143). Extract and organize it as follows:

```
|-- datasets
|   |-- ORFD
|   |   |-- training
|   |   |   |-- sequence
|   |   |   |   |-- calib
|   |   |   |   |-- sparse_depth
|   |   |   |   |-- dense_depth
|   |   |   |   |-- lidar_data
|   |   |   |   |-- image_data
|   |   |   |   |-- gt_image
|   |   |   ......
|   |   |-- validation
|   |   ......
|   |   |-- testing
|   |   ......
```

The LiDAR coordinate system is x facing left, y facing forward, and z facing up.

Generate the dense depth maps with `python road_hesai40_process.py`.

## Usage
### Demo
```
bash ./scripts/demo.sh
```
A minimal sketch of the calibration and surface-normal preprocessing used by the demo is given after the references below.

### Training
```
bash ./scripts/train.sh
```
### Testing
```
bash ./scripts/test.sh
```

## License
Our code and dataset are released under the Apache 2.0 license.

## Acknowledgement
This repository is based on [SNE-RoadSeg](https://github.com/hlwang1124/SNE-RoadSeg) [1] and [SegFormer](https://github.com/NVlabs/SegFormer) [2].

## References
[1] Fan, Rui, et al. "SNE-RoadSeg: Incorporating surface normal information into semantic segmentation for accurate freespace detection." *European Conference on Computer Vision*. Springer, Cham, 2020.

[2] Xie, Enze, et al. "SegFormer: Simple and efficient design for semantic segmentation with transformers." *Advances in Neural Information Processing Systems* 34 (2021).
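
For reference, here is a minimal sketch of the per-frame preprocessing performed by `demo.py` and `data/ORFD_dataset.py`: the camera intrinsics are read from a calibration file and a dense 16-bit depth map is converted into surface normals with `models/sne_model.SNE`. The calibration path points at one of the bundled example files; the depth-map path is a placeholder.

```python
# Hedged preprocessing sketch (paths other than the calib file are placeholders).
import cv2
import numpy as np
import torch

from models.sne_model import SNE


def read_calib_file(filepath):
    """Parse 'key: v1 v2 ...' lines into float arrays (format of examples/*/calib/*.txt)."""
    data = {}
    with open(filepath, 'r') as f:
        for line in f.readlines():
            key, value = line.split(':', 1)
            try:
                data[key] = np.array([float(x) for x in value.split()])
            except ValueError:
                pass
    return data


calib = read_calib_file('examples/y0613_1242/calib/1623721491895.txt')
K = np.reshape(calib['cam_K'], (3, 3))
K[1, 2] -= 8  # principal point shift used in demo.py (720-row image cropped to 704 rows)
cam_param = torch.tensor(K, dtype=torch.float32)

# Placeholder path: a dense depth map produced by road_hesai40_process.py.
depth = cv2.imread('datasets/ORFD/training/sequence/dense_depth/frame.png', cv2.IMREAD_ANYDEPTH)
normals = SNE()(torch.tensor(depth.astype(np.float32) / 256), cam_param)  # shape (3, H, W)
```

The resulting normal map is what `demo.py` feeds to `model.netRoadSeg` alongside the RGB image, and what `ORFD_dataset.py` returns as `another_image` when `use_sne` is enabled.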
================================================ FILE: data/ORFD_dataset.py ================================================ import os.path import torchvision.transforms as transforms import torch import cv2 import numpy as np import glob from data.base_dataset import BaseDataset from models.sne_model import SNE class orfdCalibInfo(): """ Read calibration files in the ORFD dataset, we need to use the intrinsic parameter """ def __init__(self, filepath): """ Args: filepath ([str]): calibration file path (AAA.txt) """ self.data = self._load_calib(filepath) def get_cam_param(self): """ Returns: [numpy.array]: intrinsic parameter """ return self.data['K'] def _load_calib(self, filepath): rawdata = self._read_calib_file(filepath) data = {} K = np.reshape(rawdata['cam_K'], (3,3)) data['K'] = K return data def _read_calib_file(self, filepath): """Read in a calibration file and parse into a dictionary.""" data = {} with open(filepath, 'r') as f: for line in f.readlines(): key, value = line.split(':', 1) # The only non-float values in these files are dates, which # we don't care about anyway try: data[key] = np.array([float(x) for x in value.split()]) except ValueError: pass return data class orfddataset(BaseDataset): """dataloader for ORFD dataset""" @staticmethod def modify_commandline_options(parser, is_train): return parser def initialize(self, opt): self.opt = opt self.batch_size = opt.batch_size self.root = opt.dataroot # path for the dataset self.use_sne = opt.use_sne self.num_labels = 2 self.use_size = (opt.useWidth, opt.useHeight) if self.use_sne: self.sne_model = SNE() if opt.phase == "train": self.image_list = sorted(glob.glob(os.path.join(self.root, 'training', '*','image_data', '*.png'))) # shape: 1280*720 elif opt.phase == "val": self.image_list = sorted(glob.glob(os.path.join(self.root, 'validation', '*','image_data', '*.png'))) else: self.image_list = sorted(glob.glob(os.path.join(self.root, 'testing', '*','image_data', '*.png'))) def __getitem__(self, index): useDir = "/".join(self.image_list[index].split('/')[:-2]) name = self.image_list[index].split('/')[-1] rgb_image = cv2.cvtColor(cv2.imread(os.path.join(useDir, 'image_data', name)), cv2.COLOR_BGR2RGB) depth_image = cv2.imread(os.path.join(useDir, 'dense_depth', name), cv2.IMREAD_ANYDEPTH) oriHeight, oriWidth, _ = rgb_image.shape if self.opt.phase == 'test' and self.opt.no_label: # Since we have no gt label (e.g., kitti submission), we generate pseudo gt labels to # avoid destroying the code architecture label = np.zeros((oriHeight, oriWidth), dtype=np.uint8) else: #label_image = cv2.cvtColor(cv2.imread(os.path.join(useDir, 'gt_image_2', name[:-10]+'road_'+name[-10:])), cv2.COLOR_BGR2RGB) label_img_name = name.split('.')[0]+"_fillcolor.png" label_dir = os.path.join(useDir, 'gt_image', label_img_name) label_image = cv2.cvtColor(cv2.imread(label_dir), cv2.COLOR_BGR2RGB) label = np.zeros((oriHeight, oriWidth), dtype=np.uint8) label[label_image[:,:,2] > 200] = 1 # resize image to enable sizes divide 32 rgb_image = cv2.resize(rgb_image, self.use_size) label = cv2.resize(label, (int(self.use_size[0]/4), int(self.use_size[1]/4)), interpolation=cv2.INTER_NEAREST) # another_image will be normal when using SNE, otherwise will be depth if self.use_sne: calib = orfdCalibInfo(os.path.join(useDir, 'calib', name.split('.')[0] +'.txt')) camParam = torch.tensor(calib.get_cam_param(), dtype=torch.float32) camParam[1, 2] = camParam[1, 2] - 8 # 720-16=704 #normal = self.sne_model(torch.tensor(depth_image.astype(np.float32)/1000), camParam) normal 
= self.sne_model(torch.tensor(depth_image.astype(np.float32)/256), camParam) another_image = normal.cpu().numpy() another_image = np.transpose(another_image, [1, 2, 0]) another_image = cv2.resize(another_image, self.use_size) else: another_image = depth_image.astype(np.float32)/65535 another_image = cv2.resize(another_image, self.use_size) another_image = another_image[:,:,np.newaxis] label[label > 0] = 1 rgb_image = rgb_image.astype(np.float32) / 255 rgb_image = transforms.ToTensor()(rgb_image) another_image = transforms.ToTensor()(another_image) label = torch.from_numpy(label) label = label.type(torch.LongTensor) # return a dictionary containing useful information # input rgb images, another images and labels for training; # 'path': image name for saving predictions # 'oriSize': original image size for evaluating and saving predictions return {'rgb_image': rgb_image, 'another_image': another_image, 'label': label, 'path': name, 'oriSize': (oriWidth, oriHeight)} def __len__(self): return len(self.image_list) def name(self): return 'orfd' ================================================ FILE: data/__init__.py ================================================ import importlib import torch.utils.data from data.base_data_loader import BaseDataLoader from data.base_dataset import BaseDataset import numpy def find_dataset_using_name(dataset_name): # Given the option --dataset [datasetname], # the file "data/datasetname_dataset.py" # will be imported. dataset_filename = "data." + dataset_name + "_dataset" datasetlib = importlib.import_module(dataset_filename) # In the file, the class called DatasetNameDataset() will # be instantiated. It has to be a subclass of BaseDataset, # and it is case-insensitive. dataset = None target_dataset_name = dataset_name.replace('_', '') + 'dataset' for name, cls in datasetlib.__dict__.items(): if name.lower() == target_dataset_name.lower() \ and issubclass(cls, BaseDataset): dataset = cls if dataset is None: print("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." 
% (dataset_filename, target_dataset_name)) exit(0) return dataset def get_option_setter(dataset_name): dataset_class = find_dataset_using_name(dataset_name) return dataset_class.modify_commandline_options def create_dataset(opt): dataset = find_dataset_using_name(opt.dataset) instance = dataset() instance.initialize(opt) print("dataset [%s] was created" % (instance.name())) return instance def CreateDataLoader(opt): data_loader = CustomDatasetDataLoader() data_loader.initialize(opt) return data_loader # Wrapper class of Dataset class that performs # multi-threaded data loading class CustomDatasetDataLoader(BaseDataLoader): def name(self): return 'CustomDatasetDataLoader' def initialize(self, opt): BaseDataLoader.initialize(self, opt) self.dataset = create_dataset(opt) ''' self.dataloader = torch.utils.data.DataLoader( self.dataset, batch_size=opt.batch_size, shuffle=not opt.serial_batches, num_workers=int(opt.num_threads), worker_init_fn=lambda worker_id: numpy.random.seed(opt.seed + worker_id)) ''' self.dataloader = torch.utils.data.DataLoader( self.dataset, batch_size=opt.batch_size, shuffle=not opt.serial_batches, num_workers= 0, worker_init_fn=lambda worker_id: numpy.random.seed(opt.seed + worker_id)) def load_data(self): return self def __len__(self): return len(self.dataset) def __iter__(self): for i, data in enumerate(self.dataloader): yield data ================================================ FILE: data/base_data_loader.py ================================================ class BaseDataLoader(): def __init__(self): pass def initialize(self, opt): self.opt = opt pass def load_data(): return None ================================================ FILE: data/base_dataset.py ================================================ import torch.utils.data as data class BaseDataset(data.Dataset): def __init__(self): super(BaseDataset, self).__init__() def name(self): return 'BaseDataset' @staticmethod def modify_commandline_options(parser, is_train): return parser def initialize(self, opt): pass def __len__(self): return 0 ================================================ FILE: datasets/palette.txt ================================================ 0 0 0 0 128 0 128 0 0 128 128 0 0 0 128 128 0 128 0 128 128 128 128 128 64 0 0 192 0 0 64 128 0 192 128 0 64 0 128 192 0 128 64 128 128 192 128 128 0 64 0 128 64 0 0 192 0 128 192 0 0 64 128 128 64 128 0 192 128 128 192 128 64 64 0 192 64 0 64 192 0 192 192 0 64 64 128 192 64 128 64 192 128 192 192 128 0 0 64 128 0 64 0 128 64 128 128 64 0 0 192 128 0 192 0 128 192 128 128 192 64 0 64 192 0 64 64 128 64 192 128 64 64 0 192 192 0 192 64 128 192 192 128 192 0 64 64 128 64 64 0 192 64 128 192 64 0 64 192 128 64 192 0 192 192 128 192 192 64 64 64 192 64 64 64 192 64 192 192 64 64 64 192 192 64 192 64 192 192 192 192 192 32 0 0 160 0 0 32 128 0 160 128 0 32 0 128 160 0 128 32 128 128 160 128 128 96 0 0 224 0 0 96 128 0 224 128 0 96 0 128 224 0 128 96 128 128 224 128 128 32 64 0 160 64 0 32 192 0 160 192 0 32 64 128 160 64 128 32 192 128 160 192 128 96 64 0 224 64 0 96 192 0 224 192 0 96 64 128 224 64 128 96 192 128 224 192 128 32 0 64 160 0 64 32 128 64 160 128 64 32 0 192 160 0 192 32 128 192 160 128 192 96 0 64 224 0 64 96 128 64 224 128 64 96 0 192 224 0 192 96 128 192 224 128 192 32 64 64 160 64 64 32 192 64 160 192 64 32 64 192 160 64 192 32 192 192 160 192 192 96 64 64 224 64 64 96 192 64 224 192 64 96 64 192 224 64 192 96 192 192 224 192 192 0 32 0 128 32 0 0 160 0 128 160 0 0 32 128 128 32 128 0 160 128 128 160 128 64 32 0 192 32 0 64 160 0 192 160 0 64 
32 128 192 32 128 64 160 128 192 160 128 0 96 0 128 96 0 0 224 0 128 224 0 0 96 128 128 96 128 0 224 128 128 224 128 64 96 0 192 96 0 64 224 0 192 224 0 64 96 128 192 96 128 64 224 128 192 224 128 0 32 64 128 32 64 0 160 64 128 160 64 0 32 192 128 32 192 0 160 192 128 160 192 64 32 64 192 32 64 64 160 64 192 160 64 64 32 192 192 32 192 64 160 192 192 160 192 0 96 64 128 96 64 0 224 64 128 224 64 0 96 192 128 96 192 0 224 192 128 224 192 64 96 64 192 96 64 64 224 64 192 224 64 64 96 192 192 96 192 64 224 192 192 224 192 32 32 0 160 32 0 32 160 0 160 160 0 32 32 128 160 32 128 32 160 128 160 160 128 96 32 0 224 32 0 96 160 0 224 160 0 96 32 128 224 32 128 96 160 128 224 160 128 32 96 0 160 96 0 32 224 0 160 224 0 32 96 128 160 96 128 32 224 128 160 224 128 96 96 0 224 96 0 96 224 0 224 224 0 96 96 128 224 96 128 96 224 128 224 224 128 32 32 64 160 32 64 32 160 64 160 160 64 32 32 192 160 32 192 32 160 192 160 160 192 96 32 64 224 32 64 96 160 64 224 160 64 96 32 192 224 32 192 96 160 192 224 160 192 32 96 64 160 96 64 32 224 64 160 224 64 32 96 192 160 96 192 32 224 192 160 224 192 96 96 64 224 96 64 96 224 64 224 224 64 96 96 192 224 96 192 96 224 192 224 224 192 ================================================ FILE: demo.py ================================================ import os from options.test_options import TestOptions from models import create_model from util.util import tensor2labelim, tensor2confidencemap from models.sne_model import SNE import torchvision.transforms as transforms import torch import numpy as np import cv2 import copy import tqdm import glob class dataset(): def __init__(self): self.num_labels = 2 def load_calib(filepath): rawdata = read_calib_file(filepath) K = np.reshape(rawdata['cam_K'], (3,3)) K[1, 2] = K[1, 2] - 8 # 720-16=704 return K def read_calib_file(filepath): """Read in a calibration file and parse into a dictionary.""" data = {} with open(filepath, 'r') as f: for line in f.readlines(): key, value = line.split(':', 1) # The only non-float values in these files are dates, which # we don't care about anyway try: data[key] = np.array([float(x) for x in value.split()]) except ValueError: pass return data if __name__ == '__main__': opt = TestOptions().parse() opt.num_threads = 1 opt.batch_size = 1 opt.serial_batches = True # no shuffle opt.isTrain = False example_dataset = dataset() model = create_model(opt, example_dataset) model.setup(opt) model.eval() # if you want to use your own data, please modify seq_name, image_data, dense_depth, calib and use_size correspondingly. 
use_size = (1280, 704) root_dir = 'examples' seq_name = 'y0613_1242' # Need to be changed save_dir = os.path.join(root_dir, seq_name, 'results') os.makedirs(save_dir, exist_ok=True) img_list = glob.glob(os.path.join(root_dir, seq_name, 'image_data', '*.png')) for img_list_i in img_list: img_name = img_list_i.split('/')[-1].split('.')[0] print('img_name:',img_name) rgb_image = cv2.cvtColor(cv2.imread(os.path.join(root_dir, seq_name, 'image_data', img_name+'.png')), cv2.COLOR_BGR2RGB) depth_image = cv2.imread(os.path.join(root_dir, seq_name, 'dense_depth', img_name+'.png'), cv2.IMREAD_ANYDEPTH) oriHeight, oriWidth, _ = rgb_image.shape oriSize = (oriWidth, oriHeight) rgb_image_save = copy.copy(rgb_image) rgb_image_save_ = copy.copy(rgb_image) # resize image to enable sizes divide 32 rgb_image = cv2.resize(rgb_image, use_size) rgb_image = rgb_image.astype(np.float32) / 255 # compute normal using SNE sne_model = SNE() calib_dir = os.path.join(root_dir, seq_name, 'calib', img_name+'.txt') camParam = load_calib(calib_dir) normal = sne_model(torch.tensor(depth_image.astype(np.float32)/256), camParam) normal_image = normal.cpu().numpy() normal_image = np.transpose(normal_image, [1, 2, 0]) cv2.imwrite(os.path.join(os.path.join(save_dir, img_name+'_normal.png')), cv2.cvtColor(255*(1+normal_image)/2, cv2.COLOR_RGB2BGR)) normal_image_save = cv2.cvtColor(255*(1+normal_image)/2, cv2.COLOR_RGB2BGR) normal_image = cv2.resize(normal_image, use_size) rgb_image = transforms.ToTensor()(rgb_image).unsqueeze(dim=0) normal_image = transforms.ToTensor()(normal_image).unsqueeze(dim=0) with torch.no_grad(): pred = model.netRoadSeg(rgb_image, normal_image) palet_file = 'datasets/palette.txt' impalette = list(np.genfromtxt(palet_file, dtype=np.uint8).reshape(3*256)) pred_img = tensor2labelim(pred, impalette) pred_img = cv2.resize(pred_img, oriSize) prob_map = tensor2confidencemap(pred) prob_map = cv2.resize(prob_map, oriSize) cv2.imwrite(os.path.join(os.path.join(save_dir, img_name+'_pred.png')), pred_img) cv2.imwrite(os.path.join(os.path.join(save_dir, img_name+'_probmap.png')), prob_map) rgb_image_save = rgb_image_save.transpose(2,0,1) pred_img = pred_img.transpose(2,0,1) inds = prob_map>128 rgb_image_save[:,inds] = pred_img[:,inds] rgb_image_save = rgb_image_save.transpose(1,2,0) cv2.imwrite(os.path.join(os.path.join(save_dir, img_name+'_mask.png')), rgb_image_save) img_cat = np.concatenate((rgb_image_save_, normal_image_save, rgb_image_save),axis=1) img_cat = cv2.resize(img_cat, (int(img_cat.shape[1]*0.5),int(img_cat.shape[0]*0.5))) cv2.imwrite(os.path.join(os.path.join(root_dir, seq_name, img_name+'.png')), img_cat) #cv2.imshow('img_cat',img_cat) #cv2.waitKey(0) #cv2.destroyAllWindows() print('Done!') ================================================ FILE: examples/y0613_1242/calib/1623721491895.txt ================================================ cam_K: 1487.752825 0.0 625.011716 0.0 1471.063437 376.483801 0.0 0.0 1.0 cam_RT: 0.9990242442017794 -0.03870134489718302 -0.02127828471028577 0.0019129833672195673 -0.025608957095939373 -0.1150890189011354 -0.9930250243799661 0.1408030241727829 0.03598250704885683 0.9926009891348704 -0.11596782120465826 0.08308956027030945 0.0 0.0 0.0 1.0 lidar_R: -0.996479 -0.0834525 0.00805512 0.0837999 -0.994374 0.0647862 0.00260323 0.0652331 0.997867 lidar_T: 0 0 0 ================================================ FILE: examples/y0613_1242/calib/1623721491991.txt ================================================ cam_K: 1487.752825 0.0 625.011716 0.0 1471.063437 376.483801 0.0 0.0 1.0 
cam_RT: 0.9990242442017794 -0.03870134489718302 -0.02127828471028577 0.0019129833672195673 -0.025608957095939373 -0.1150890189011354 -0.9930250243799661 0.1408030241727829 0.03598250704885683 0.9926009891348704 -0.11596782120465826 0.08308956027030945 0.0 0.0 0.0 1.0 lidar_R: -0.996479 -0.0834525 0.00805512 0.0837999 -0.994374 0.0647862 0.00260323 0.0652331 0.997867 lidar_T: 0 0 0 ================================================ FILE: examples/y0613_1242/calib/1623721492091.txt ================================================ cam_K: 1487.752825 0.0 625.011716 0.0 1471.063437 376.483801 0.0 0.0 1.0 cam_RT: 0.9990242442017794 -0.03870134489718302 -0.02127828471028577 0.0019129833672195673 -0.025608957095939373 -0.1150890189011354 -0.9930250243799661 0.1408030241727829 0.03598250704885683 0.9926009891348704 -0.11596782120465826 0.08308956027030945 0.0 0.0 0.0 1.0 lidar_R: -0.996479 -0.0834525 0.00805512 0.0837999 -0.994374 0.0647862 0.00260323 0.0652331 0.997867 lidar_T: 0 0 0 ================================================ FILE: examples/y0613_1242/calib/1623721492191.txt ================================================ cam_K: 1487.752825 0.0 625.011716 0.0 1471.063437 376.483801 0.0 0.0 1.0 cam_RT: 0.9990242442017794 -0.03870134489718302 -0.02127828471028577 0.0019129833672195673 -0.025608957095939373 -0.1150890189011354 -0.9930250243799661 0.1408030241727829 0.03598250704885683 0.9926009891348704 -0.11596782120465826 0.08308956027030945 0.0 0.0 0.0 1.0 lidar_R: -0.996479 -0.0834525 0.00805512 0.0837999 -0.994374 0.0647862 0.00260323 0.0652331 0.997867 lidar_T: 0 0 0 ================================================ FILE: examples/y0613_1242/calib/1623721492290.txt ================================================ cam_K: 1487.752825 0.0 625.011716 0.0 1471.063437 376.483801 0.0 0.0 1.0 cam_RT: 0.9990242442017794 -0.03870134489718302 -0.02127828471028577 0.0019129833672195673 -0.025608957095939373 -0.1150890189011354 -0.9930250243799661 0.1408030241727829 0.03598250704885683 0.9926009891348704 -0.11596782120465826 0.08308956027030945 0.0 0.0 0.0 1.0 lidar_R: -0.996479 -0.0834525 0.00805512 0.0837999 -0.994374 0.0647862 0.00260323 0.0652331 0.997867 lidar_T: 0 0 0 ================================================ FILE: examples/y0613_1242/calib/1623721492790.txt ================================================ cam_K: 1487.752825 0.0 625.011716 0.0 1471.063437 376.483801 0.0 0.0 1.0 cam_RT: 0.9990242442017794 -0.03870134489718302 -0.02127828471028577 0.0019129833672195673 -0.025608957095939373 -0.1150890189011354 -0.9930250243799661 0.1408030241727829 0.03598250704885683 0.9926009891348704 -0.11596782120465826 0.08308956027030945 0.0 0.0 0.0 1.0 lidar_R: -0.996479 -0.0834525 0.00805512 0.0837999 -0.994374 0.0647862 0.00260323 0.0652331 0.997867 lidar_T: 0 0 0 ================================================ FILE: models/__init__.py ================================================ import importlib from models.base_model import BaseModel def find_model_using_name(model_name): # Given the option --model [modelname], # the file "models/modelname_model.py" # will be imported. model_filename = "models." + model_name + "_model" modellib = importlib.import_module(model_filename) # In the file, the class called ModelNameModel() will # be instantiated. It has to be a subclass of BaseModel, # and it is case-insensitive. 
model = None target_model_name = model_name.replace('_', '') + 'model' for name, cls in modellib.__dict__.items(): if name.lower() == target_model_name.lower() \ and issubclass(cls, BaseModel): model = cls if model is None: print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) exit(0) return model def get_option_setter(model_name): model_class = find_model_using_name(model_name) return model_class.modify_commandline_options def create_model(opt, dataset): model = find_model_using_name(opt.model) instance = model() instance.initialize(opt, dataset) print("model [%s] was created" % (instance.name())) return instance ================================================ FILE: models/base_model.py ================================================ import os import torch from collections import OrderedDict from . import loss class BaseModel(): # modify parser to add command line options, # and also change the default values if needed @staticmethod def modify_commandline_options(parser, is_train): return parser def name(self): return 'BaseModel' def initialize(self, opt): self.opt = opt self.gpu_ids = opt.gpu_ids self.isTrain = opt.isTrain self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) self.loss_names = [] self.model_names = [] self.visual_names = [] self.image_names = [] self.image_oriSize = [] def set_input(self, input): self.input = input def forward(self): pass # load and print networks; create schedulers def setup(self, opt, parser=None): if self.isTrain: self.schedulers = [loss.get_scheduler(optimizer, opt) for optimizer in self.optimizers] if not self.isTrain or opt.continue_train: self.load_networks(opt.epoch) self.print_networks(opt.verbose) # make models eval mode during test time def eval(self): for name in self.model_names: if isinstance(name, str): net = getattr(self, 'net' + name) net.eval() def train(self): for name in self.model_names: if isinstance(name, str): net = getattr(self, 'net' + name) net.train() # used in test time, wrapping `forward` in no_grad() so we don't save # intermediate steps for backprop def test(self): with torch.no_grad(): self.forward() # get image names def get_image_names(self): return self.image_names # get image original size def get_image_oriSize(self): return self.image_oriSize def optimize_parameters(self): pass # update learning rate (called once every epoch) def update_learning_rate(self): for scheduler in self.schedulers: scheduler.step() lr = self.optimizers[0].param_groups[0]['lr'] print('learning rate = %.7f' % lr) # return visualization images. train.py will display these images in tensorboardX def get_current_visuals(self): visual_ret = OrderedDict() for name in self.visual_names: if isinstance(name, str): visual_ret[name] = getattr(self, name) return visual_ret # return traning losses/errors. train.py will print out these errors as debugging information def get_current_losses(self): errors_ret = OrderedDict() for name in self.loss_names: if isinstance(name, str): # float(...) 
works for both scalar tensor and float number errors_ret[name] = float(getattr(self, 'loss_' + name)) return errors_ret # save models to the disk def save_networks(self, epoch): for name in self.model_names: if isinstance(name, str): save_filename = '%s_net_%s.pth' % (epoch, name) save_path = os.path.join(self.save_dir, save_filename) net = getattr(self, 'net' + name) if len(self.gpu_ids) > 0 and torch.cuda.is_available(): torch.save(net.module.cpu().state_dict(), save_path) net.cuda(self.gpu_ids[0]) else: torch.save(net.cpu().state_dict(), save_path) def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0): key = keys[i] if i + 1 == len(keys): # at the end, pointing to a parameter/buffer if module.__class__.__name__.startswith('InstanceNorm') and \ (key == 'running_mean' or key == 'running_var'): if getattr(module, key) is None: state_dict.pop('.'.join(keys)) if module.__class__.__name__.startswith('InstanceNorm') and \ (key == 'num_batches_tracked'): state_dict.pop('.'.join(keys)) else: self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1) # load models from the disk def load_networks(self, epoch): for name in self.model_names: if isinstance(name, str): load_filename = '%s_net_%s.pth' % (epoch, name) load_path = os.path.join(self.save_dir, load_filename) net = getattr(self, 'net' + name) if isinstance(net, torch.nn.DataParallel): net = net.module print('loading the model from %s' % load_path) # if you are using PyTorch newer than 0.4 (e.g., built from # GitHub source), you can remove str() on self.device state_dict = torch.load(load_path, map_location=str(self.device)) if hasattr(state_dict, '_metadata'): del state_dict._metadata for key in list(state_dict.keys()): self.__patch_instance_norm_state_dict(state_dict, net, key.split('.')) net.load_state_dict(state_dict) # print network information def print_networks(self, verbose): print('---------- Networks initialized -------------') for name in self.model_names: if isinstance(name, str): net = getattr(self, 'net' + name) num_params = 0 for param in net.parameters(): num_params += param.numel() if verbose: print(net) print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6)) print('-----------------------------------------------') # set requies_grad=Fasle to avoid computation def set_requires_grad(self, nets, requires_grad=False): if not isinstance(nets, list): nets = [nets] for net in nets: if net is not None: for param in net.parameters(): param.requires_grad = requires_grad ================================================ FILE: models/loss.py ================================================ import torch import torch.nn as nn from torch.nn import init import torchvision from torch.optim import lr_scheduler import torch.nn.functional as F def get_scheduler(optimizer, opt): if opt.lr_policy == 'lambda': lambda_rule = lambda epoch: opt.lr_gamma ** ((epoch+1) // opt.lr_decay_epochs) scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) elif opt.lr_policy == 'step': scheduler = lr_scheduler.StepLR(optimizer,step_size=opt.lr_decay_iters, gamma=0.1) else: return NotImplementedError('learning rate policy [%s] is not implemented', opt.lr_policy) return scheduler class SegmantationLoss(nn.Module): def __init__(self, class_weights=None): super(SegmantationLoss, self).__init__() self.loss = nn.CrossEntropyLoss(weight=class_weights) def __call__(self, output, target, pixel_average=True): if pixel_average: return self.loss(output, target) #/ target.data.sum() else: 
return self.loss(output, target) ================================================ FILE: models/roadseg_model.py ================================================ import torch from .base_model import BaseModel from . import loss from .transformer_models.backbones import transformer import numpy as np import os class RoadSegModel(BaseModel): def name(self): return 'RoadSegModel' @staticmethod def modify_commandline_options(parser, is_train=True): # changing the default values if is_train: parser.add_argument('--lambda_L1', type=float, default=100.0, help='weight for L1 loss') return parser def initialize(self, opt, dataset): BaseModel.initialize(self, opt) self.isTrain = opt.isTrain # specify the training losses you want to print out. The program will call base_model.get_current_losses self.loss_names = ['segmentation'] # specify the images you want to save/display. The program will call base_model.get_current_visuals self.visual_names = ['rgb_image', 'another_image', 'label', 'output'] # specify the models you want to save to the disk. The program will call base_model.save_networks and base_model.load_networks self.model_names = ['RoadSeg'] # load/define networks # mix_transformer self.netRoadSeg = transformer.define_RoadSeg(dataset.num_labels, use_sne=opt.use_sne, init_type=opt.init_type, init_gain= opt.init_gain, gpu_ids= self.gpu_ids) # define loss functions self.criterionSegmentation = loss.SegmantationLoss(class_weights=None).to(self.device) if self.isTrain: # initialize optimizers self.optimizers = [] self.optimizer_RoadSeg = torch.optim.SGD(self.netRoadSeg.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay) #self.optimizer_RoadSeg = torch.optim.Adam(self.netRoadSeg.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) self.optimizers.append(self.optimizer_RoadSeg) self.set_requires_grad(self.netRoadSeg, True) def set_input(self, input): self.rgb_image = input['rgb_image'].to(self.device) self.another_image = input['another_image'].to(self.device) self.label = input['label'].to(self.device) self.image_names = input['path'] self.image_oriSize = input['oriSize'] def forward(self): self.output = self.netRoadSeg(self.rgb_image, self.another_image) def get_loss(self): self.loss_segmentation = self.criterionSegmentation(self.output, self.label) def backward(self): self.loss_segmentation.backward() def optimize_parameters(self): self.forward() self.optimizer_RoadSeg.zero_grad() self.get_loss() self.backward() self.optimizer_RoadSeg.step() ================================================ FILE: models/sne_model.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F class SNE(nn.Module): """SNE takes depth and camera intrinsic parameters as input, and outputs normal estimations. 
""" def __init__(self): super(SNE, self).__init__() def forward(self, depth, camParam): h,w = depth.size() v_map, u_map = torch.meshgrid(torch.arange(h), torch.arange(w)) v_map = v_map.type(torch.float32) u_map = u_map.type(torch.float32) Z = depth # h, w Y = Z.mul((v_map - camParam[1,2])) / camParam[0,0] # h, w X = Z.mul((u_map - camParam[0,2])) / camParam[0,0] # h, w # half part #Z[Y <= 0] = 0 #Y[Y <= 0] = 0 Z[torch.isnan(Z)] = 0 D = torch.div(torch.ones(h, w), Z) # h, w Gx = torch.tensor([[0,0,0],[-1,0,1],[0,0,0]], dtype=torch.float32) Gy = torch.tensor([[0,-1,0],[0,0,0],[0,1,0]], dtype=torch.float32) Gu = F.conv2d(D.view(1,1,h,w), Gx.view(1,1,3,3), padding=1) Gv = F.conv2d(D.view(1,1,h,w), Gy.view(1,1,3,3), padding=1) nx_t = Gu * camParam[0,0] # 1, 1, h, w ny_t = Gv * camParam[1,1] # 1, 1, h, w phi = torch.atan(torch.div(ny_t, nx_t)) + torch.ones([1,1,h,w])*3.141592657 a = torch.cos(phi) b = torch.sin(phi) diffKernelArray = torch.tensor([[-1, 0, 0, 0, 1, 0, 0, 0, 0], [ 0,-1, 0, 0, 1, 0, 0, 0, 0], [ 0, 0,-1, 0, 1, 0, 0, 0, 0], [ 0, 0, 0,-1, 1, 0, 0, 0, 0], [ 0, 0, 0, 0, 1,-1, 0, 0, 0], [ 0, 0, 0, 0, 1, 0,-1, 0, 0], [ 0, 0, 0, 0, 1, 0, 0,-1, 0], [ 0, 0, 0, 0, 1, 0, 0, 0,-1]], dtype=torch.float32) sum_nx = torch.zeros((1,1,h,w), dtype=torch.float32) sum_ny = torch.zeros((1,1,h,w), dtype=torch.float32) sum_nz = torch.zeros((1,1,h,w), dtype=torch.float32) for i in range(8): diffKernel = diffKernelArray[i].view(1,1,3,3) X_d = F.conv2d(X.view(1,1,h,w), diffKernel, padding=1) Y_d = F.conv2d(Y.view(1,1,h,w), diffKernel, padding=1) Z_d = F.conv2d(Z.view(1,1,h,w), diffKernel, padding=1) nz_i = torch.div((torch.mul(nx_t, X_d) + torch.mul(ny_t, Y_d)), Z_d) norm = torch.sqrt(torch.mul(nx_t, nx_t) + torch.mul(ny_t, ny_t) + torch.mul(nz_i, nz_i)) nx_t_i = torch.div(nx_t, norm) ny_t_i = torch.div(ny_t, norm) nz_t_i = torch.div(nz_i, norm) nx_t_i[torch.isnan(nx_t_i)] = 0 ny_t_i[torch.isnan(ny_t_i)] = 0 nz_t_i[torch.isnan(nz_t_i)] = 0 sum_nx = sum_nx + nx_t_i sum_ny = sum_ny + ny_t_i sum_nz = sum_nz + nz_t_i theta = -torch.atan(torch.div((torch.mul(sum_nx, a) + torch.mul(sum_ny, b)), sum_nz)) nx = torch.mul(torch.sin(theta), torch.cos(phi)) ny = torch.mul(torch.sin(theta), torch.sin(phi)) nz = torch.cos(theta) nx[torch.isnan(nz)] = 0 ny[torch.isnan(nz)] = 0 nz[torch.isnan(nz)] = -1 sign = torch.ones((1,1,h,w), dtype=torch.float32) sign[ny > 0] = -1 nx = torch.mul(nx, sign).squeeze(dim=0) ny = torch.mul(ny, sign).squeeze(dim=0) nz = torch.mul(nz, sign).squeeze(dim=0) return torch.cat([nx, ny, nz], dim=0) ================================================ FILE: models/transformer_models/__init__.py ================================================ ================================================ FILE: models/transformer_models/backbones/__init__.py ================================================ ================================================ FILE: models/transformer_models/backbones/transformer.py ================================================ # --------------------------------------------------------------- # Copyright (c) 2021, NVIDIA Corporation. All rights reserved. 
# # This work is licensed under the NVIDIA Source Code License # --------------------------------------------------------------- import torch import torch.nn as nn from torch.nn import init import torch.nn.functional as F from functools import partial from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from timm.models.registry import register_model from timm.models.vision_transformer import _cfg from mmseg.models.builder import BACKBONES from mmseg.utils import get_root_logger from mmcv.runner import load_checkpoint import math from ..decode_heads.head import Head class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.dwconv = DWConv(hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels fan_out //= m.groups m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) if m.bias is not None: m.bias.data.zero_() def forward(self, x, H, W): x = self.fc1(x) x = self.dwconv(x, H, W) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class Attention(nn.Module): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1): super().__init__() assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
self.dim = dim self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 self.q = nn.Linear(dim, dim, bias=qkv_bias) self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.sr_ratio = sr_ratio if sr_ratio > 1: self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) self.norm = nn.LayerNorm(dim) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels fan_out //= m.groups m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) if m.bias is not None: m.bias.data.zero_() def forward(self, x, H, W): B, N, C = x.shape q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) if self.sr_ratio > 1: x_ = x.permute(0, 2, 1).reshape(B, C, H, W) x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) x_ = self.norm(x_) kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) else: kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) k, v = kv[0], kv[1] attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) return x class Block(nn.Module): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1): super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels fan_out //= m.groups m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) if m.bias is not None: m.bias.data.zero_() def forward(self, x, H, W): x = x + self.drop_path(self.attn(self.norm1(x), H, W)) x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) return x class _PositionAttentionModule(nn.Module): """ Position attention module""" def __init__(self, in_channels, **kwargs): super(_PositionAttentionModule, self).__init__() self.conv_b = nn.Conv2d(in_channels, in_channels // 8, 1) self.conv_c = nn.Conv2d(in_channels, in_channels // 8, 1) self.conv_d = nn.Conv2d(in_channels, in_channels, 1) self.alpha = nn.Parameter(torch.zeros(1)) self.softmax = nn.Softmax(dim=-1) def forward(self, x): batch_size, _, height, width = x.size() feat_b = self.conv_b(x).view(batch_size, -1, height * width).permute(0, 2, 1) feat_c = self.conv_c(x).view(batch_size, -1, height * width) attention_s = self.softmax(torch.bmm(feat_b, feat_c)) feat_d = self.conv_d(x).view(batch_size, -1, height * width) feat_e = torch.bmm(feat_d, attention_s.permute(0, 2, 1)).view(batch_size, -1, height, width) out = self.alpha * feat_e + x return out class OverlapPatchEmbed(nn.Module): """ Image to Patch Embedding """ def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) self.img_size = img_size self.patch_size = patch_size self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] self.num_patches = self.H * self.W self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=(patch_size[0] // 2, patch_size[1] // 2)) self.norm = nn.LayerNorm(embed_dim) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels fan_out //= m.groups m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) if m.bias is not None: m.bias.data.zero_() def forward(self, x): x = self.proj(x) _, _, H, W = x.shape x = x.flatten(2).transpose(1, 2) x = self.norm(x) return x, H, W def init_weights(net, init_type='normal', gain=0.02): net = net def init_func(m): classname = m.__class__.__name__ if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1): if init_type == 'normal': init.normal_(m.weight.data, 0.0, gain) elif init_type == 'xavier': init.xavier_normal_(m.weight.data, gain=gain) elif init_type == 'kaiming': init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') elif init_type == 'orthogonal': init.orthogonal_(m.weight.data, gain=gain) elif init_type == 'pretrained': pass else: raise NotImplementedError('initialization method [%s] is not implemented' % init_type) if hasattr(m, 
'bias') and m.bias is not None and init_type != 'pretrained': init.constant_(m.bias.data, 0.0) elif classname.find('BatchNorm2d') != -1: init.normal_(m.weight.data, 1.0, gain) init.constant_(m.bias.data, 0.0) net.apply(init_func) def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]): if len(gpu_ids) > 0: assert(torch.cuda.is_available()) net.to(gpu_ids[0]) net = torch.nn.DataParallel(net, gpu_ids) for root_child in net.children(): for children in root_child.children(): init_weights(children, init_type, gain=init_gain) return net def define_RoadSeg(num_labels, use_sne=True, init_type='xavier', init_gain=0.02, gpu_ids=[]): net= mit_b2() # or other model mit_b1,...,mit_b5 return init_net(net, init_type, init_gain, gpu_ids) class crossAttentionModule(nn.Module): """Channel attention module""" def __init__(self, in_features, hidden_features): super(crossAttentionModule, self).__init__() self.fc = nn.Linear(in_features, hidden_features) self.sigmoid = nn.Sigmoid() def forward(self, x, y): cross_attention = self.sigmoid(self.fc(x+y)) return cross_attention class MixVisionTransformer(nn.Module): def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]): super().__init__() self.num_classes = num_classes self.depths = depths self.pam_1 = _PositionAttentionModule(embed_dims[0]) self.pam_2 = _PositionAttentionModule(embed_dims[1]) self.pam_3 = _PositionAttentionModule(embed_dims[2]) self.pam_4 = _PositionAttentionModule(embed_dims[3]) self.cam = crossAttentionModule(embed_dims[0],embed_dims[0]) # patch_embed self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans, embed_dim=embed_dims[0]) self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0], embed_dim=embed_dims[1]) self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1], embed_dim=embed_dims[2]) self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2], embed_dim=embed_dims[3]) # transformer encoder dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule cur = 0 self.block1 = nn.ModuleList([Block( dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[0]) for i in range(depths[0])]) self.norm1 = norm_layer(embed_dims[0]) cur += depths[0] self.block2 = nn.ModuleList([Block( dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[1]) for i in range(depths[1])]) self.norm2 = norm_layer(embed_dims[1]) cur += depths[1] self.block3 = nn.ModuleList([Block( dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[2]) for i in range(depths[2])]) self.norm3 = norm_layer(embed_dims[2]) cur += depths[2] self.block4 = nn.ModuleList([Block( dim=embed_dims[3], num_heads=num_heads[3], 
mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[3]) for i in range(depths[3])]) self.norm4 = norm_layer(embed_dims[3]) self.head = Head() self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels fan_out //= m.groups m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) if m.bias is not None: m.bias.data.zero_() def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) def reset_drop_path(self, drop_path_rate): dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))] cur = 0 for i in range(self.depths[0]): self.block1[i].drop_path.drop_prob = dpr[cur + i] cur += self.depths[0] for i in range(self.depths[1]): self.block2[i].drop_path.drop_prob = dpr[cur + i] cur += self.depths[1] for i in range(self.depths[2]): self.block3[i].drop_path.drop_prob = dpr[cur + i] cur += self.depths[2] for i in range(self.depths[3]): self.block4[i].drop_path.drop_prob = dpr[cur + i] def freeze_patch_emb(self): self.patch_embed1.requires_grad = False @torch.jit.ignore def no_weight_decay(self): return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better def get_classifier(self): return self.head def reset_classifier(self, num_classes, global_pool=''): self.num_classes = num_classes self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() def forward_features(self, x, y): B = x.shape[0] outs = [] # stage 1 x, H, W = self.patch_embed1(x) for i, blk in enumerate(self.block1): x = blk(x, H, W) x = self.norm1(x) y, H, W = self.patch_embed1(y) for i, blk in enumerate(self.block1): y = blk(y, H, W) y = self.norm1(y) cross_attention = self.cam(x, y) x = cross_attention*x + x y = (1-cross_attention)*y + y x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() y = y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() outs.append(x) # stage 2 x, H, W = self.patch_embed2(x+y) for i, blk in enumerate(self.block2): x = blk(x, H, W) x = self.norm2(x) x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() outs.append(x) # stage 3 x, H, W = self.patch_embed3(x) for i, blk in enumerate(self.block3): x = blk(x, H, W) x = self.norm3(x) x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() outs.append(x) # stage 4 x, H, W = self.patch_embed4(x) for i, blk in enumerate(self.block4): x = blk(x, H, W) x = self.norm4(x) x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() outs.append(x) return outs def forward(self, x, y): x = self.forward_features(x, y) x = self.head(x) return x class DWConv(nn.Module): def __init__(self, dim=768): super(DWConv, self).__init__() self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) def forward(self, x, H, W): B, N, C = x.shape x = x.transpose(1, 2).view(B, C, H, W) x = self.dwconv(x) x = x.flatten(2).transpose(1, 2) return x class mit_b0(MixVisionTransformer): def __init__(self, **kwargs): super(mit_b0, self).__init__( patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], 
mlp_ratios=[4, 4, 4, 4], qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) class mit_b1(MixVisionTransformer): def __init__(self, **kwargs): super(mit_b1, self).__init__( patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) class mit_b2(MixVisionTransformer): def __init__(self, **kwargs): super(mit_b2, self).__init__( patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) class mit_b3(MixVisionTransformer): def __init__(self, **kwargs): super(mit_b3, self).__init__( patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) class mit_b4(MixVisionTransformer): def __init__(self, **kwargs): super(mit_b4, self).__init__( patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) class mit_b5(MixVisionTransformer): def __init__(self, **kwargs): super(mit_b5, self).__init__( patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1], drop_rate=0.0, drop_path_rate=0.1) ================================================ FILE: models/transformer_models/decode_heads/__init__.py ================================================ ================================================ FILE: models/transformer_models/decode_heads/decode_head.py ================================================ from abc import ABCMeta, abstractmethod import torch import torch.nn as nn from mmcv.cnn import normal_init from mmcv.runner import auto_fp16, force_fp32 from mmseg.core import build_pixel_sampler from mmseg.ops import resize norm_cfg = dict(type='BN', requires_grad=True) class BaseDecodeHead(nn.Module, metaclass=ABCMeta): def __init__(self, in_channels = [64, 128, 320, 512], channels = 128, *, num_classes = 2, dropout_ratio=0.1, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=dict(type='ReLU'), in_index=[0, 1, 2, 3], input_transform=None, decoder_params=dict(embed_dim=256), ignore_index=255, sampler=None, align_corners=False): super(BaseDecodeHead, self).__init__() self._init_inputs(in_channels, in_index, input_transform) self.channels = channels self.num_classes = num_classes self.dropout_ratio = dropout_ratio self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.in_index = in_index self.ignore_index = ignore_index self.align_corners = align_corners if sampler is not None: self.sampler = build_pixel_sampler(sampler, context=self) else: self.sampler = None self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) if dropout_ratio > 0: self.dropout = nn.Dropout2d(dropout_ratio) else: self.dropout = None self.fp16_enabled = False def extra_repr(self): """Extra repr.""" s = f'input_transform={self.input_transform}, ' \ f'ignore_index={self.ignore_index}, ' \ 
f'align_corners={self.align_corners}' return s def _init_inputs(self, in_channels, in_index, input_transform): """Check and initialize input transforms. The in_channels, in_index and input_transform must match. Specifically, when input_transform is None, only single feature map will be selected. So in_channels and in_index must be of type int. When input_transform Args: in_channels (int|Sequence[int]): Input channels. in_index (int|Sequence[int]): Input feature index. input_transform (str|None): Transformation type of input features. Options: 'resize_concat', 'multiple_select', None. 'resize_concat': Multiple feature maps will be resize to the same size as first one and than concat together. Usually used in FCN head of HRNet. 'multiple_select': Multiple feature maps will be bundle into a list and passed into decode head. None: Only one select feature map is allowed. """ if input_transform is not None: assert input_transform in ['resize_concat', 'multiple_select'] self.input_transform = input_transform self.in_index = in_index if input_transform is not None: assert isinstance(in_channels, (list, tuple)) assert isinstance(in_index, (list, tuple)) assert len(in_channels) == len(in_index) if input_transform == 'resize_concat': self.in_channels = sum(in_channels) else: self.in_channels = in_channels else: assert isinstance(in_channels, int) assert isinstance(in_index, int) self.in_channels = in_channels def init_weights(self): """Initialize weights of classification layer.""" normal_init(self.conv_seg, mean=0, std=0.01) def _transform_inputs(self, inputs): """Transform inputs for decoder. Args: inputs (list[Tensor]): List of multi-level img features. Returns: Tensor: The transformed inputs """ if self.input_transform == 'resize_concat': inputs = [inputs[i] for i in self.in_index] upsampled_inputs = [ resize( input=x, size=inputs[0].shape[2:], mode='bilinear', align_corners=self.align_corners) for x in inputs ] inputs = torch.cat(upsampled_inputs, dim=1) elif self.input_transform == 'multiple_select': inputs = [inputs[i] for i in self.in_index] else: inputs = inputs[self.in_index] return inputs @auto_fp16() @abstractmethod def forward(self, inputs): """Placeholder of forward function.""" pass def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): """Forward function for training. Args: inputs (list[Tensor]): List of multi-level img features. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. gt_semantic_seg (Tensor): Semantic segmentation masks used if the architecture supports semantic segmentation task. train_cfg (dict): The training config. Returns: dict[str, Tensor]: a dictionary of loss components """ seg_logits = self.forward(inputs) losses = self.losses(seg_logits, gt_semantic_seg) return losses def forward_test(self, inputs, img_metas, test_cfg): """Forward function for testing. Args: inputs (list[Tensor]): List of multi-level img features. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. test_cfg (dict): The testing config. Returns: Tensor: Output segmentation map. 
""" return self.forward(inputs) def cls_seg(self, feat): """Classify each pixel.""" if self.dropout is not None: feat = self.dropout(feat) output = self.conv_seg(feat) return output ================================================ FILE: models/transformer_models/decode_heads/head.py ================================================ # --------------------------------------------------------------- # Copyright (c) 2021, NVIDIA Corporation. All rights reserved. # # This work is licensed under the NVIDIA Source Code License # --------------------------------------------------------------- import numpy as np import torch.nn as nn import torch from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule from collections import OrderedDict from mmseg.ops import resize, Upsample from .decode_head import BaseDecodeHead from mmseg.models.utils import * import attr from IPython import embed class MLP(nn.Module): """ Linear Embedding """ def __init__(self, input_dim=2048, embed_dim=768): super().__init__() self.proj = nn.Linear(input_dim, embed_dim) def forward(self, x): x = x.flatten(2).transpose(1, 2) x = self.proj(x) return x class Head(BaseDecodeHead): def __init__(self, feature_strides=[4, 8, 16, 32], **kwargs): super(Head, self).__init__(input_transform='multiple_select', **kwargs) assert len(feature_strides) == len(self.in_channels) assert min(feature_strides) == feature_strides[0] self.feature_strides = feature_strides c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels decoder_params = dict(embed_dim=256) embedding_dim = decoder_params['embed_dim'] self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim) self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim) self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim) self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim) self.linear_fuse = ConvModule( in_channels=embedding_dim*4, out_channels=embedding_dim, kernel_size=1, norm_cfg=dict(type='BN', requires_grad=True) ) self.linear_pred = nn.Conv2d(embedding_dim, self.num_classes, kernel_size=1) self.Upsample = Upsample(scale_factor=4) def forward(self, inputs): x = self._transform_inputs(inputs) c1, c2, c3, c4 = x n, _, h, w = c4.shape _c4 = self.linear_c4(c4).permute(0,2,1).reshape(n, -1, c4.shape[2], c4.shape[3]) _c4 = resize(_c4, size=c1.size()[2:],mode='bilinear',align_corners=False) _c3 = self.linear_c3(c3).permute(0,2,1).reshape(n, -1, c3.shape[2], c3.shape[3]) _c3 = resize(_c3, size=c1.size()[2:],mode='bilinear',align_corners=False) _c2 = self.linear_c2(c2).permute(0,2,1).reshape(n, -1, c2.shape[2], c2.shape[3]) _c2 = resize(_c2, size=c1.size()[2:],mode='bilinear',align_corners=False) _c1 = self.linear_c1(c1).permute(0,2,1).reshape(n, -1, c1.shape[2], c1.shape[3]) _c1 = resize(_c1, size=c1.size()[2:],mode='bilinear',align_corners=False) x = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1)) x = self.linear_pred(x) return x ================================================ FILE: models/transformer_models/utils/__init__.py ================================================ from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible from .res_layer import ResLayer from .self_attention_block import SelfAttentionBlock from .up_conv_block import UpConvBlock __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 'UpConvBlock', 'InvertedResidualV3' ] ================================================ FILE: 
models/transformer_models/utils/drop.py ================================================ """ DropBlock, DropPath PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. Papers: DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) Code: DropBlock impl inspired by two Tensorflow impl that I liked: - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py Hacked together by / Copyright 2020 Ross Wightman """ import torch import torch.nn as nn import torch.nn.functional as F def drop_block_2d( x, drop_prob: float = 0.1, block_size: int = 7, gamma_scale: float = 1.0, with_noise: bool = False, inplace: bool = False, batchwise: bool = False): """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf DropBlock with an experimental gaussian noise option. This layer has been tested on a few training runs with success, but needs further validation and possibly optimization for lower runtime impact. """ B, C, H, W = x.shape total_size = W * H clipped_block_size = min(block_size, min(W, H)) # seed_drop_rate, the gamma parameter gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( (W - block_size + 1) * (H - block_size + 1)) # Forces the block to be inside the feature map. w_i, h_i = torch.meshgrid(torch.arange(W).to(x.device), torch.arange(H).to(x.device)) valid_block = ((w_i >= clipped_block_size // 2) & (w_i < W - (clipped_block_size - 1) // 2)) & \ ((h_i >= clipped_block_size // 2) & (h_i < H - (clipped_block_size - 1) // 2)) valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype) if batchwise: # one mask for whole batch, quite a bit faster uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) else: uniform_noise = torch.rand_like(x) block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype) block_mask = -F.max_pool2d( -block_mask, kernel_size=clipped_block_size, # block_size, stride=1, padding=clipped_block_size // 2) if with_noise: normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) if inplace: x.mul_(block_mask).add_(normal_noise * (1 - block_mask)) else: x = x * block_mask + normal_noise * (1 - block_mask) else: normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype) if inplace: x.mul_(block_mask * normalize_scale) else: x = x * block_mask * normalize_scale return x def drop_block_fast_2d( x: torch.Tensor, drop_prob: float = 0.1, block_size: int = 7, gamma_scale: float = 1.0, with_noise: bool = False, inplace: bool = False, batchwise: bool = False): """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf DropBlock with an experimental gaussian noise option. Simplied from above without concern for valid block mask at edges. 
""" B, C, H, W = x.shape total_size = W * H clipped_block_size = min(block_size, min(W, H)) gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( (W - block_size + 1) * (H - block_size + 1)) if batchwise: # one mask for whole batch, quite a bit faster block_mask = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) < gamma else: # mask per batch element block_mask = torch.rand_like(x) < gamma block_mask = F.max_pool2d( block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2) if with_noise: normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) if inplace: x.mul_(1. - block_mask).add_(normal_noise * block_mask) else: x = x * (1. - block_mask) + normal_noise * block_mask else: block_mask = 1 - block_mask normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(dtype=x.dtype) if inplace: x.mul_(block_mask * normalize_scale) else: x = x * block_mask * normalize_scale return x class DropBlock2d(nn.Module): """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf """ def __init__(self, drop_prob=0.1, block_size=7, gamma_scale=1.0, with_noise=False, inplace=False, batchwise=False, fast=True): super(DropBlock2d, self).__init__() self.drop_prob = drop_prob self.gamma_scale = gamma_scale self.block_size = block_size self.with_noise = with_noise self.inplace = inplace self.batchwise = batchwise self.fast = fast # FIXME finish comparisons of fast vs not def forward(self, x): if not self.training or not self.drop_prob: return x if self.fast: return drop_block_fast_2d( x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) else: return drop_block_2d( x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) def drop_path(x, drop_prob: float = 0., training: bool = False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. """ if drop_prob == 0. or not training: return x keep_prob = 1 - drop_prob shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) random_tensor.floor_() # binarize output = x.div(keep_prob) * random_tensor return output class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) ================================================ FILE: models/transformer_models/utils/inverted_residual.py ================================================ from mmcv.cnn import ConvModule from torch import nn as nn from torch.utils import checkpoint as cp from .se_layer import SELayer class InvertedResidual(nn.Module): """InvertedResidual block for MobileNetV2. Args: in_channels (int): The input channels of the InvertedResidual block. 
out_channels (int): The output channels of the InvertedResidual block. stride (int): Stride of the middle (first) 3x3 convolution. expand_ratio (int): Adjusts number of channels of the hidden layer in InvertedResidual by this amount. dilation (int): Dilation rate of depthwise conv. Default: 1 conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU6'). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. Returns: Tensor: The output tensor. """ def __init__(self, in_channels, out_channels, stride, expand_ratio, dilation=1, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU6'), with_cp=False): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2], f'stride must in [1, 2]. ' \ f'But received {stride}.' self.with_cp = with_cp self.use_res_connect = self.stride == 1 and in_channels == out_channels hidden_dim = int(round(in_channels * expand_ratio)) layers = [] if expand_ratio != 1: layers.append( ConvModule( in_channels=in_channels, out_channels=hidden_dim, kernel_size=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) layers.extend([ ConvModule( in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, groups=hidden_dim, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg), ConvModule( in_channels=hidden_dim, out_channels=out_channels, kernel_size=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None) ]) self.conv = nn.Sequential(*layers) def forward(self, x): def _inner_forward(x): if self.use_res_connect: return x + self.conv(x) else: return self.conv(x) if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out class InvertedResidualV3(nn.Module): """Inverted Residual Block for MobileNetV3. Args: in_channels (int): The input channels of this Module. out_channels (int): The output channels of this Module. mid_channels (int): The input channels of the depthwise convolution. kernel_size (int): The kernal size of the depthwise convolution. Default: 3. stride (int): The stride of the depthwise convolution. Default: 1. se_cfg (dict): Config dict for se layer. Defaul: None, which means no se layer. with_expand_conv (bool): Use expand conv or not. If set False, mid_channels must be the same with in_channels. Default: True. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU'). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. Returns: Tensor: The output tensor. 
""" def __init__(self, in_channels, out_channels, mid_channels, kernel_size=3, stride=1, se_cfg=None, with_expand_conv=True, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), with_cp=False): super(InvertedResidualV3, self).__init__() self.with_res_shortcut = (stride == 1 and in_channels == out_channels) assert stride in [1, 2] self.with_cp = with_cp self.with_se = se_cfg is not None self.with_expand_conv = with_expand_conv if self.with_se: assert isinstance(se_cfg, dict) if not self.with_expand_conv: assert mid_channels == in_channels if self.with_expand_conv: self.expand_conv = ConvModule( in_channels=in_channels, out_channels=mid_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.depthwise_conv = ConvModule( in_channels=mid_channels, out_channels=mid_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=mid_channels, conv_cfg=dict( type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) if self.with_se: self.se = SELayer(**se_cfg) self.linear_conv = ConvModule( in_channels=mid_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None) def forward(self, x): def _inner_forward(x): out = x if self.with_expand_conv: out = self.expand_conv(out) out = self.depthwise_conv(out) if self.with_se: out = self.se(out) out = self.linear_conv(out) if self.with_res_shortcut: return x + out else: return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out ================================================ FILE: models/transformer_models/utils/make_divisible.py ================================================ def make_divisible(value, divisor, min_value=None, min_ratio=0.9): """Make divisible function. This function rounds the channel number to the nearest value that can be divisible by the divisor. It is taken from the original tf repo. It ensures that all layers have a channel number that is divisible by divisor. It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa Args: value (int): The original channel number. divisor (int): The divisor to fully divide the channel number. min_value (int): The minimum value of the output channel. Default: None, means that the minimum value equal to the divisor. min_ratio (float): The minimum ratio of the rounded channel number to the original channel number. Default: 0.9. Returns: int: The modified output channel number. """ if min_value is None: min_value = divisor new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than (1-min_ratio). if new_value < min_ratio * value: new_value += divisor return new_value ================================================ FILE: models/transformer_models/utils/norm.py ================================================ import torch import math import warnings def _no_grad_trunc_normal_(tensor, mean, std, a, b): # Cut & paste from PyTorch official master until it's in a few official releases - RW # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf def norm_cdf(x): # Computes standard normal cumulative distribution function return (1. + math.erf(x / math.sqrt(2.))) / 2. if (mean < a - 2 * std) or (mean > b + 2 * std): warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
" "The distribution of values may be incorrect.", stacklevel=2) with torch.no_grad(): # Values are generated by using a truncated uniform distribution and # then using the inverse CDF for the normal distribution. # Get upper and lower cdf values l = norm_cdf((a - mean) / std) u = norm_cdf((b - mean) / std) # Uniformly fill tensor with values from [l, u], then translate to # [2l-1, 2u-1]. tensor.uniform_(2 * l - 1, 2 * u - 1) # Use inverse cdf transform for normal distribution to get truncated # standard normal tensor.erfinv_() # Transform to proper mean, std tensor.mul_(std * math.sqrt(2.)) tensor.add_(mean) # Clamp to ensure it's in the proper range tensor.clamp_(min=a, max=b) return tensor def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): # type: (Tensor, float, float, float, float) -> Tensor r"""Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for generating the random values works best when :math:`a \leq \text{mean} \leq b`. Args: tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution std: the standard deviation of the normal distribution a: the minimum cutoff value b: the maximum cutoff value Examples: >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) """ return _no_grad_trunc_normal_(tensor, mean, std, a, b) ================================================ FILE: models/transformer_models/utils/res_layer.py ================================================ from mmcv.cnn import build_conv_layer, build_norm_layer from torch import nn as nn class ResLayer(nn.Sequential): """ResLayer to build ResNet style backbone. Args: block (nn.Module): block used to build ResLayer. inplanes (int): inplanes of block. planes (int): planes of block. num_blocks (int): number of blocks. stride (int): stride of the first block. Default: 1 avg_down (bool): Use AvgPool instead of stride conv when downsampling in the bottleneck. Default: False conv_cfg (dict): dictionary to construct and config conv layer. Default: None norm_cfg (dict): dictionary to construct and config norm layer. Default: dict(type='BN') multi_grid (int | None): Multi grid dilation rates of last stage. 
Default: None contract_dilation (bool): Whether contract first dilation of each layer Default: False """ def __init__(self, block, inplanes, planes, num_blocks, stride=1, dilation=1, avg_down=False, conv_cfg=None, norm_cfg=dict(type='BN'), multi_grid=None, contract_dilation=False, **kwargs): self.block = block downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = [] conv_stride = stride if avg_down: conv_stride = 1 downsample.append( nn.AvgPool2d( kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False)) downsample.extend([ build_conv_layer( conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=conv_stride, bias=False), build_norm_layer(norm_cfg, planes * block.expansion)[1] ]) downsample = nn.Sequential(*downsample) layers = [] if multi_grid is None: if dilation > 1 and contract_dilation: first_dilation = dilation // 2 else: first_dilation = dilation else: first_dilation = multi_grid[0] layers.append( block( inplanes=inplanes, planes=planes, stride=stride, dilation=first_dilation, downsample=downsample, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)) inplanes = planes * block.expansion for i in range(1, num_blocks): layers.append( block( inplanes=inplanes, planes=planes, stride=1, dilation=dilation if multi_grid is None else multi_grid[i], conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)) super(ResLayer, self).__init__(*layers) ================================================ FILE: models/transformer_models/utils/se_layer.py ================================================ import mmcv import torch.nn as nn from mmcv.cnn import ConvModule from .make_divisible import make_divisible class SELayer(nn.Module): """Squeeze-and-Excitation Module. Args: channels (int): The input (and output) channels of the SE layer. ratio (int): Squeeze ratio in SELayer, the intermediate channel will be ``int(channels/ratio)``. Default: 16. conv_cfg (None or dict): Config dict for convolution layer. Default: None, which means using conv2d. act_cfg (dict or Sequence[dict]): Config dict for activation layer. If act_cfg is a dict, two activation layers will be configurated by this dict. If act_cfg is a sequence of dicts, the first activation layer will be configurated by the first dict and the second activation layer will be configurated by the second dict. Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0)). """ def __init__(self, channels, ratio=16, conv_cfg=None, act_cfg=(dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0))): super(SELayer, self).__init__() if isinstance(act_cfg, dict): act_cfg = (act_cfg, act_cfg) assert len(act_cfg) == 2 assert mmcv.is_tuple_of(act_cfg, dict) self.global_avgpool = nn.AdaptiveAvgPool2d(1) self.conv1 = ConvModule( in_channels=channels, out_channels=make_divisible(channels // ratio, 8), kernel_size=1, stride=1, conv_cfg=conv_cfg, act_cfg=act_cfg[0]) self.conv2 = ConvModule( in_channels=make_divisible(channels // ratio, 8), out_channels=channels, kernel_size=1, stride=1, conv_cfg=conv_cfg, act_cfg=act_cfg[1]) def forward(self, x): out = self.global_avgpool(x) out = self.conv1(out) out = self.conv2(out) return x * out ================================================ FILE: models/transformer_models/utils/self_attention_block.py ================================================ import torch from mmcv.cnn import ConvModule, constant_init from torch import nn as nn from torch.nn import functional as F class SelfAttentionBlock(nn.Module): """General self-attention block/non-local block. 
Please refer to https://arxiv.org/abs/1706.03762 for details about key, query and value. Args: key_in_channels (int): Input channels of key feature. query_in_channels (int): Input channels of query feature. channels (int): Output channels of key/query transform. out_channels (int): Output channels. share_key_query (bool): Whether share projection weight between key and query projection. query_downsample (nn.Module): Query downsample module. key_downsample (nn.Module): Key downsample module. key_query_num_convs (int): Number of convs for key/query projection. value_num_convs (int): Number of convs for value projection. matmul_norm (bool): Whether normalize attention map with sqrt of channels with_out (bool): Whether use out projection. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict|None): Config of activation layers. """ def __init__(self, key_in_channels, query_in_channels, channels, out_channels, share_key_query, query_downsample, key_downsample, key_query_num_convs, value_out_num_convs, key_query_norm, value_out_norm, matmul_norm, with_out, conv_cfg, norm_cfg, act_cfg): super(SelfAttentionBlock, self).__init__() if share_key_query: assert key_in_channels == query_in_channels self.key_in_channels = key_in_channels self.query_in_channels = query_in_channels self.out_channels = out_channels self.channels = channels self.share_key_query = share_key_query self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.key_project = self.build_project( key_in_channels, channels, num_convs=key_query_num_convs, use_conv_module=key_query_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) if share_key_query: self.query_project = self.key_project else: self.query_project = self.build_project( query_in_channels, channels, num_convs=key_query_num_convs, use_conv_module=key_query_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.value_project = self.build_project( key_in_channels, channels if with_out else out_channels, num_convs=value_out_num_convs, use_conv_module=value_out_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) if with_out: self.out_project = self.build_project( channels, out_channels, num_convs=value_out_num_convs, use_conv_module=value_out_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) else: self.out_project = None self.query_downsample = query_downsample self.key_downsample = key_downsample self.matmul_norm = matmul_norm self.init_weights() def init_weights(self): """Initialize weight of later layer.""" if self.out_project is not None: if not isinstance(self.out_project, ConvModule): constant_init(self.out_project, 0) def build_project(self, in_channels, channels, num_convs, use_conv_module, conv_cfg, norm_cfg, act_cfg): """Build projection layer for key/query/value/out.""" if use_conv_module: convs = [ ConvModule( in_channels, channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) ] for _ in range(num_convs - 1): convs.append( ConvModule( channels, channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) else: convs = [nn.Conv2d(in_channels, channels, 1)] for _ in range(num_convs - 1): convs.append(nn.Conv2d(channels, channels, 1)) if len(convs) > 1: convs = nn.Sequential(*convs) else: convs = convs[0] return convs def forward(self, query_feats, key_feats): """Forward function.""" batch_size = query_feats.size(0) query = self.query_project(query_feats) if self.query_downsample is not None: query = self.query_downsample(query) query = 
query.reshape(*query.shape[:2], -1) query = query.permute(0, 2, 1).contiguous() key = self.key_project(key_feats) value = self.value_project(key_feats) if self.key_downsample is not None: key = self.key_downsample(key) value = self.key_downsample(value) key = key.reshape(*key.shape[:2], -1) value = value.reshape(*value.shape[:2], -1) value = value.permute(0, 2, 1).contiguous() sim_map = torch.matmul(query, key) if self.matmul_norm: sim_map = (self.channels**-.5) * sim_map sim_map = F.softmax(sim_map, dim=-1) context = torch.matmul(sim_map, value) context = context.permute(0, 2, 1).contiguous() context = context.reshape(batch_size, -1, *query_feats.shape[2:]) if self.out_project is not None: context = self.out_project(context) return context ================================================ FILE: models/transformer_models/utils/up_conv_block.py ================================================ import torch import torch.nn as nn from mmcv.cnn import ConvModule, build_upsample_layer class UpConvBlock(nn.Module): """Upsample convolution block in decoder for UNet. This upsample convolution block consists of one upsample module followed by one convolution block. The upsample module expands the high-level low-resolution feature map and the convolution block fuses the upsampled high-level low-resolution feature map and the low-level high-resolution feature map from encoder. Args: conv_block (nn.Sequential): Sequential of convolutional layers. in_channels (int): Number of input channels of the high-level skip_channels (int): Number of input channels of the low-level high-resolution feature map from encoder. out_channels (int): Number of output channels. num_convs (int): Number of convolutional layers in the conv_block. Default: 2. stride (int): Stride of convolutional layer in conv_block. Default: 1. dilation (int): Dilation rate of convolutional layer in conv_block. Default: 1. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. conv_cfg (dict | None): Config dict for convolution layer. Default: None. norm_cfg (dict | None): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict | None): Config dict for activation layer in ConvModule. Default: dict(type='ReLU'). upsample_cfg (dict): The upsample config of the upsample module in decoder. Default: dict(type='InterpConv'). If the size of high-level feature map is the same as that of skip feature map (low-level feature map from encoder), it does not need upsample the high-level feature map and the upsample_cfg is None. dcn (bool): Use deformable convoluton in convolutional layer or not. Default: None. plugins (dict): plugins for convolutional layers. Default: None. """ def __init__(self, conv_block, in_channels, skip_channels, out_channels, num_convs=2, stride=1, dilation=1, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), upsample_cfg=dict(type='InterpConv'), dcn=None, plugins=None): super(UpConvBlock, self).__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' 
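        # Note (added comment): conv_block receives 2 * skip_channels input channels because the
        # upsampled high-level feature (projected down to skip_channels by self.upsample, constructed
        # just below) is concatenated with the encoder skip feature of the same width before fusion.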
self.conv_block = conv_block( in_channels=2 * skip_channels, out_channels=out_channels, num_convs=num_convs, stride=stride, dilation=dilation, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, dcn=None, plugins=None) if upsample_cfg is not None: self.upsample = build_upsample_layer( cfg=upsample_cfg, in_channels=in_channels, out_channels=skip_channels, with_cp=with_cp, norm_cfg=norm_cfg, act_cfg=act_cfg) else: self.upsample = ConvModule( in_channels, skip_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) def forward(self, skip, x): """Forward function.""" x = self.upsample(x) out = torch.cat([skip, x], dim=1) out = self.conv_block(out) return out ================================================ FILE: options/__init__.py ================================================ ================================================ FILE: options/base_options.py ================================================ import argparse import os from util import util import torch import models import data class BaseOptions(): def __init__(self): self.initialized = False def initialize(self, parser): parser.add_argument('--dataroot', required=True, help='path to images, should have training, validation and testing') parser.add_argument('--batch_size', type=int, default=2, help='input batch size') parser.add_argument('--useWidth', type=int, default=1280, help='scale images to this width 640') parser.add_argument('--useHeight', type=int, default=704, help='scale images to this height 352') parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0 0,1,2, 0,2. use -1 for CPU') parser.add_argument('--name', type=str, default='experiment_name', help='name of the experiment. It decides where to store samples and models') parser.add_argument('--use_sne', action='store_true', help='chooses if using sne') parser.add_argument('--dataset', type=str, default='ORFD', help='chooses which dataset to load.') parser.add_argument('--model', type=str, default='roadseg', help='chooses which model to use.') parser.add_argument('--epoch', type=str, default='latest', help='chooses which epoch to load') parser.add_argument('--num_threads', default=0, type=int, help='# default 8, threads for loading data') parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints', help='models are saved here') parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization') parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly') parser.add_argument('--init_type', type=str, default='kaiming', help='network initialization [normal|xavier|kaiming|orthogonal]') parser.add_argument('--init_gain', type=float, default=0.02, help='scaling factor for normal, xavier and orthogonal.') parser.add_argument('--verbose', action='store_true', help='if specified, print more debugging information') parser.add_argument('--seed', type=int, default=0, help='seed for random generators') self.initialized = True return parser def gather_options(self): # initialize parser with basic options if not self.initialized: parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = self.initialize(parser) # get the basic options opt, _ = parser.parse_known_args() # modify model-related parser options model_name = opt.model model_option_setter = models.get_option_setter(model_name) parser = 
model_option_setter(parser, self.isTrain) opt, _ = parser.parse_known_args() # parse again with the new defaults # modify dataset-related parser options dataset_name = opt.dataset dataset_option_setter = data.get_option_setter(dataset_name) parser = dataset_option_setter(parser, self.isTrain) self.parser = parser return parser.parse_args() def print_options(self, opt): message = '' message += '----------------- Options ---------------\n' for k, v in sorted(vars(opt).items()): comment = '' default = self.parser.get_default(k) if v != default: comment = '\t[default: %s]' % str(default) message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment) message += '----------------- End -------------------' print(message) # save to the disk expr_dir = os.path.join(opt.checkpoints_dir, opt.name) util.mkdirs(expr_dir) file_name = os.path.join(expr_dir, 'opt.txt') with open(file_name, 'wt') as opt_file: opt_file.write(message) opt_file.write('\n') def parse(self): opt = self.gather_options() opt.isTrain = self.isTrain # train or test self.print_options(opt) # set gpu ids str_ids = opt.gpu_ids.split(',') opt.gpu_ids = [] for str_id in str_ids: id = int(str_id) if id >= 0: opt.gpu_ids.append(id) if len(opt.gpu_ids) > 0: torch.cuda.set_device(opt.gpu_ids[0]) self.opt = opt return self.opt ================================================ FILE: options/test_options.py ================================================ from .base_options import BaseOptions class TestOptions(BaseOptions): def initialize(self, parser): parser = BaseOptions.initialize(self, parser) parser.add_argument('--results_dir', type=str, default='./testresults/', help='saves results here.') parser.add_argument('--phase', type=str, default='test', help='train, val, test') parser.add_argument('--prob_map', action='store_true', help='chooses outputting prob maps or binary predictions') parser.add_argument('--no_label', action='store_true', help='chooses if we have gt labels in testing phase') self.isTrain = False return parser ================================================ FILE: options/train_options.py ================================================ from .base_options import BaseOptions class TrainOptions(BaseOptions): def initialize(self, parser): parser = BaseOptions.initialize(self, parser) parser.add_argument('--print_freq', type=int, default=10, help='frequency of showing training results on console') parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') parser.add_argument('--epoch_count', type=int, default=1, help='the starting epoch count') parser.add_argument('--phase', type=str, default='train', help='train, val, test') parser.add_argument('--nepoch', type=int, default=30, help='maximum epochs') parser.add_argument('--beta1', type=float, default=0.5, help='momentum term of adam') parser.add_argument('--lr', type=float, default=0.001, help='initial learning rate for optimizer') parser.add_argument('--momentum', type=float, default=0.9, help='momentum factor for SGD') parser.add_argument('--weight_decay', type=float, default=0.0005, help='momentum factor for optimizer') parser.add_argument('--lr_policy', type=str, default='lambda', help='learning rate policy: lambda|step|plateau|cosine') parser.add_argument('--lr_decay_iters', type=int, default=5000000, help='multiply by a gamma every lr_decay_iters iterations') parser.add_argument('--lr_decay_epochs', type=int, default=25, help='multiply by a gamma every lr_decay_epoch epochs') parser.add_argument('--lr_gamma', 
type=float, default=0.9, help='gamma factor for lr_scheduler') self.isTrain = True return parser ================================================ FILE: road_hesai40_process.py ================================================ from __future__ import absolute_import, division, print_function import os import copy import glob import numpy as np from collections import Counter from scipy import interpolate import skimage.transform import cv2 if not ("DISPLAY" in os.environ): import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt from PIL import Image import mayavi.mlab import multiprocessing as mp cmap = plt.cm.jet cmap2 = plt.cm.nipy_spectral def show_velo(pointcloud): x = pointcloud[:, 0] # x position of point y = pointcloud[:, 1] # y position of point z = pointcloud[:, 2] # z position of point d = np.sqrt(x ** 2 + y ** 2) # Map Distance from sensor degr = np.degrees(np.arctan(z / d)) vals = 'height' if vals == "height": col = z else: col = d fig = mayavi.mlab.figure(bgcolor=(0, 0, 0), size=(640, 500)) mayavi.mlab.points3d(x, y, z, col, # Values used for Color mode="point", colormap='spectral', # 'bone', 'copper', 'gnuplot' # color=(0, 1, 0), # Used a fixed (r,g,b) instead figure=fig, ) mayavi.mlab.show() def depth_colorize(depth): depth = (depth - np.min(depth)) / (np.max(depth) - np.min(depth)) depth = 255 * cmap(depth)[:, :, :3] # H, W, C return depth.astype('uint8') def load_velodyne_points(filename): """Load 3D point cloud from KITTI file format (adapted from https://github.com/hunse/kitti) """ #points = np.fromfile(filename, dtype=np.float32).reshape(-1, 4) #kitti points = np.fromfile(filename, dtype=np.float32).reshape(-1, 5)[:,:4] #jk hesai40 data points[:, 3] = 1.0 # homogeneous return points def read_calib_file(path): """Read KITTI calibration file (from https://github.com/hunse/kitti) """ float_chars = set("0123456789.e+- ") data = {} with open(path, 'r') as f: for line in f.readlines(): key, value = line.split(':', 1) value = value.strip() data[key] = value if float_chars.issuperset(value): # try to cast to float array try: data[key] = np.array(list(map(float, value.split(' ')))) except ValueError: # casting error: data[key] already eq. value, so pass pass return data def sub2ind(matrixSize, rowSub, colSub): """Convert row, col matrix subscripts to linear indices """ m, n = matrixSize return rowSub * (n-1) + colSub - 1 def generate_depth_map(velo_filename): """Generate a depth map from jk hesai40 data """ # set image shape im_shape = np.array([720,1280],dtype=np.int32) # load velodyne points velo = load_velodyne_points(velo_filename) #show_velo(velo) # camera parameter K = np.array([ 1472.919866, 0.000000, 614.779599, 0.000000, 1452.953534, 353.800982, 0.000000, 0.000000, 1.000000 ]).reshape(3,3) RT = np.array([ 9.9954895655531806e-01, -2.9636912774399400e-02, 4.8514791948404291e-03, 5.4418414831161499e-02, 2.6634465658627177e-03, -7.3426033150586920e-02, -9.9729710904432078e-01, -1.2367740273475647e-01, 2.9913032303096956e-02, 9.9686020637648665e-01, -7.3313978486113582e-02, -1.0199587792158127e-01, 0., 0., 0., 1. 
]).reshape(4,4) ''' # the default RT of camera is camera to car, so inv is needed print('R before:',RT) RTC = np.linalg.inv(RT) print('R after:',RTC) ''' # velodyne parameter from velodyne to car R_velo = np.array([-0.996842,-0.0793231,0.00385075, 0.0794014,-0.994533,0.067813, -0.00154944,0.0679046,0.997691]).reshape(3,3) T_velo = np.array([0, 0, 0]).reshape(3,1) # project velodyne to car coordinate velo = R_velo @ velo[:,:3].T + T_velo velo = velo.T # remove all behind image plane (approximation) velo = velo[velo[:, 1] >= 0, :] #show_velo(velo) ''' One_array = np.ones(velo.shape[0]).reshape(velo.shape[0],1) velo = np.hstack((velo,One_array)) print('velo.shape{}'.format(velo.shape)) ''' # project the points to the camera # projection from car to image ''' # version 1: P_car2im = K @ RT[:3, :4] velo_pts_im = np.dot(P_car2im, velo.T).T ''' # version 2: R = RT[:3,:3] T = RT[:3,-1].reshape(3,1) velo_pts_cam = (R @ velo.T + T).T #show_velo(velo_pts_cam) velo_pts_im = (K @ velo_pts_cam.T).T #show_velo(velo_pts_im) #velo_pts_im = np.array((velo_pts_im[:,0],velo_pts_im[:,1],velo_pts_im[:,2])).T#*(-1) velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., np.newaxis] # check if in bounds # use minus 1 to get the exact same value as KITTI matlab code velo_pts_im[:, 0] = np.round(velo_pts_im[:, 0]) - 1 velo_pts_im[:, 1] = np.round(velo_pts_im[:, 1]) - 1 #velo_pts_im[:, 0] val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0) val_inds = val_inds & (velo_pts_im[:, 0] < im_shape[1]) & (velo_pts_im[:, 1] < im_shape[0]) velo_pts_im = velo_pts_im[val_inds, :] # project to image depth = np.zeros((im_shape[:2])) depth[velo_pts_im[:, 1].astype(np.int), velo_pts_im[:, 0].astype(np.int)] = velo_pts_im[:, 2]#*(-1) # find the duplicate points and choose the closest depth inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0]) dupe_inds = [item for item, count in Counter(inds).items() if count > 1] for dd in dupe_inds: pts = np.where(inds == dd)[0] x_loc = int(velo_pts_im[pts[0], 0]) y_loc = int(velo_pts_im[pts[0], 1]) depth[y_loc, x_loc] = velo_pts_im[pts, 2].min() depth[depth < 0] = 0 print('depth.max:{}, depth.min:{}:'.format(depth.max(),depth.min())) # interpolate the depth map to fill in holes depth_inter = lin_interp(im_shape, velo_pts_im) #depth_inter = lstsq_interp(im_shape, velo_pts_im,valid=False) ''' # transfor depth to pointclouds depth_nonzero = np.where(depth_inter>0) #depth_nonzero = np.where(depth>0) u = depth_nonzero[1] v = depth_nonzero[0] z = depth_inter[v,u] #print('depth_nonzero:',depth_nonzero) #print('u:',u) #print('v:',v) #print('z:',z) #uvz = np.vstack((u,v,z)) uvz = np.vstack((u,v,z)) velo_c = np.linalg.inv(K) @ uvz print('velo_c:',velo_c,velo_c.shape) velo_w = R.T @ (velo_c - T) print('velo_w:',velo_w,velo_w.shape) show_velo(velo_w.T) ''' return depth,depth_inter def lin_interp(shape, xyd): from scipy.interpolate import LinearNDInterpolator m, n = shape ij, d = xyd[:, 1::-1], xyd[:, 2] f = LinearNDInterpolator(ij, d, fill_value=0) J, I = np.meshgrid(np.arange(n), np.arange(m)) IJ = np.vstack([I.flatten(), J.flatten()]).T disparity = f(IJ).reshape(shape) return disparity def main(velo_filename): """ process hesai40 lidar to sparse depth image and dense image """ path1_list = glob.glob(velo_filename+'/*') for path2 in path1_list: lidar_path = os.path.join(path2,'lidar_data') rgb_path = os.path.join(path2,'image_data') sparse_depth_dir = os.path.join(path2,'sparse_depth') dense_depth_dir = os.path.join(path2,'dense_depth') if not 
os.path.exists(sparse_depth_dir): os.mkdir(sparse_depth_dir) if not os.path.exists(dense_depth_dir): os.mkdir(dense_depth_dir) # depth img name and dirs lidar_data_list = glob.glob(lidar_path+'/*') for velo_filename in lidar_data_list: print('velo_filename:',velo_filename) img_name = velo_filename.split('/')[-1].split('.')[0] sparse_depth_save_dir = os.path.join(sparse_depth_dir,img_name+'.png') dense_depth_save_dir = os.path.join(dense_depth_dir,img_name+'.png') rgb_img_dir = os.path.join(rgb_path,img_name+'.png') # TODO: prepare for multi process # lidar to depth sparse_depth_pred, dense_depth_pred = generate_depth_map(velo_filename) sparse_depth = copy.deepcopy(sparse_depth_pred) dense_depth = copy.deepcopy(dense_depth_pred) # save depth in unit16 format img = (sparse_depth * 256.0).astype('uint16') img_buffer = img.tobytes() imgsave = Image.new("I", img.T.shape) imgsave.frombytes(img_buffer, 'raw', "I;16") imgsave.save(sparse_depth_save_dir) img = (dense_depth * 256.0).astype('uint16') img_buffer = img.tobytes() imgsave = Image.new("I", img.T.shape) imgsave.frombytes(img_buffer, 'raw', "I;16") imgsave.save(dense_depth_save_dir) ''' # show depth sparse_depth = depth_colorize(sparse_depth) sparse_depth = cv2.cvtColor(sparse_depth, cv2.COLOR_RGB2BGR) dense_depth = depth_colorize(dense_depth) dense_depth = cv2.cvtColor(dense_depth, cv2.COLOR_RGB2BGR) rgb = cv2.imread(rgb_img_dir) # the correspoing rgb img depth = np.concatenate((rgb,dense_depth,sparse_depth),axis=1) depth = cv2.resize(depth, (0,0),fx=0.5, fy=0.5) #cv2.imshow('depth_gt',depth_new) cv2.imshow('depth',depth) # show image with sparse to check the project consistency rgb = np.transpose(rgb, (2, 0, 1)) ind = sparse_depth_pred>0 rgb[0][ind] = 0 #sparse_depth[ind] rgb[1][ind] = 0 #sparse_depth[ind] rgb[2][ind] = 0 #sparse_depth[ind] rgb = rgb.transpose(1,2,0) cv2.imshow('rgb+sparse depth',rgb) cv2.waitKey(1) ''' if __name__=='__main__': velo_filename = 'xxx' main(velo_filename) ================================================ FILE: scripts/demo.sh ================================================ python3 demo.py --dataroot examples --name ORFD --use_sne --epoch best ================================================ FILE: scripts/test.sh ================================================ CUDA_VISIBLE_DEVICES=0 python3 test.py --dataroot '/workspace/data/' --dataset ORFD --name ORFD --use_sne --prob_map --epoch best ================================================ FILE: scripts/train.sh ================================================ python3 train.py --dataroot '/workspace/data' --dataset ORFD --name ORFD --use_sne --batch_size 8 --gpu_ids 0,1,2,3 ================================================ FILE: test.py ================================================ import os from options.test_options import TestOptions from data import CreateDataLoader from models import create_model from util.util import confusion_matrix, getScores, save_images import torch import numpy as np import cv2 if __name__ == '__main__': opt = TestOptions().parse() opt.num_threads = 1 opt.batch_size = 1 opt.serial_batches = True # no shuffle opt.isTrain = False save_dir = os.path.join(opt.results_dir, opt.name, opt.phase + '_' + opt.epoch) if not os.path.exists(save_dir): os.makedirs(save_dir) data_loader = CreateDataLoader(opt) dataset = data_loader.load_data() model = create_model(opt, dataset.dataset) model.setup(opt) model.eval() test_loss_iter = [] epoch_iter = 0 conf_mat = np.zeros((dataset.dataset.num_labels, dataset.dataset.num_labels), dtype=np.float) with 
torch.no_grad(): for i, data in enumerate(dataset): model.set_input(data) model.forward() model.get_loss() epoch_iter += opt.batch_size gt = model.label.cpu().int().numpy() _, pred = torch.max(model.output.data.cpu(), 1) pred = pred.float().detach().int().numpy() save_images(save_dir, model.get_current_visuals(), model.get_image_names(), model.get_image_oriSize(), opt.prob_map) # Resize images to the original size for evaluation image_size = model.get_image_oriSize() oriSize = (image_size[0].item(), image_size[1].item()) gt = np.expand_dims(cv2.resize(np.squeeze(gt, axis=0), oriSize, interpolation=cv2.INTER_NEAREST), axis=0) pred = np.expand_dims(cv2.resize(np.squeeze(pred, axis=0), oriSize, interpolation=cv2.INTER_NEAREST), axis=0) conf_mat += confusion_matrix(gt, pred, dataset.dataset.num_labels) test_loss_iter.append(model.loss_segmentation) print('Epoch {0:}, iters: {1:}/{2:}, loss: {3:.3f} '.format(opt.epoch, epoch_iter, len(dataset) * opt.batch_size, test_loss_iter[-1]), end='\r') avg_test_loss = torch.mean(torch.stack(test_loss_iter)) print ('Epoch {0:} test loss: {1:.3f} '.format(opt.epoch, avg_test_loss)) globalacc, pre, recall, F_score, iou = getScores(conf_mat) print ('Epoch {0:} glob acc : {1:.3f}, pre : {2:.3f}, recall : {3:.3f}, F_score : {4:.3f}, IoU : {5:.3f}'.format(opt.epoch, globalacc, pre, recall, F_score, iou)) ================================================ FILE: train.py ================================================ import time from options.train_options import TrainOptions from data import CreateDataLoader from models import create_model from util.util import confusion_matrix, getScores, tensor2labelim, tensor2im, print_current_losses import numpy as np import random import torch import cv2 from tensorboardX import SummaryWriter if __name__ == '__main__': train_opt = TrainOptions().parse() np.random.seed(train_opt.seed) random.seed(train_opt.seed) torch.manual_seed(train_opt.seed) torch.cuda.manual_seed(train_opt.seed) train_data_loader = CreateDataLoader(train_opt) train_dataset = train_data_loader.load_data() train_dataset_size = len(train_data_loader) print('#training images = %d' % train_dataset_size) valid_opt = TrainOptions().parse() valid_opt.phase = 'val' valid_opt.batch_size = 1 valid_opt.num_threads = 1 valid_opt.serial_batches = True valid_opt.isTrain = False valid_data_loader = CreateDataLoader(valid_opt) valid_dataset = valid_data_loader.load_data() valid_dataset_size = len(valid_data_loader) print('#validation images = %d' % valid_dataset_size) writer = SummaryWriter() model = create_model(train_opt, train_dataset.dataset) model.setup(train_opt) total_steps = 0 tfcount = 0 F_score_max = 0 for epoch in range(train_opt.epoch_count, train_opt.nepoch + 1): ### Training on the training set ### model.train() epoch_start_time = time.time() iter_data_time = time.time() epoch_iter = 0 train_loss_iter = [] for i, data in enumerate(train_dataset): iter_start_time = time.time() if total_steps % train_opt.print_freq == 0: t_data = iter_start_time - iter_data_time total_steps += train_opt.batch_size epoch_iter += train_opt.batch_size model.set_input(data) model.optimize_parameters() if total_steps % train_opt.print_freq == 0: tfcount = tfcount + 1 losses = model.get_current_losses() train_loss_iter.append(losses["segmentation"]) t = (time.time() - iter_start_time) / train_opt.batch_size print_current_losses(epoch, epoch_iter, losses, t, t_data) # There are several whole_loss values shown in tensorboard in one epoch, # to help better see the optimization phase 
writer.add_scalar('train/whole_loss', losses["segmentation"], tfcount) iter_data_time = time.time() mean_loss = np.mean(train_loss_iter) # One average training loss value in tensorboard in one epoch writer.add_scalar('train/mean_loss', mean_loss, epoch) palet_file = 'datasets/palette.txt' impalette = list(np.genfromtxt(palet_file,dtype=np.uint8).reshape(3*256)) tempDict = model.get_current_visuals() rgb = tensor2im(tempDict['rgb_image']) if train_opt.use_sne: another = tensor2im((tempDict['another_image']+1)/2) # color normal images else: another = tensor2im(tempDict['another_image']) label = tensor2labelim(tempDict['label'], impalette) output = tensor2labelim(tempDict['output'], impalette) #image_numpy = np.concatenate((rgb, another, label, output), axis=1) image_numpy = np.concatenate((rgb, another), axis=1) image_numpy = image_numpy.astype(np.float64) / 255 writer.add_image('Epoch' + str(epoch), image_numpy, dataformats='HWC') # show training images in tensorboard print('End of epoch %d / %d \t Time Taken: %d sec' % (epoch, train_opt.nepoch, time.time() - epoch_start_time)) model.update_learning_rate() ### Evaluation on the validation set ### model.eval() valid_loss_iter = [] epoch_iter = 0 conf_mat = np.zeros((valid_dataset.dataset.num_labels, valid_dataset.dataset.num_labels), dtype=np.float) with torch.no_grad(): for i, data in enumerate(valid_dataset): model.set_input(data) model.forward() model.get_loss() epoch_iter += valid_opt.batch_size gt = model.label.cpu().int().numpy() _, pred = torch.max(model.output.data.cpu(), 1) pred = pred.float().detach().int().numpy() # Resize images to the original size for evaluation image_size = model.get_image_oriSize() oriSize = (image_size[0].item(), image_size[1].item()) gt = np.expand_dims(cv2.resize(np.squeeze(gt, axis=0), oriSize, interpolation=cv2.INTER_NEAREST), axis=0) pred = np.expand_dims(cv2.resize(np.squeeze(pred, axis=0), oriSize, interpolation=cv2.INTER_NEAREST), axis=0) conf_mat += confusion_matrix(gt, pred, valid_dataset.dataset.num_labels) losses = model.get_current_losses() valid_loss_iter.append(model.loss_segmentation) print('valid epoch {0:}, iters: {1:}/{2:} '.format(epoch, epoch_iter, len(valid_dataset) * valid_opt.batch_size), end='\r') avg_valid_loss = torch.mean(torch.stack(valid_loss_iter)) globalacc, pre, recall, F_score, iou = getScores(conf_mat) # Record performance on the validation set writer.add_scalar('valid/loss', avg_valid_loss, epoch) writer.add_scalar('valid/global_acc', globalacc, epoch) writer.add_scalar('valid/pre', pre, epoch) writer.add_scalar('valid/recall', recall, epoch) writer.add_scalar('valid/F_score', F_score, epoch) writer.add_scalar('valid/iou', iou, epoch) # Save the best model according to the F-score, and record corresponding epoch number in tensorboard if F_score > F_score_max: print('saving the model at the end of epoch %d, iters %d' % (epoch, total_steps)) model.save_networks('best') F_score_max = F_score writer.add_text('best model', str(epoch)) ================================================ FILE: util/__init__.py ================================================ ================================================ FILE: util/util.py ================================================ from __future__ import print_function import torch import numpy as np from PIL import Image import os import cv2 def save_images(save_dir, visuals, image_name, image_size, prob_map): """save images to disk""" image_name = image_name[0] oriSize = (image_size[0].item(), image_size[1].item()) palet_file = 
'datasets/palette.txt' impalette = list(np.genfromtxt(palet_file, dtype=np.uint8).reshape(3*256)) for label, im_data in visuals.items(): if label == 'output': if prob_map: im = tensor2confidencemap(im_data) im = cv2.resize(im, oriSize) cv2.imwrite(os.path.join(save_dir, image_name), im) else: im = tensor2labelim(im_data, impalette) im = cv2.resize(im, oriSize) cv2.imwrite(os.path.join(save_dir, image_name), cv2.cvtColor(im, cv2.COLOR_RGB2BGR)) def tensor2im(input_image, imtype=np.uint8): """Converts a image Tensor into an image array (numpy)""" if isinstance(input_image, torch.Tensor): image_tensor = input_image.data else: return input_image image_numpy = image_tensor[0].cpu().float().numpy() if image_numpy.shape[0] == 1: image_numpy = np.tile(image_numpy, (3, 1, 1)) image_numpy = (np.transpose(image_numpy, (1, 2, 0)))* 255.0 return image_numpy.astype(imtype) def tensor2labelim(label_tensor, impalette, imtype=np.uint8): """Converts a label Tensor into an image array (numpy), we use a palette to color the label images""" if len(label_tensor.shape) == 4: _, label_tensor = torch.max(label_tensor.data.cpu(), 1) label_numpy = label_tensor[0].cpu().float().detach().numpy() label_image = Image.fromarray(label_numpy.astype(np.uint8)) label_image = label_image.convert("P") label_image.putpalette(impalette) label_image = label_image.convert("RGB") return np.array(label_image).astype(imtype) def tensor2confidencemap(label_tensor, imtype=np.uint8): """Converts a prediction Tensor into an image array (numpy)""" softmax_numpy = label_tensor[0].cpu().float().detach().numpy() softmax_numpy = np.exp(softmax_numpy) label_image = np.true_divide(softmax_numpy[1], softmax_numpy[0] + softmax_numpy[1]) label_image = np.floor(255 * (label_image - label_image.min()) / (label_image.max() - label_image.min())) return np.array(label_image).astype(imtype) def print_current_losses(epoch, i, losses, t, t_data): message = '(epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % (epoch, i, t, t_data) for k, v in losses.items(): message += '%s: %.3f ' % (k, v) print(message) def mkdirs(paths): if isinstance(paths, list) and not isinstance(paths, str): for path in paths: mkdir(path) else: mkdir(paths) def mkdir(path): if not os.path.exists(path): os.makedirs(path) def confusion_matrix(x, y, n, ignore_label=None, mask=None): if mask is None: mask = np.ones_like(x) == 1 k = (x >= 0) & (y < n) & (x != ignore_label) & (mask.astype(np.bool)) return np.bincount(n * x[k].astype(int) + y[k], minlength=n**2).reshape(n, n) def getScores(conf_matrix): if conf_matrix.sum() == 0: return 0, 0, 0, 0, 0 with np.errstate(divide='ignore',invalid='ignore'): globalacc = np.diag(conf_matrix).sum() / np.float(conf_matrix.sum()) classpre = np.diag(conf_matrix) / conf_matrix.sum(0).astype(np.float) classrecall = np.diag(conf_matrix) / conf_matrix.sum(1).astype(np.float) IU = np.diag(conf_matrix) / (conf_matrix.sum(1) + conf_matrix.sum(0) - np.diag(conf_matrix)).astype(np.float) pre = classpre[1] recall = classrecall[1] iou = IU[1] F_score = 2*(recall*pre)/(recall+pre) return globalacc, pre, recall, F_score, iou
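
A minimal usage sketch for the evaluation helpers defined above in util/util.py (confusion_matrix and getScores). It assumes the repository root is on PYTHONPATH and a NumPy version that still ships the legacy np.float / np.bool aliases used by util/util.py; the gt and pred arrays below are hypothetical stand-ins for a ground-truth label map and a predicted label map with values in {0, 1}.

import numpy as np
from util.util import confusion_matrix, getScores

num_labels = 2  # freespace vs. non-freespace
gt = np.random.randint(0, num_labels, size=(720, 1280))    # hypothetical ground-truth label map
pred = np.random.randint(0, num_labels, size=(720, 1280))  # hypothetical predicted label map

conf_mat = np.zeros((num_labels, num_labels), dtype=np.float64)
conf_mat += confusion_matrix(gt, pred, num_labels)          # accumulate per-image confusion matrices
globalacc, pre, recall, F_score, iou = getScores(conf_mat)  # precision/recall/F/IoU are for class index 1 (freespace)
print('acc: %.3f, pre: %.3f, recall: %.3f, F-score: %.3f, IoU: %.3f'
      % (globalacc, pre, recall, F_score, iou))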