Repository: meteorshowers/StereoNet Branch: master Commit: a4010bd63fe8 Files: 86 Total size: 325.0 KB Directory structure: gitextract_bxzusud1/ ├── LICENSE ├── README.md ├── configs/ │ └── config_disp.py ├── data ├── disparity/ │ ├── __init__.py │ ├── csrc/ │ │ ├── BuildCostVolume.h │ │ ├── ROIAlign.h │ │ ├── ROIPool.h │ │ ├── SigmoidFocalLoss.h │ │ ├── cpu/ │ │ │ ├── ROIAlign_cpu.cpp │ │ │ ├── nms_cpu.cpp │ │ │ └── vision.h │ │ ├── cuda/ │ │ │ ├── BuildCostVolume_cuda.cu │ │ │ ├── ROIAlign_cuda.cu │ │ │ ├── ROIPool_cuda.cu │ │ │ ├── SigmoidFocalLoss_cuda.cu │ │ │ ├── nms.cu │ │ │ └── vision.h │ │ ├── nms.h │ │ └── vision.cpp │ ├── dataloader/ │ │ ├── DataStatistics.py │ │ ├── KITTILoader.py │ │ ├── KITTI_submission_loader.py │ │ ├── KITTI_submission_loader2012.py │ │ ├── KITTIloader2012.py │ │ ├── KITTIloader2015.py │ │ ├── SceneFlowLoader_demo.py │ │ ├── SecenFlowLoader.py │ │ ├── SecenFlowLoader1.py │ │ ├── SecenFlowLoaderfix.py │ │ ├── Testloader.py │ │ ├── __init__.py │ │ ├── listflowfile.py │ │ ├── listflowfilefix.py │ │ ├── preprocess.py │ │ └── readpfm.py │ ├── eval/ │ │ ├── __init__.py │ │ ├── kitti/ │ │ │ ├── README.md │ │ │ ├── compile.sh │ │ │ ├── eval.sh │ │ │ ├── eval_05.sh │ │ │ ├── evaluate_object_3d_offline │ │ │ ├── evaluate_object_3d_offline.cpp │ │ │ └── mail.h │ │ └── kitti-object-eval-python/ │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── eval.py │ │ ├── eval.sh │ │ ├── eval_dist.sh │ │ ├── evaluate.py │ │ ├── kitti_common.py │ │ └── rotate_iou.py │ ├── layers/ │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── batch_norm.py │ │ ├── build_cost_volume.py │ │ ├── iou_loss.py │ │ ├── misc.py │ │ ├── nms.py │ │ ├── roi_align.py │ │ ├── roi_pool.py │ │ ├── scale.py │ │ ├── sigmoid_focal_loss.py │ │ └── smooth_l1_loss.py │ ├── models/ │ │ ├── ActiveStereoNet.py │ │ ├── __init__.py │ │ ├── stereonet.py │ │ ├── stereonet_disp.py │ │ └── submodule.py │ └── utils/ │ ├── __init__.py │ ├── logger.py │ ├── preprocess.py │ ├── readpfm.py │ ├── 
tensorboardx.py │ └── utils.py ├── preprocessing/ │ ├── generate_disp.py │ ├── generate_lidar.py │ └── kitti_util.py ├── requirement.txt ├── setup.py └── tools/ ├── env_utils/ │ ├── __init__.py │ ├── exp.py │ ├── logger.py │ └── utils.py └── train_net_disp.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2020 Yilun Chen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================
X-StereoLab is an open-source stereo matching and stereo 3D object detection toolbox based on PyTorch.

## News: We released the codebase v0.0.0.

* Stereo matching and detection model results.
* A PyTorch implementation of the Google HITNet model will be released.
* Google HITNet PyTorch model, KITTI 2015 submission: http://www.cvlibs.net/datasets/kitti/eval_scene_flow_detail.php?benchmark=stereo&result=226494ba5559e9f5f46bdbd681d1564fee78409e (ranked 145th, with 80 GMACs)

### Requirements

All code is tested in the following environment:

* Ubuntu 16.04
* Python 3.7
* PyTorch 1.1.0 or 1.2.0 or 1.3.0
* Torchvision 0.2.2 or 0.4.1

### Installation

(1) Clone this repository.

```
git clone git@github.com:meteorshowers/X-StereoLab.git && cd X-StereoLab
```

(2) Set up the Python environment.

```
conda create -n xstereolab python=3.7
conda activate xstereolab
pip install -r requirement.txt --user
## conda deactivate
```

### Data Preparation

(1) Please download the KITTI dataset.

```
ln -s /path/to/KITTI_DATA_PATH ./data/kitti/
ln -s /path/to/OUTPUT_PATH ./outputs/
```

### Multi-GPU Training

The training scripts support [multi-processing distributed training](https://github.com/pytorch/examples/tree/master/imagenet), which is much faster than the typical PyTorch DataParallel interface.

```
python3 tools/train_net_disp.py --cfg ./configs/config_xxx.py --savemodel ./outputs/MODEL_NAME -btrain 4 -d 0-3 --multiprocessing-distributed
```

The trained models, configuration, and logs will be saved in the model folder.

To load a pretrained model, run

```
python3 tools/train_net_disp.py --cfg xxx/config.py --loadmodel ./outputs/MODEL_NAMEx --start_epoch xxx --savemodel ./outputs/MODEL_NAME -btrain 4 -d 0-3 --multiprocessing-distributed
```

If you want to resume training from a given epoch, set `--cfg` and `--loadmodel` to the respective config and model paths, and `--start_epoch` to the epoch to resume from.

You can also start a TensorBoard session with

```
tensorboard --logdir=./outputs/MODEL_NAME/tensorboard --port=6666
```

and visualize your training process by opening http://localhost:6666 in your browser.

### Inference and Evaluation

Work in progress ...

### Stereo Matching Performance and Model Zoo
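| Methods | Epochs | Train Mem (GB/Img) | Test Mem (GB/Img) | EPE | D1-all | Models |
| --- | --- | --- | --- | --- | --- | --- |
| HITNET (kitti) | 4200 | | | | 2.43% | GoogleDrive |
| HITNET (sceneflow) | 200 | | | 0.65 | | GoogleDrive |
| stereonet (sceneflow) | 20 | | | 1.10 | | GoogleDrive |
| ActiveStereoNet | 10 | | | | | GoogleDrive |
| SOS | | | | | | |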
### Stereo 3D Detection Performance and Model Zoo

#### PLUME: Efficient 3D Object Detection from Stereo Images
| Methods | Epochs | Train Mem (GB/Img) | Test Mem (GB/Img) | 3D BEV AP (Ours small plume) | 3D BEV AP (Paper small plume) |
| --- | --- | --- | --- | --- | --- |
| PLUME | | | | 72.9 / 62.5 / 56.9 | 74.4 / 61.7 / 55.8 |
### Video Demo

We provide a video demo showing the results of X-StereoLab. Here we show the disparity map predicted by ActiveStereoNet.

### TODO List

- [x] Multiprocessing GPU training
- [x] TensorboardX
- [x] Reduce training GPU memory usage
- [x] Evaluation and test code
- [ ] Result visualization
- [ ] Still in progress

### Citations

If you find our work useful in your research, please consider citing:

```
@misc{XStereoLab2021,
    title={{X-StereoLab} stereo matching and stereo 3D object detection toolbox},
    author={X-StereoLab Contributors},
    howpublished = {\url{https://github.com/meteorshowers/X-StereoLab}},
    year={2021}
}

* reference[2]
@article{tankovich2020hitnet,
    title={HITNet: Hierarchical Iterative Tile Refinement Network for Real-time Stereo Matching},
    author={Tankovich, Vladimir and H{\"a}ne, Christian and Fanello, Sean and Zhang, Yinda and Izadi, Shahram and Bouaziz, Sofien},
    journal={arXiv preprint arXiv:2007.12140},
    year={2020}
}

* reference[3]
@inproceedings{tankovich2018sos,
    title={Sos: Stereo matching in o (1) with slanted support windows},
    author={Tankovich, Vladimir and Schoenberg, Michael and Fanello, Sean Ryan and Kowdle, Adarsh and Rhemann, Christoph and Dzitsiuk, Maksym and Schmidt, Mirko and Valentin, Julien and Izadi, Shahram},
    booktitle={2018 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
    pages={6782--6789},
    year={2018},
    organization={IEEE}
}
```

## Other contributors
pic
vtankovich

GOOGLE

pic
Yan Wang

Waymo

### Acknowledgment

* Thanks to SamehKhamis (NVIDIA)

### License

The code is released under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License for NonCommercial use only. Any commercial use should get formal permission first.

### Contact

If you have any questions or suggestions about this repo, please feel free to contact me (xuanyili.edu@gmail.com).

Wechat:
pic
XUANYILI

================================================
FILE: configs/config_disp.py
================================================
# Default experiment configuration for the disparity models, expressed as a
# yacs CfgNode. Every option is a plain attribute on the module-level `cfg`.
import os
import numpy as np
from yacs.config import CfgNode as CN

# Root configuration node; all options below are attached to it.
cfg = CN()

cfg.cnt = 0
# NOTE(review): the README passes `-btrain 4` to the training script;
# presumably this is the matching default training batch size — confirm.
cfg.btrain = 4

#------------- disparity ---------------#
cfg.model = 'stereonet' # ['stereonet', 'activestereonet', 'hitnet', 'sos']
# NOTE(review): presumably the upper/lower bounds of the disparity search
# range — confirm against the model code.
cfg.maxdisp = 192
cfg.mindisp = 0
cfg.loss_disp = True
#--------------volume--------------------------#
cfg.PlaneSweepVolume = False
cfg.DispVolume = True
#------------- depth ---------------#
#------------- detection ---------------#
#-------------- debug ----------------#
cfg.debug = False
#-------------- Parameters -----------#
#----------------- centerness --------------#
#----------------------------------------------------#
================================================
FILE: data
================================================
/media/elonli/049150C23EB4F058/DSGN/data
================================================
FILE: disparity/__init__.py
================================================
================================================
FILE: disparity/csrc/BuildCostVolume.h
================================================
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif
// Interface for Python
//
// Forward entry point for the cost-volume op.
// When `left` is a CUDA tensor and the extension was built with WITH_CUDA,
// the call is forwarded unchanged to BuildCostVolume_forward_cuda.
// In every other case an error is raised through AT_ERROR: there is no CPU
// implementation, and a CUDA tensor cannot be handled by a build without
// WITH_CUDA.
at::Tensor BuildCostVolume_forward(const at::Tensor& left, const at::Tensor& right, const at::Tensor& shift) {
  if (left.type().is_cuda()) {
#ifdef WITH_CUDA
    return BuildCostVolume_forward_cuda(left, right, shift);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
// Backward entry point for the cost-volume op.
// Forwards to BuildCostVolume_backward_cuda when `grad` is a CUDA tensor and
// WITH_CUDA is defined; otherwise raises through AT_ERROR (no CPU path).
// NOTE(review): the return type reads as a bare `std::tuple` in this copy;
// the template argument list appears to have been lost. The CUDA
// implementation returns std::make_tuple(grad_left, grad_right) — confirm
// the intended type against the original header.
std::tuple BuildCostVolume_backward(const at::Tensor& grad, const at::Tensor& shift) {
  if (grad.type().is_cuda()) {
#ifdef WITH_CUDA
    return BuildCostVolume_backward_cuda(grad, shift);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: disparity/csrc/ROIAlign.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif
// Interface for Python
//
// Forward entry point for ROIAlign.
// Dispatches on the device of `input`:
//   * CUDA tensor, built with WITH_CUDA    -> ROIAlign_forward_cuda
//   * CUDA tensor, built without WITH_CUDA -> error raised via AT_ERROR
//   * otherwise                            -> ROIAlign_forward_cpu
// All arguments are passed through to the selected implementation unchanged.
at::Tensor ROIAlign_forward(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) {
  if (input.type().is_cuda()) {
#ifdef WITH_CUDA
    return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}
// Backward entry point for ROIAlign.
// Only a CUDA path exists: when `grad` is a CUDA tensor and WITH_CUDA is
// defined the call is forwarded to ROIAlign_backward_cuda; a CUDA tensor in a
// build without WITH_CUDA, or any non-CUDA tensor, raises via AT_ERROR.
// Unlike the forward pass, there is no CPU fallback here.
at::Tensor ROIAlign_backward(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) {
  if (grad.type().is_cuda()) {
#ifdef WITH_CUDA
    return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: disparity/csrc/ROIPool.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif std::tuple ROIPool_forward(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor ROIPool_backward(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width) { if (grad.type().is_cuda()) { #ifdef WITH_CUDA return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: disparity/csrc/SigmoidFocalLoss.h ================================================ #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif // Interface for Python at::Tensor SigmoidFocalLoss_forward( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } at::Tensor SigmoidFocalLoss_backward( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha) { if (logits.type().is_cuda()) { #ifdef WITH_CUDA return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); #else 
AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: disparity/csrc/cpu/ROIAlign_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include "cpu/vision.h" // implementation taken from Caffe2 template struct PreCalc { int pos1; int pos2; int pos3; int pos4; T w1; T w2; T w3; T w4; }; template void pre_calc_for_bilinear_interpolate( const int height, const int width, const int pooled_height, const int pooled_width, const int iy_upper, const int ix_upper, T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < iy_upper; iy++) { const T yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < ix_upper; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T x = xx; T y = yy; // deal with: inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty PreCalc pc; pc.pos1 = 0; pc.pos2 = 0; pc.pos3 = 0; pc.pos4 = 0; pc.w1 = 0; pc.w2 = 0; pc.w3 = 0; pc.w4 = 0; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; continue; } if (y <= 0) { y = 0; } if (x <= 0) { x = 0; } int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. 
- lx; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; // save weights and indeces PreCalc pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; pc.pos3 = y_high * width + x_low; pc.pos4 = y_high * width + x_high; pc.w1 = w1; pc.w2 = w2; pc.w3 = w3; pc.w4 = w4; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; } } } } } template void ROIAlignForward_cpu_kernel( const int nthreads, const T* bottom_data, const T& spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* bottom_rois, //int roi_cols, T* top_data) { //AT_ASSERT(roi_cols == 4 || roi_cols == 5); int roi_cols = 5; int n_rois = nthreads / channels / pooled_width / pooled_height; // (n, c, ph, pw) is an element in the pooled output // can be parallelized using omp // #pragma omp parallel for num_threads(32) for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; // roi could have 4 or 5 columns const T* offset_bottom_rois = bottom_rois + n * roi_cols; int roi_batch_ind = 0; if (roi_cols == 5) { roi_batch_ind = offset_bottom_rois[0]; offset_bottom_rois++; } // Do not using rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[0] * spatial_scale; T roi_start_h = offset_bottom_rois[1] * spatial_scale; T roi_end_w = offset_bottom_rois[2] * spatial_scale; T roi_end_h = offset_bottom_rois[3] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / 
static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 // we want to precalculate indeces and weights shared by all chanels, // this is the key point of optimiation std::vector> pre_calc( roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); pre_calc_for_bilinear_interpolate( height, width, pooled_height, pooled_width, roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, pre_calc); for (int c = 0; c < channels; c++) { int index_n_c = index_n + c * pooled_width * pooled_height; const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { int index = index_n_c + ph * pooled_width + pw; T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { PreCalc pc = pre_calc[pre_calc_index]; output_val += pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] + pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4]; pre_calc_index += 1; } } output_val /= count; top_data[index] = output_val; } // for pw } // for ph } // for c } // for n } at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); 
auto width = input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { ROIAlignForward_cpu_kernel( output_size, input.data(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois.data(), output.data()); }); return output; } ================================================ FILE: disparity/csrc/cpu/nms_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include "cpu/vision.h" template at::Tensor nms_cpu_kernel(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); } auto x1_t = dets.select(1, 0).contiguous(); auto y1_t = dets.select(1, 1).contiguous(); auto x2_t = dets.select(1, 2).contiguous(); auto y2_t = dets.select(1, 3).contiguous(); at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto ndets = dets.size(0); at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); auto suppressed = suppressed_t.data(); auto order = order_t.data(); auto x1 = x1_t.data(); auto y1 = y1_t.data(); auto x2 = x2_t.data(); auto y2 = y2_t.data(); auto areas = areas_t.data(); for (int64_t _i = 0; _i < ndets; _i++) { auto i = order[_i]; if (suppressed[i] == 1) continue; auto ix1 = x1[i]; auto iy1 = y1[i]; auto ix2 = x2[i]; auto iy2 = y2[i]; auto iarea = areas[i]; for (int64_t _j = _i + 1; _j < ndets; 
_j++) { auto j = order[_j]; if (suppressed[j] == 1) continue; auto xx1 = std::max(ix1, x1[j]); auto yy1 = std::max(iy1, y1[j]); auto xx2 = std::min(ix2, x2[j]); auto yy2 = std::min(iy2, y2[j]); auto w = std::max(static_cast(0), xx2 - xx1 + 1); auto h = std::max(static_cast(0), yy2 - yy1 + 1); auto inter = w * h; auto ovr = inter / (iarea + areas[j] - inter); if (ovr >= threshold) suppressed[j] = 1; } } return at::nonzero(suppressed_t == 0).squeeze(1); } at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { at::Tensor result; AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { result = nms_cpu_kernel(dets, scores, threshold); }); return result; } ================================================ FILE: disparity/csrc/cpu/vision.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold); ================================================ FILE: disparity/csrc/cuda/BuildCostVolume_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
#include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __device__ T bilinear_interpolate(const T* bottom_data, const int height, const int width, T y, T x) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { //empty return 0; } if (y <= 0) y = 0; if (x <= 0) x = 0; int y_low = (int) y; int x_low = (int) x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T) y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T) x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation T v1 = bottom_data[y_low * width + x_low]; T v2 = bottom_data[y_low * width + x_high]; T v3 = bottom_data[y_high * width + x_low]; T v4 = bottom_data[y_high * width + x_high]; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __global__ void BuildCostVolumeForward(const int nthreads, const T* left, const T* right, const T* shift, const int num_batch, const int channels, const int height, const int width, const int max_disp, T* cost) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int pw = index % width; int ph = (index / width) % height; int pd = (index / width / height) % max_disp; int c = (index / width / height/ max_disp) % channels; int n = index / width / height / max_disp / channels; int index_L = (((n * 2 * channels + c) * max_disp + pd) * height + ph) * width + pw; int index_R = index_L + channels * max_disp * height * width; T shift_pd = -shift[n * max_disp + pd]; cost[index_L] = left[((n * channels + c) * height + ph) * width + pw]; if (pw + shift_pd >= 0. 
&& pw + shift_pd <= width - 1) { const T* offset_right = right + (n * channels + c) * height * width; cost[index_R] = bilinear_interpolate(offset_right, height, width, (T)ph, (T)pw + shift_pd); } else { cost[index_R] = 0.; } } } template __device__ void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T & w1, T & w2, T & w3, T & w4, int & x_low, int & x_high, int & y_low, int & y_high) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { //empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y <= 0) y = 0; if (x <= 0) x = 0; y_low = (int) y; x_low = (int) x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T) y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T) x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // reference in forward // T v1 = bottom_data[y_low * width + x_low]; // T v2 = bottom_data[y_low * width + x_high]; // T v3 = bottom_data[y_high * width + x_low]; // T v4 = bottom_data[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } template __global__ void BuildCostVolumeBackwardFeature(const int nthreads, const T* grad, const T* shift, const int num_batch, const int channels, const int height, const int width, const int max_disp, T* grad_left, T* grad_right) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int pw = index % width; int ph = (index / width) % height; int pd = (index / width / height) % max_disp; int c = (index / width / height/ max_disp) % channels; int n = index / width / height / max_disp / channels; int index_L = (((n * 2 * channels + c) * max_disp + pd) * height + ph) * width + pw; int index_R = index_L + channels * max_disp * height * width; T shift_pd = -shift[n * max_disp + pd]; // left atomicAdd(grad_left + 
((n * channels + c) * height + ph) * width + pw, static_cast(grad[index_L])); if (pw + shift_pd >= 0. && pw + shift_pd <= width - 1) { // right T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, (T) ph, (T) pw + shift_pd, w1, w2, w3, w4, x_low, x_high, y_low, y_high); T top_diff_this_bin = grad[index_R]; T g1 = top_diff_this_bin * w1; T g2 = top_diff_this_bin * w2; T g3 = top_diff_this_bin * w3; T g4 = top_diff_this_bin * w4; T* offset_grad_right = grad_right + (n * channels + c) * height * width; if (w1 >= 1e-10) atomicAdd(offset_grad_right + y_low * width + x_low, static_cast(g1)); if (w2 >= 1e-10) atomicAdd(offset_grad_right + y_low * width + x_high, static_cast(g2)); if (w3 >= 1e-10) atomicAdd(offset_grad_right + y_high * width + x_low, static_cast(g3)); if (w4 >= 1e-10) atomicAdd(offset_grad_right + y_high * width + x_high, static_cast(g4)); } } // CUDA_1D_KERNEL_LOOP } // BuildCostVolumeBackward at::Tensor BuildCostVolume_forward_cuda(const at::Tensor& left, const at::Tensor& right, const at::Tensor& shift) { AT_ASSERTM(left.type().is_cuda(), "left must be a CUDA tensor"); AT_ASSERTM(right.type().is_cuda(), "right must be a CUDA tensor"); AT_ASSERTM(shift.type().is_cuda(), "shift must be a CUDA tensor"); AT_ASSERTM((left.size(0) == right.size(0)) && (left.size(1) == right.size(1)) && \ (left.size(2) == right.size(2)) && (left.size(3) == right.size(3)), \ "Left image and right image should match their size."); AT_ASSERTM(left.size(0) == shift.size(0), \ "Image and shift should of same batch."); auto num_batch = left.size(0); auto channels = left.size(1); auto height = left.size(2); auto width = left.size(3); auto max_disp = shift.size(1); auto output = at::empty({num_batch, channels * 2, max_disp, height, width}, left.options()); auto output_size = num_batch * channels * 2 * max_disp * height * width; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)(output_size / 2), 
512L), 4096L)); dim3 block(512); if (output.numel() == 0) { THCudaCheck(cudaGetLastError()); return output; } AT_DISPATCH_FLOATING_TYPES(left.type(), "BuildCostVolume_forward", [&] { BuildCostVolumeForward<<>>( output_size / 2, left.contiguous().data(), right.contiguous().data(), shift.contiguous().data(), num_batch, channels, height, width, max_disp, output.data()); }); THCudaCheck(cudaGetLastError()); return output; } // TODO remove the dependency on input and use instead its sizes -> save memory std::tuple BuildCostVolume_backward_cuda(const at::Tensor& grad, const at::Tensor& shift) { AT_ASSERTM(shift.type().is_cuda(), "shift must be a CUDA tensor"); auto num_batch = grad.size(0); auto channels = grad.size(1) / 2; auto height = grad.size(3); auto width = grad.size(4); auto max_disp = shift.size(1); auto grad_left = at::zeros({num_batch, channels, height, width}, grad.options()); auto grad_right = at::zeros({num_batch, channels, height, width}, grad.options()); AT_ASSERTM(grad.numel() == num_batch * channels * 2 * max_disp * height * width, "grad shape is wrong"); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return std::make_tuple(grad_left, grad_right); } AT_DISPATCH_FLOATING_TYPES(grad.type(), "BuildCostVolume_backward", [&] { BuildCostVolumeBackwardFeature<<>>( grad.numel() / 2, grad.contiguous().data(), shift.contiguous().data(), num_batch, channels, height, width, max_disp, grad_left.data(), grad_right.data()); }); THCudaCheck(cudaGetLastError()); return std::make_tuple(grad_left, grad_right); } ================================================ FILE: disparity/csrc/cuda/ROIAlign_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
#include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __device__ T bilinear_interpolate(const T* bottom_data, const int height, const int width, T y, T x, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { //empty return 0; } if (y <= 0) y = 0; if (x <= 0) x = 0; int y_low = (int) y; int x_low = (int) x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T) y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T) x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation T v1 = bottom_data[y_low * width + x_low]; T v2 = bottom_data[y_low * width + x_high]; T v3 = bottom_data[y_high * width + x_low]; T v4 = bottom_data[y_high * width + x_high]; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __global__ void RoIAlignForward(const int nthreads, const T* bottom_data, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* bottom_rois, T* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[1] * spatial_scale; T roi_start_h = 
offset_bottom_rois[2] * spatial_scale; T roi_end_w = offset_bottom_rois[3] * spatial_scale; T roi_end_h = offset_bottom_rois[4] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, (T)1.); T roi_height = max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix ++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); output_val += val; } } output_val /= count; top_data[index] = output_val; } } template __device__ void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T & w1, T & w2, T & w3, T & w4, int & x_low, int & x_high, int & y_low, int & y_high, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { //empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y <= 0) y = 0; if (x <= 0) x = 0; y_low = (int) y; x_low = (int) x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T) y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T) x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. 
- lx; // reference in forward // T v1 = bottom_data[y_low * width + x_low]; // T v2 = bottom_data[y_low * width + x_high]; // T v3 = bottom_data[y_high * width + x_low]; // T v4 = bottom_data[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } template __global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, T* bottom_diff, const T* bottom_rois) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start_w = offset_bottom_rois[1] * spatial_scale; T roi_start_h = offset_bottom_rois[2] * spatial_scale; T roi_end_w = offset_bottom_rois[3] * spatial_scale; T roi_end_h = offset_bottom_rois[4] * spatial_scale; // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, (T)1.); T roi_height = max(roi_end_h - roi_start_h, (T)1.); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T* 
offset_top_diff = top_diff + top_offset; const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix ++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); T g1 = top_diff_this_bin * w1 / count; T g2 = top_diff_this_bin * w2 / count; T g3 = top_diff_this_bin * w3 / count; T g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); } // if } // ix } // iy } // CUDA_1D_KERNEL_LOOP } // RoIAlignBackward at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = 
input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); dim3 block(512); if (output.numel() == 0) { THCudaCheck(cudaGetLastError()); return output; } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { RoIAlignForward<<>>( output_size, input.contiguous().data(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois.contiguous().data(), output.data()); }); THCudaCheck(cudaGetLastError()); return output; } // TODO remove the dependency on input and use instead its sizes -> save memory at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) { AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return grad_input; } AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { RoIAlignBackwardFeature<<>>( grad.numel(), grad.contiguous().data(), num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, grad_input.data(), rois.contiguous().data()); }); THCudaCheck(cudaGetLastError()); return grad_input; } ================================================ FILE: disparity/csrc/cuda/ROIPool_cuda.cu 
================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const T* bottom_rois, T* top_data, int* argmax_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); // Force malformed ROIs to be 1x1 int roi_width = max(roi_end_w - roi_start_w + 1, 1); int roi_height = max(roi_end_h - roi_start_h + 1, 1); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); // Add roi offsets and clip to input boundaries hstart = min(max(hstart + roi_start_h, 0), height); hend = min(max(hend + roi_start_h, 0), height); wstart = min(max(wstart + roi_start_w, 0), width); wend = min(max(wend + roi_start_w, 0), 
width); bool is_empty = (hend <= hstart) || (wend <= wstart); // Define an empty pooling region to be zero T maxval = is_empty ? 0 : -FLT_MAX; // If nothing is pooled, argmax = -1 causes nothing to be backprop'd int maxidx = -1; const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { int bottom_index = h * width + w; if (offset_bottom_data[bottom_index] > maxval) { maxval = offset_bottom_data[bottom_index]; maxidx = bottom_index; } } } top_data[index] = maxval; argmax_data[index] = maxidx; } } template __global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, const int* argmax_data, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, T* bottom_diff, const T* bottom_rois) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; int bottom_offset = (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T* offset_top_diff = top_diff + top_offset; T* offset_bottom_diff = bottom_diff + bottom_offset; const int* offset_argmax_data = argmax_data + top_offset; int argmax = offset_argmax_data[ph * pooled_width + pw]; if (argmax != -1) { atomicAdd( offset_bottom_diff + argmax, static_cast(offset_top_diff[ph * pooled_width + pw])); } } } std::tuple ROIPool_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width) { AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt)); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); dim3 block(512); if (output.numel() == 0) { THCudaCheck(cudaGetLastError()); return std::make_tuple(output, argmax); } AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { RoIPoolFForward<<>>( output_size, input.contiguous().data(), spatial_scale, channels, height, width, pooled_height, pooled_width, rois.contiguous().data(), output.data(), argmax.data()); }); THCudaCheck(cudaGetLastError()); return std::make_tuple(output, argmax); } // TODO remove the dependency on input and use instead its sizes -> save memory at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width) { AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); // TODO add more checks auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return grad_input; } AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { 
RoIPoolFBackward<<>>( grad.numel(), grad.contiguous().data(), argmax.data(), num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, grad_input.data(), rois.contiguous().data()); }); THCudaCheck(cudaGetLastError()); return grad_input; } ================================================ FILE: disparity/csrc/cuda/SigmoidFocalLoss_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu // Cheng-Yang Fu // cyfu@cs.unc.edu #include #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __global__ void SigmoidFocalLossForward(const int nthreads, const T* logits, const int* targets, const int num_classes, const float gamma, const float alpha, const int num, T* losses) { CUDA_1D_KERNEL_LOOP(i, nthreads) { int n = i / num_classes; int d = i % num_classes; // current class[0~79]; int t = targets[n]; // target class [1~80]; // Decide it is positive or negative case. T c1 = (t == (d+1)); T c2 = (t>=0 & t != (d+1)); T zn = (1.0 - alpha); T zp = (alpha); // p = 1. / 1. + expf(-x); p = sigmoid(x) T p = 1. / (1. + expf(-logits[i])); // (1-p)**gamma * log(p) where T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); // p**gamma * log(1-p) T term2 = powf(p, gamma) * (-1. * logits[i] * (logits[i] >= 0) - logf(1. + expf(logits[i] - 2. 
* logits[i] * (logits[i] >= 0)))); losses[i] = 0.0; losses[i] += -c1 * term1 * zp; losses[i] += -c2 * term2 * zn; } // CUDA_1D_KERNEL_LOOP } // SigmoidFocalLossForward template __global__ void SigmoidFocalLossBackward(const int nthreads, const T* logits, const int* targets, const T* d_losses, const int num_classes, const float gamma, const float alpha, const int num, T* d_logits) { CUDA_1D_KERNEL_LOOP(i, nthreads) { int n = i / num_classes; int d = i % num_classes; // current class[0~79]; int t = targets[n]; // target class [1~80], 0 is background; // Decide it is positive or negative case. T c1 = (t == (d+1)); T c2 = (t>=0 & t != (d+1)); T zn = (1.0 - alpha); T zp = (alpha); // p = 1. / 1. + expf(-x); p = sigmoid(x) T p = 1. / (1. + expf(-logits[i])); // (1-p)**g * (1 - p - g*p*log(p) T term1 = powf((1. - p), gamma) * (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); // (p**g) * (g*(1-p)*log(1-p) - p) T term2 = powf(p, gamma) * ((-1. * logits[i] * (logits[i] >= 0) - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * (1. 
- p) * gamma - p); d_logits[i] = 0.0; d_logits[i] += -c1 * term1 * zp; d_logits[i] += -c2 * term2 * zn; d_logits[i] = d_logits[i] * d_losses[i]; } // CUDA_1D_KERNEL_LOOP } // SigmoidFocalLossBackward at::Tensor SigmoidFocalLoss_forward_cuda( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha) { AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); auto losses_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L)); dim3 block(512); if (losses.numel() == 0) { THCudaCheck(cudaGetLastError()); return losses; } AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { SigmoidFocalLossForward<<>>( losses_size, logits.contiguous().data(), targets.contiguous().data(), num_classes, gamma, alpha, num_samples, losses.data()); }); THCudaCheck(cudaGetLastError()); return losses; } at::Tensor SigmoidFocalLoss_backward_cuda( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha) { AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); auto d_logits_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 
grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L)); dim3 block(512); if (d_logits.numel() == 0) { THCudaCheck(cudaGetLastError()); return d_logits; } AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { SigmoidFocalLossBackward<<>>( d_logits_size, logits.contiguous().data(), targets.contiguous().data(), d_losses.contiguous().data(), num_classes, gamma, alpha, num_samples, d_logits.data()); }); THCudaCheck(cudaGetLastError()); return d_logits; } ================================================ FILE: disparity/csrc/cuda/nms.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include #include #include #include #include int const threadsPerBlock = sizeof(unsigned long long) * 8; __device__ inline float devIoU(float const * const a, float const * const b) { float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); float interS = width * height; float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); return interS / (Sa + Sb - interS); } __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, const float *dev_boxes, unsigned long long *dev_mask) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + 
threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const float *cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } // boxes is a N x 5 tensor at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { using scalar_t = float; AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); auto scores = boxes.select(1, 4); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto boxes_sorted = boxes.index_select(0, order_t); int boxes_num = boxes.size(0); const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); scalar_t* boxes_dev = boxes_sorted.data(); THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState unsigned long long* mask_dev = NULL; //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, // boxes_num * col_blocks * sizeof(unsigned long long))); mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), THCCeilDiv(boxes_num, threadsPerBlock)); dim3 threads(threadsPerBlock); nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev); std::vector mask_host(boxes_num * col_blocks); THCudaCheck(cudaMemcpy(&mask_host[0], mask_dev, sizeof(unsigned long long) * boxes_num * col_blocks, cudaMemcpyDeviceToHost)); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long 
long) * col_blocks); at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); int64_t* keep_out = keep.data(); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / threadsPerBlock; int inblock = i % threadsPerBlock; if (!(remv[nblock] & (1ULL << inblock))) { keep_out[num_to_keep++] = i; unsigned long long *p = &mask_host[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } } } THCudaFree(state, mask_dev); // TODO improve this part return std::get<0>(order_t.index({ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( order_t.device(), keep.scalar_type()) }).sort(0, false)); } ================================================ FILE: disparity/csrc/cuda/vision.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include at::Tensor SigmoidFocalLoss_forward_cuda( const at::Tensor& logits, const at::Tensor& targets, const int num_classes, const float gamma, const float alpha); at::Tensor SigmoidFocalLoss_backward_cuda( const at::Tensor& logits, const at::Tensor& targets, const at::Tensor& d_losses, const int num_classes, const float gamma, const float alpha); at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio); at::Tensor ROIDisp_forward_cuda(const at::Tensor& input, const at::Tensor& input_R, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int max_disp); std::tuple ROIDisp_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float 
spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int max_disp); std::tuple ROIPool_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width); at::Tensor BuildCostVolume_forward_cuda(const at::Tensor& left, const at::Tensor& right, const at::Tensor& shift); at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, const at::Tensor& input, const at::Tensor& rois, const at::Tensor& argmax, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width); std::tuple BuildCostVolume_backward_cuda(const at::Tensor& grad, const at::Tensor& left); at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); at::Tensor compute_flow_cuda(const at::Tensor& boxes, const int height, const int width); ================================================ FILE: disparity/csrc/nms.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #pragma once #include "cpu/vision.h" #ifdef WITH_CUDA #include "cuda/vision.h" #endif at::Tensor nms(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { if (dets.type().is_cuda()) { #ifdef WITH_CUDA // TODO raise error if not compiled with CUDA if (dets.numel() == 0) return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); auto b = at::cat({dets, scores.unsqueeze(1)}, 1); return nms_cuda(b, threshold); #else AT_ERROR("Not compiled with GPU support"); #endif } at::Tensor result = nms_cpu(dets, scores, threshold); return result; } ================================================ FILE: disparity/csrc/vision.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class myImageFloder(data.Dataset):
    """KITTI stereo dataset yielding image pairs with left disparity.

    Args:
        left: sequence of left-image paths.
        right: sequence of right-image paths, aligned with ``left``.
        left_disparity: sequence of left disparity-map paths.
        left_norm: sequence of paths whose last three characters are replaced
            by ``'npy'`` to locate the per-pixel normal array (training only).
        training: when true, return randomly cropped and augmented samples;
            otherwise return a fixed bottom-right crop.
        loader: callable ``path -> PIL image`` for the colour images.
        dploader: callable ``path -> PIL image`` for the disparity map.
    """

    def __init__(self, left, right, left_disparity, left_norm, training,
                 loader=default_loader, dploader=disparity_loader):
        self.left = left
        self.right = right
        self.disp_L = left_disparity
        self.norm_L = left_norm
        self.loader = loader
        self.dploader = dploader
        self.npy_loader = npy_loader
        self.training = training

    def __getitem__(self, index):
        """Return one sample.

        Training: ``[left, right, disparity, normals]`` where the images and
        the disparity carry a leading singleton dimension and all four are
        tensors.  Evaluation: ``[left, right, disparity, disparity]`` (the
        disparity array is returned twice, as in the original code).
        """
        left_img = self.loader(self.left[index])
        right_img = self.loader(self.right[index])
        dataL = self.dploader(self.disp_L[index])

        if self.training:
            # The normal map is only consumed by the training branch, so it is
            # loaded here rather than unconditionally.
            normL = self.npy_loader(self.norm_L[index][:-3] + 'npy')

            w, h = left_img.size
            th, tw = 320, 1152

            # Same random window for both views, the disparity and the normals.
            x1 = random.randint(0, w - tw)
            y1 = random.randint(0, h - th)
            crop_box = (x1, y1, x1 + tw, y1 + th)

            left_img = left_img.crop(crop_box)
            right_img = right_img.crop(crop_box)

            # Stored disparity values are divided by 256 to obtain the values
            # used for training.
            dataL = np.ascontiguousarray(dataL, dtype=np.float32) / 256
            dataL = dataL[y1:y1 + th, x1:x1 + tw]
            normL = normL[y1:y1 + th, x1:x1 + tw, :]

            processed = preprocess.get_transform(augment=True)
            left_img = processed(left_img)
            right_img = processed(right_img)

            # Patch-replacement augmentation on the right view: copy one
            # random rectangle of the image over another of the same size.
            # The sizes are cast to int because random.randint() rejects
            # non-integer bounds on Python 3.12+ (np.floor returns a float).
            patch_h = int(np.floor(np.random.uniform(50, 180)))
            patch_w = int(np.floor(np.random.uniform(50, 250)))
            dst_row = random.randint(0, th - patch_h)
            dst_col = random.randint(0, tw - patch_w)
            src_row = random.randint(0, th - patch_h)
            src_col = random.randint(0, tw - patch_w)
            # clone() so the copy is well defined when the source and
            # destination rectangles overlap.
            patch = right_img[:, src_row:src_row + patch_h,
                              src_col:src_col + patch_w].clone()
            right_img[:, dst_row:dst_row + patch_h,
                      dst_col:dst_col + patch_w] = patch

            return [left_img.unsqueeze(0), right_img.unsqueeze(0),
                    torch.tensor(dataL).unsqueeze(0), torch.tensor(normL)]

        # Evaluation: deterministic bottom-right 1152x320 crop of everything.
        w, h = left_img.size
        crop_box = (w - 1152, h - 320, w, h)
        left_img = left_img.crop(crop_box)
        right_img = right_img.crop(crop_box)

        dataL = dataL.crop(crop_box)
        dataL = np.ascontiguousarray(dataL, dtype=np.float32) / 256

        processed = preprocess.get_transform(augment=False)
        left_img = processed(left_img)
        right_img = processed(right_img)
        return [left_img, right_img, dataL, dataL]

    def __len__(self):
        """Number of stereo pairs in the dataset."""
        return len(self.left)
def dataloader(filepath, arg=False):
    """List KITTI 2012 training/validation file paths.

    Args:
        filepath: dataset root; it is concatenated directly with the
            sub-folder names, so it must end with a path separator.
        arg: unused; kept for backward compatibility with existing callers.

    Returns:
        An 8-tuple of path lists
        ``(left_train, right_train, disp_train, norm_train,
        left_val, right_val, disp_val, norm_val)``.
        The training lists share one random order; the validation lists are
        in sorted file-name order.
    """
    left_fold = 'colored_0/'
    right_fold = 'colored_1/'
    disp_noc = 'disp_occ/'
    disp_norm = 'dispnorm_occ/'

    # os.listdir() order is filesystem dependent.  The validation split below
    # is defined by *position* in this list, so sort it to make the split
    # reproducible across machines.
    image = sorted(img for img in os.listdir(filepath + left_fold)
                   if img.find('_10') > -1)

    # Positions (in the sorted listing) that are held out for validation.
    valist = frozenset([
        1, 15, 39, 65, 101, 113, 134, 154, 175, 4, 16, 40, 66, 102, 118, 139,
        156, 180, 5, 19, 52, 82, 104, 119, 143, 157, 181, 9, 25, 56, 85, 105,
        120, 145, 161, 186, 11, 29, 60, 89, 107, 122, 148, 167, 188, 12, 31,
        63, 95, 108, 128, 151, 170, 14, 32, 64, 97, 112, 132, 153, 171,
    ])

    train = []
    val = []
    for position, name in enumerate(image):
        if position in valist:
            val.append(name)
        else:
            train.append(name)

    random.shuffle(train)

    left_train = [filepath + left_fold + img for img in train]
    right_train = [filepath + right_fold + img for img in train]
    disp_train = [filepath + disp_noc + img for img in train]
    norm_train = [filepath + disp_norm + img for img in train]

    left_val = [filepath + left_fold + img for img in val]
    right_val = [filepath + right_fold + img for img in val]
    disp_val = [filepath + disp_noc + img for img in val]
    norm_val = [filepath + disp_norm + img for img in val]

    return (left_train, right_train, disp_train, norm_train,
            left_val, right_val, disp_val, norm_val)
def dataloader(filepath):
    """List KITTI 2015 training/validation file paths.

    Every ``*_10*`` frame found under ``image_2/`` goes to the training
    split; the validation split is currently empty.

    Args:
        filepath: dataset root; it is concatenated directly with the
            sub-folder names, so it must end with a path separator.

    Returns:
        An 8-tuple of path lists
        ``(left_train, right_train, disp_train_L, norm_train_L,
        left_val, right_val, disp_val_L, norm_val_L)``.
        The training lists share one random order.
    """
    left_fold = 'image_2/'
    right_fold = 'image_3/'
    disp_L = 'disp_occ_0/'
    disp_norm = 'dispnorm_occ/'

    # No frames are held out: with an empty validation list, every listed
    # frame is a training frame.
    val = []
    train = [img for img in os.listdir(filepath + left_fold)
             if img.find('_10') > -1]

    random.shuffle(train)

    left_train = [filepath + left_fold + img for img in train]
    right_train = [filepath + right_fold + img for img in train]
    disp_train_L = [filepath + disp_L + img for img in train]
    norm_train_L = [filepath + disp_norm + img for img in train]

    left_val = [filepath + left_fold + img for img in val]
    right_val = [filepath + right_fold + img for img in val]
    disp_val_L = [filepath + disp_L + img for img in val]
    norm_val_L = [filepath + disp_norm + img for img in val]

    return (left_train, right_train, disp_train_L, norm_train_L,
            left_val, right_val, disp_val_L, norm_val_L)
def disparity_loader(path):
    """Load a disparity map, masking out-of-image matches, with disk caching.

    Three files are derived from ``path`` by taking everything before its
    first ``'.'`` as a prefix:

    * ``<prefix>_exception_assign_minus_1.npy`` - the masked result (cache),
    * ``<prefix>.npy`` - the raw disparity as a NumPy array,
    * ``<prefix>.pfm`` - the raw disparity in PFM format.

    If the masked cache exists it is returned directly.  Otherwise the raw
    map is read (from ``.npy`` if present, else from ``.pfm`` which is then
    also saved as ``.npy``), every pixel whose disparity exceeds its column
    index is set to ``-1``, and the result is cached and returned.

    NOTE: the prefix is ``path.split('.')[0]``, so a path containing a dot
    before the extension (e.g. ``./dir/x.pfm``) yields a truncated prefix.
    This is kept because other code in this file derives the cache name the
    same way.
    """
    path_prefix = path.split('.')[0]
    path1 = path_prefix + '_exception_assign_minus_1.npy'
    path2 = path_prefix + '.npy'
    path3 = path_prefix + '.pfm'

    if os.path.exists(path1):
        return np.load(path1)

    if os.path.exists(path2):
        data = np.load(path2)
    else:
        from readpfm import readPFM
        data, _ = readPFM(path3)
        np.save(path2, data)

    # A left-image pixel in column j with disparity d matches column j - d in
    # the right image; j - d < 0 (i.e. d > j) falls outside the image.
    # Vectorized form of the original per-pixel loop.
    columns = np.arange(data.shape[1])
    data[data > columns] = -1

    np.save(path1, data)
    return data
class myImageFloder(data.Dataset):
    """SceneFlow stereo dataset with optional right-view disparity.

    Args:
        left: sequence of left-image paths.
        right: sequence of right-image paths, aligned with ``left``.
        left_disparity: sequence of left disparity (PFM) paths.
        right_disparity: sequence of right disparity paths, or ``None`` when
            right-view ground truth is not used.
        training: when true a random 960x320 window is cropped; otherwise the
            fixed top-left 960x512 window is used.
        loader: callable ``path -> H x W x C array`` for the colour images.
        dploader: callable ``path -> (disparity array, scale)``.
    """

    def __init__(self, left, right, left_disparity, right_disparity, training,
                 loader=default_loader, dploader=disparity_loader):
        self.left = left
        self.right = right
        self.disp_L = left_disparity
        self.loader = loader
        self.dploader = dploader
        self.training = training
        self.disp_R = right_disparity
        print('len', len(self.left))

    def __getitem__(self, index):
        """Return ``(left, right, dispL)`` or ``(left, right, dispL, dispR)``.

        The four-element form is returned only when the dataset was built
        with a right-disparity list.
        """
        # Bound unconditionally: the checks below previously raised
        # UnboundLocalError when no right disparity list was given.
        dataR = None
        if self.disp_R is not None:
            dataR, _ = self.dploader(self.disp_R[index])
            dataR = np.ascontiguousarray(dataR, dtype=np.float32)

        left_img = self.loader(self.left[index])
        right_img = self.loader(self.right[index])
        dataL, _ = self.dploader(self.disp_L[index])
        dataL = np.ascontiguousarray(dataL, dtype=np.float32)

        if self.training:
            h = left_img.shape[0]
            w = left_img.shape[1]
            th, tw = 320, 960
            x1 = random.randint(0, w - tw)
            y1 = random.randint(0, h - th)
        else:
            th, tw = 512, 960
            x1 = 0
            y1 = 0

        # Identical window for both views and both disparity maps.
        rows = slice(y1, y1 + th)
        cols = slice(x1, x1 + tw)
        left_img = left_img[rows, cols, :]
        right_img = right_img[rows, cols, :]
        dataL = dataL[rows, cols]
        if dataR is not None:
            dataR = dataR[rows, cols]

        processed = preprocess.get_transform(augment=False)

        # Image and disparity go through the transform together (as image
        # and mask) so both receive the same treatment.
        left_out = processed(image=left_img, mask=dataL, bboxes=[], category_id=[])
        left_img = left_out['image']
        dataL = left_out['mask']

        if dataR is None:
            right_img = processed(image=right_img, mask=None, bboxes=[],
                                  category_id=[])['image']
            return left_img, right_img, dataL

        right_out = processed(image=right_img, mask=dataR, bboxes=[], category_id=[])
        right_img = right_out['image']
        dataR = right_out['mask']
        return left_img, right_img, dataL, dataR

    def __len__(self):
        """Number of stereo pairs in the dataset."""
        return len(self.left)
class myImageFloder(data.Dataset):
    """SceneFlow stereo pairs with left disparity, loaded through PIL.

    ``left``, ``right`` and ``left_disparity`` are index-aligned path
    sequences.  ``loader`` turns an image path into a PIL image and
    ``dploader`` turns a disparity path into ``(array, scale)``.
    """

    def __init__(self, left, right, left_disparity, training, loader=default_loader, dploader=disparity_loader):
        self.training = training
        self.loader = loader
        self.dploader = dploader
        self.left = left
        self.right = right
        self.disp_L = left_disparity

    def __getitem__(self, index):
        """Return ``(left_tensor, right_tensor, disparity)`` for one pair."""
        left_img = self.loader(self.left[index])
        right_img = self.loader(self.right[index])
        raw_disp, _scale = self.dploader(self.disp_L[index])
        dataL = np.ascontiguousarray(raw_disp, dtype=np.float32)

        w, h = left_img.size
        if self.training:
            # Random 512x256 window, applied to the images and the disparity.
            th, tw = 256, 512
            x1 = random.randint(0, w - tw)
            y1 = random.randint(0, h - th)
            box = (x1, y1, x1 + tw, y1 + th)
            dataL = dataL[y1:y1 + th, x1:x1 + tw]
        else:
            # Fixed bottom-right 960x544 window on the images only; the
            # disparity array is returned uncropped.
            box = (w - 960, h - 544, w, h)

        left_img = left_img.crop(box)
        right_img = right_img.crop(box)

        processed = preprocess.get_transform(augment=False)
        left_img = processed(left_img)
        right_img = processed(right_img)
        return left_img, right_img, dataL

    def __len__(self):
        """Number of stereo pairs."""
        return len(self.left)
def disparity_loader(path):
    """Load a PFM disparity map, masking out-of-image matches, with caching.

    The part of ``path`` before its first ``'.'`` is used as a prefix:

    * ``<prefix>_exception_assign_minus_1.npy`` - masked result (cache),
    * ``<prefix>.npy`` - raw disparity, written as a side product,
    * ``<prefix>.pfm`` - the PFM file that is actually read.

    If the masked cache exists it is returned directly.  Otherwise the PFM
    is read, saved as ``.npy``, every pixel whose disparity exceeds its
    column index is set to ``-1``, and the result is cached and returned.
    """
    path_prefix = path.split('.')[0]
    path1 = path_prefix + '_exception_assign_minus_1.npy'
    path2 = path_prefix + '.npy'
    path3 = path_prefix + '.pfm'

    if os.path.exists(path1):
        return np.load(path1)

    from readpfm import readPFM
    data, _ = readPFM(path3)
    np.save(path2, data)

    # A left-image pixel in column j with disparity d matches column j - d in
    # the right image; j - d < 0 (i.e. d > j) falls outside the image.
    # Vectorized form of the original per-pixel loop.
    columns = np.arange(data.shape[1])
    data[data > columns] = -1

    np.save(path1, data)
    return data
def dataloader(filepath):
    """Return sorted left/right test image paths found under ``filepath``.

    Files in ``left/`` whose name contains ``_left`` and files in ``right/``
    whose name contains ``_right`` are collected; each list is sorted by
    file name and prefixed with ``filepath`` plus the folder name.
    ``filepath`` is concatenated as-is, so it must end with a separator.
    """
    left_fold = 'left/'
    right_fold = 'right/'

    left_names = sorted(
        name for name in os.listdir(filepath + left_fold) if '_left' in name)
    right_names = sorted(
        name for name in os.listdir(filepath + right_fold) if '_right' in name)

    left_test = [filepath + left_fold + name for name in left_names]
    right_test = [filepath + right_fold + name for name in right_names]
    return left_test, right_test
print(classes) print('img',image) print('disp', disp) monkaa_path = filepath + [x for x in image if 'monkaa' in x][0] monkaa_disp = filepath + [x for x in disp if 'monkaa' in x][0] monkaa_dir = os.listdir(monkaa_path) all_left_img = [] all_right_img = [] all_left_disp = [] all_right_disp = [] test_left_img = [] test_right_img = [] test_left_disp = [] for dd in monkaa_dir: for im in os.listdir(monkaa_path + '/' + dd + '/left/'): if is_image_file(monkaa_path + '/' + dd + '/left/' + im): all_left_img.append(monkaa_path + '/' + dd + '/left/' + im) all_left_disp.append(monkaa_disp + '/' + dd + '/left/' + im.split(".")[0] + '.pfm') all_right_disp.append(monkaa_disp + '/' + dd + '/right/' + im.split(".")[0] + '.pfm') for im in os.listdir(monkaa_path + '/' + dd + '/right/'): if is_image_file(monkaa_path + '/' + dd + '/right/' + im): all_right_img.append(monkaa_path + '/' + dd + '/right/' + im) flying_path = filepath + [x for x in image if x == 'frames_cleanpass'][0] flying_disp = filepath + [x for x in disp if x == 'frames_disparity'][0] flying_dir = flying_path + '/TRAIN/' subdir = ['A', 'B', 'C'] for ss in subdir: flying = os.listdir(flying_dir + ss) for ff in flying: imm_l = os.listdir(flying_dir + ss + '/' + ff + '/left/') for im in imm_l: if is_image_file(flying_dir + ss + '/' + ff + '/left/' + im): all_left_img.append(flying_dir + ss + '/' + ff + '/left/' + im) all_left_disp.append(flying_disp + '/TRAIN/' + ss + '/' + ff + '/left/' + im.split(".")[0] + '.pfm') all_right_disp.append(flying_disp + '/TRAIN/' + ss + '/' + ff + '/right/' + im.split(".")[0] + '.pfm') if is_image_file(flying_dir + ss + '/' + ff + '/right/' + im): all_right_img.append(flying_dir + ss + '/' + ff + '/right/' + im) flying_dir = flying_path + '/TEST/' subdir = ['A', 'B', 'C'] # subdir = ['C'] # print('*****************') for ss in subdir: flying = os.listdir(flying_dir + ss) for ff in flying: imm_l = os.listdir(flying_dir + ss + '/' + ff + '/left/') for im in imm_l: if is_image_file(flying_dir + 
def _collect_stereo_frames(img_root, disp_root, scene,
                           left_imgs, right_imgs, left_disps, right_disps):
    """Append the paths of every frame in one scene to the four lists.

    Frame names are taken from ``<img_root>/<scene>/left/``; the right image
    and both disparity paths are derived from the same name, so the four
    lists always stay index-aligned.
    """
    img_scene = img_root + '/' + scene
    disp_scene = disp_root + '/' + scene
    for im in os.listdir(img_scene + '/left/'):
        if not is_image_file(img_scene + '/left/' + im):
            continue
        stem = im.split(".")[0]
        left_imgs.append(img_scene + '/left/' + im)
        right_imgs.append(img_scene + '/right/' + im)
        left_disps.append(disp_scene + '/left/' + stem + '.pfm')
        right_disps.append(disp_scene + '/right/' + stem + '.pfm')


def dataloader(filepath):
    """List SceneFlow image and disparity paths.

    Expects ``<filepath>/frames_cleanpass/{monkaa,flyingthings3D,driving}``
    and the matching ``<filepath>/disparity/...`` trees.  Monkaa, the
    FlyingThings3D ``TRAIN`` split and Driving form the training set; the
    FlyingThings3D ``TEST`` split forms the test set.

    Returns:
        ``(all_left_img, all_right_img, all_left_disp, all_right_disp,
        test_left_img, test_right_img, test_left_disp, test_right_disp)``,
        eight path lists; the four lists of each split are index-aligned.
    """
    all_left_img = []
    all_right_img = []
    all_left_disp = []
    all_right_disp = []
    test_left_img = []
    test_right_img = []
    test_left_disp = []
    test_right_disp = []

    train_lists = (all_left_img, all_right_img, all_left_disp, all_right_disp)
    test_lists = (test_left_img, test_right_img, test_left_disp, test_right_disp)

    # monkaa
    # Right images are derived from the left listing (as the other subsets
    # already did) instead of a second, independently ordered os.listdir().
    monkaa_path = filepath + '/frames_cleanpass/monkaa'
    monkaa_disp = filepath + '/disparity/monkaa'
    for dd in os.listdir(monkaa_path):
        _collect_stereo_frames(monkaa_path, monkaa_disp, dd, *train_lists)

    # flyingthings: TRAIN feeds the training lists, TEST the test lists.
    flying_path = filepath + '/frames_cleanpass/flyingthings3D'
    flying_disp = filepath + '/disparity/flyingthings3D'
    for split, lists in (('TRAIN', train_lists), ('TEST', test_lists)):
        for ss in ['A', 'B', 'C']:
            for ff in os.listdir(flying_path + '/' + split + '/' + ss):
                _collect_stereo_frames(flying_path, flying_disp,
                                       split + '/' + ss + '/' + ff, *lists)

    # driving
    driving_path = filepath + '/frames_cleanpass/driving'
    driving_disp = filepath + '/disparity/driving'
    for i in ['15mm_focallength', '35mm_focallength']:
        for j in ['scene_backwards', 'scene_forwards']:
            for k in ['fast', 'slow']:
                _collect_stereo_frames(driving_path, driving_disp,
                                       i + '/' + j + '/' + k, *train_lists)

    return (all_left_img, all_right_img, all_left_disp, all_right_disp,
            test_left_img, test_right_img, test_left_disp, test_right_disp)
def readPFM(file):
    """Read a PFM (Portable Float Map) image.

    Args:
        file: path of the PFM file.

    Returns:
        ``(data, scale)`` where ``data`` is a float array of shape
        ``(height, width, 3)`` for a colour file (``PF`` header) or
        ``(height, width)`` for a greyscale file (``Pf`` header), flipped
        vertically relative to the order stored in the file, and ``scale``
        is the absolute value of the scale field in the header.

    Raises:
        Exception: if the magic line is neither ``PF`` nor ``Pf``, or the
            dimension line is malformed.
    """
    # The context manager guarantees the handle is closed even when a header
    # check below raises.
    with open(file, 'rb') as stream:
        header = stream.readline().rstrip()
        if header == b'PF':
            color = True
        elif header == b'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(r'^(\d+)\s(\d+)\s$', stream.readline().decode('utf-8'))
        if dim_match:
            width, height = map(int, dim_match.groups())
        else:
            raise Exception('Malformed PFM header.')

        # The sign of the scale field encodes the byte order of the samples.
        scale = float(stream.readline().rstrip())
        if scale < 0:  # little-endian
            endian = '<'
            scale = -scale
        else:
            endian = '>'  # big-endian

        data = np.fromfile(stream, endian + 'f')

    shape = (height, width, 3) if color else (height, width)
    data = np.reshape(data, shape)
    # Reverse the row order so that the last row stored in the file becomes
    # the first row of the returned array.
    data = np.flipud(data)
    return data, scale
================================================ ================================================ FILE: disparity/eval/kitti/README.md ================================================ Reference:
https://github.com/prclibo/kitti_eval # kitti_eval `evaluate_object_3d_offline.cpp` evaluates your KITTI detections locally on your own computer using validation data selected from the KITTI training dataset, with the following metrics: - overlap on image (AP) - oriented overlap on image (AOS) - overlap on ground-plane (AP) - overlap in 3D (AP) Compile `evaluate_object_3d_offline.cpp` with Boost and the Linux `dirent.h` header as dependencies (you should already have the latter on most Linux systems). Run the evaluation by: ./evaluate_object_3d_offline groundtruth_dir result_dir Note that you don't have to run detection over all of the KITTI training data. The evaluator only evaluates samples whose result files exist. ### Updates - June, 2017: * Fixed the bug of detection box filtering based on min height according to KITTI's note on 25.04.2017. ================================================ FILE: disparity/eval/kitti/compile.sh ================================================ #/bin/bash g++ -o evaluate_object_3d_offline evaluate_object_3d_offline.cpp ================================================ FILE: disparity/eval/kitti/eval.sh ================================================ echo "evalutating $1 ..." ./evaluate_object_3d_offline /mnt/backup/project/ylchen/dataset/KITTI_DATASET/kitti_detection/training/label_2 $1 ================================================ FILE: disparity/eval/kitti/eval_05.sh ================================================ echo "evalutating $1 ..."
./evaluate_object_3d_offline_05 ../../../data/kitti/training/label_2/ $1 ================================================ FILE: disparity/eval/kitti/evaluate_object_3d_offline.cpp ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mail.h" BOOST_GEOMETRY_REGISTER_C_ARRAY_CS(cs::cartesian) typedef boost::geometry::model::polygon > Polygon; using namespace std; /*======================================================================= STATIC EVALUATION PARAMETERS =======================================================================*/ // holds the number of test images on the server const int32_t N_TESTIMAGES = 7518; // easy, moderate and hard evaluation level enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2}; // evaluation metrics: image, ground or 3D enum METRIC{IMAGE=0, GROUND=1, BOX3D=2}; // evaluation parameter const int32_t MIN_HEIGHT[3] = {40, 25, 25}; // minimum height for evaluated groundtruth/detections const int32_t MAX_OCCLUSION[3] = {0, 1, 2}; // maximum occlusion level of the groundtruth used for evaluation const double MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation // evaluated object classes enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2}; const int NUM_CLASS = 3; // parameters varying per class vector CLASS_NAMES; // the minimum overlap required for 2D evaluation on the image/ground plane and 3D evaluation //const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.25, 0.25, 0.25}, {0.25, 0.25, 0.25}}; //const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}}; const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}}; // no. 
of recall steps that should be evaluated (discretized) const double N_SAMPLE_PTS = 41; // initialize class names void initGlobals () { CLASS_NAMES.push_back("car"); CLASS_NAMES.push_back("pedestrian"); CLASS_NAMES.push_back("cyclist"); } /*======================================================================= DATA TYPES FOR EVALUATION =======================================================================*/ // holding data needed for precision-recall and precision-aos struct tPrData { vector v; // detection score for computing score thresholds double similarity; // orientation similarity int32_t tp; // true positives int32_t fp; // false positives int32_t fn; // false negatives tPrData () : similarity(0), tp(0), fp(0), fn(0) {} }; // holding bounding boxes for ground truth and detections struct tBox { string type; // object type as car, pedestrian or cyclist,... double x1; // left corner double y1; // top corner double x2; // right corner double y2; // bottom corner double alpha; // image orientation tBox (string type, double x1,double y1,double x2,double y2,double alpha) : type(type),x1(x1),y1(y1),x2(x2),y2(y2),alpha(alpha) {} }; // holding ground truth data struct tGroundtruth { tBox box; // object type, box, orientation double truncation; // truncation 0..1 int32_t occlusion; // occlusion 0,1,2 (non, partly, fully) double ry; double t1, t2, t3; double h, w, l; tGroundtruth () : box(tBox("invalild",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1) {} tGroundtruth (tBox box,double truncation,int32_t occlusion) : box(box),truncation(truncation),occlusion(occlusion) {} tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) : box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion) {} }; // holding detection data struct tDetection { tBox box; // object type, box, orientation double thresh; // detection score double ry; double t1, t2, t3; double h, w, l; tDetection (): 
box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000) {} tDetection (tBox box,double thresh) : box(box),thresh(thresh) {} tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) : box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh) {} }; /*======================================================================= FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS =======================================================================*/ vector indices; vector loadDetections(string file_name, bool &compute_aos, vector &eval_image, vector &eval_ground, vector &eval_3d, bool &success) { // holds all detections (ignored detections are indicated by an index vector vector detections; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return detections; } while (!feof(fp)) { tDetection d; double trash; char str[255]; if (fscanf(fp, "%s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &trash, &trash, &d.box.alpha, &d.box.x1, &d.box.y1, &d.box.x2, &d.box.y2, &d.h, &d.w, &d.l, &d.t1, &d.t2, &d.t3, &d.ry, &d.thresh)==16) { // d.thresh = 1; d.box.type = str; detections.push_back(d); // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid if(d.box.alpha == -10) compute_aos = false; // a class is only evaluated if it is detected at least once for (int c = 0; c < NUM_CLASS; c++) { if (!strcasecmp(d.box.type.c_str(), CLASS_NAMES[c].c_str())) { if (!eval_image[c] && d.box.x1 >= 0) eval_image[c] = true; if (!eval_ground[c] && d.t1 != -1000) eval_ground[c] = true; if (!eval_3d[c] && d.t2 != -1000) eval_3d[c] = true; break; } } } } fclose(fp); success = true; return detections; } vector loadGroundtruth(string file_name,bool &success) { // holds all ground truth (ignored ground truth is indicated by an index vector vector groundtruth; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return groundtruth; } while (!feof(fp)) { tGroundtruth g; char 
str[255]; if (fscanf(fp, "%s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &g.truncation, &g.occlusion, &g.box.alpha, &g.box.x1, &g.box.y1, &g.box.x2, &g.box.y2, &g.h, &g.w, &g.l, &g.t1, &g.t2, &g.t3, &g.ry )==15) { g.box.type = str; groundtruth.push_back(g); } } fclose(fp); success = true; return groundtruth; } void saveStats (const vector &precision, const vector &aos, FILE *fp_det, FILE *fp_ori) { // save precision to file if(precision.empty()) return; for (int32_t i=0; i Polygon toPolygon(const T& g) { using namespace boost::numeric::ublas; using namespace boost::geometry; matrix mref(2, 2); mref(0, 0) = cos(g.ry); mref(0, 1) = sin(g.ry); mref(1, 0) = -sin(g.ry); mref(1, 1) = cos(g.ry); static int count = 0; matrix corners(2, 4); double data[] = {g.l / 2, g.l / 2, -g.l / 2, -g.l / 2, g.w / 2, -g.w / 2, -g.w / 2, g.w / 2}; std::copy(data, data + 8, corners.data().begin()); matrix gc = prod(mref, corners); for (int i = 0; i < 4; ++i) { gc(0, i) += g.t1; gc(1, i) += g.t3; } double points[][2] = {{gc(0, 0), gc(1, 0)},{gc(0, 1), gc(1, 1)},{gc(0, 2), gc(1, 2)},{gc(0, 3), gc(1, 3)},{gc(0, 0), gc(1, 0)}}; Polygon poly; append(poly, points); return poly; } // measure overlap between bird's eye view bounding boxes, parametrized by (ry, l, w, tx, tz) inline double groundBoxOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double inter_area = in.empty() ? 
0 : area(in.front()); double union_area = area(un.front()); double o; if(criterion==-1) // union o = inter_area / union_area; else if(criterion==0) // bbox_a o = inter_area / area(dp); else if(criterion==1) // bbox_b o = inter_area / area(gp); return o; } // measure overlap between 3D bounding boxes, parametrized by (ry, h, w, l, tx, ty, tz) inline double box3DOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double ymax = min(d.t2, g.t2); double ymin = max(d.t2 - d.h, g.t2 - g.h); double inter_area = in.empty() ? 0 : area(in.front()); double inter_vol = inter_area * max(0.0, ymax - ymin); double det_vol = d.h * d.l * d.w; double gt_vol = g.h * g.l * g.w; double o; if(criterion==-1) // union o = inter_vol / (det_vol + gt_vol - inter_vol); else if(criterion==0) // bbox_a o = inter_vol / det_vol; else if(criterion==1) // bbox_b o = inter_vol / gt_vol; return o; } vector getThresholds(vector &v, double n_groundtruth){ // holds scores needed to compute N_SAMPLE_PTS recall values vector t; // sort scores in descending order // (highest score is assumed to give best/most confident detections) sort(v.begin(), v.end(), greater()); // get scores for linearly spaced recall double current_recall = 0; for(int32_t i=0; i >, const vector &det, vector &ignored_gt, vector &dc, vector &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){ // extract ground truth bounding boxes for current evaluation class for(int32_t i=0;iMAX_OCCLUSION[difficulty] || gt[i].truncation>MAX_TRUNCATION[difficulty] || height >, const vector &det, const vector &dc, const vector &ignored_gt, const vector &ignored_det, bool compute_fp, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), METRIC metric, bool compute_aos=false, double thresh=0, bool debug=false){ tPrData stat = tPrData(); const double NO_DETECTION = -10000000; vector delta; 
// holds angular difference for TPs (needed for AOS evaluation) vector assigned_detection; // holds wether a detection was assigned to a valid or ignored ground truth assigned_detection.assign(det.size(), false); vector ignored_threshold; ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed // detections with a low score are ignored for computing precision (needs FP) if(compute_fp) for(int32_t i=0; i 0.5) (logical len(det)) =======================================================================*/ int32_t det_idx = -1; double valid_detection = NO_DETECTION; double max_overlap = 0; // search for a possible detection bool assigned_ignored_det = false; for(int32_t j=0; jMIN_OVERLAP[metric][current_class] && det[j].thresh>valid_detection){ det_idx = j; valid_detection = det[j].thresh; } // for computing pr curve values, the candidate with the greatest overlap is considered // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){ max_overlap = overlap; det_idx = j; valid_detection = 1; assigned_ignored_det = false; } else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){ det_idx = j; valid_detection = 1; assigned_ignored_det = true; } } /*======================================================================= compute TP, FP and FN =======================================================================*/ // nothing was assigned to this valid ground truth if(valid_detection==NO_DETECTION && ignored_gt[i]==0) { stat.fn++; } // only evaluate valid ground truth <=> detection assignments (considering difficulty level) else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1)) assigned_detection[det_idx] = true; // found a valid true positive 
else if(valid_detection!=NO_DETECTION){ // write highest score to threshold vector stat.tp++; stat.v.push_back(det[det_idx].thresh); // compute angular difference of detection and ground truth if valid detection orientation was provided if(compute_aos) delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha); // clean up assigned_detection[det_idx] = true; } } // if FP are requested, consider stuff area if(compute_fp){ // count fp for(int32_t i=0; iMIN_OVERLAP[metric][current_class]){ assigned_detection[j] = true; nstuff++; } } } // FP = no. of all not to ground truth assigned detections - detections assigned to stuff areas stat.fp -= nstuff; // if all orientation values are valid, the AOS is computed if(compute_aos){ vector tmp; // FP have a similarity of 0, for all TP compute AOS tmp.assign(stat.fp, 0); for(int32_t i=0; i0 || stat.fp>0) stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0); // there was neither a FP nor a TP, so the similarity is ignored in the evaluation else stat.similarity = -1; } } return stat; } /*======================================================================= EVALUATE CLASS-WISE =======================================================================*/ bool eval_class (FILE *fp_det, FILE *fp_ori, CLASSES current_class, const vector< vector > &groundtruth, const vector< vector > &detections, bool compute_aos, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), vector &precision, vector &aos, DIFFICULTY difficulty, METRIC metric) { assert(groundtruth.size() == detections.size()); // init int32_t n_gt=0; // total no. 
of gt (denominator of recall) vector v, thresholds; // detection scores, evaluated for recall discretization vector< vector > ignored_gt, ignored_det; // index of ignored gt detection for current class/difficulty vector< vector > dontcare; // index of dontcare areas, included in ground truth // for all test images do for (int32_t i=0; i i_gt, i_det; vector dc; // only evaluate objects of current class and ignore occluded, truncated objects cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty); ignored_gt.push_back(i_gt); ignored_det.push_back(i_det); dontcare.push_back(dc); // compute statistics to get recall values tPrData pr_tmp = tPrData(); pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false, boxoverlap, metric); // add detection scores to vector over all images for(int32_t j=0; j pr; pr.assign(thresholds.size(),tPrData()); for (int32_t i=0; i recall; precision.assign(N_SAMPLE_PTS, 0); if(compute_aos) aos.assign(N_SAMPLE_PTS, 0); double r=0; for (int32_t i=0; i vals[],bool is_aos, FILE* res_fp){ char command[1024]; // save plot data to file FILE *fp = fopen((dir_name + "/" + file_name + ".txt").c_str(),"w"); printf("save %s\n", (dir_name + "/" + file_name + ".txt").c_str()); for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++) fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]); fclose(fp); float sum[3] = {0, 0, 0}; for (int v = 0; v < 3; ++v) for (int i = 0; i < vals[v].size(); i = i + 4) sum[v] += vals[v][i]; printf("%s AP: %f %f %f\n", file_name.c_str(), sum[0] / 11 * 100, sum[1] / 11 * 100, sum[2] / 11 * 100); fprintf(res_fp, "%s AP: %f %f %f\n", file_name.c_str(), sum[0] / 11 * 100, sum[1] / 11 * 100, sum[2] / 11 * 100); // create png + eps for (int32_t j=0; j<2; j++) { // open file FILE *fp = fopen((dir_name + "/" + file_name + ".gp").c_str(),"w"); // save gnuplot instructions if (j==0) { fprintf(fp,"set term png size 450,315 font \"Helvetica\" 
11\n"); fprintf(fp,"set output \"%s.png\"\n",file_name.c_str()); } else { fprintf(fp,"set term postscript eps enhanced color font \"Helvetica\" 20\n"); fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str()); } // set labels and ranges fprintf(fp,"set size ratio 0.7\n"); fprintf(fp,"set xrange [0:1]\n"); fprintf(fp,"set yrange [0:1]\n"); fprintf(fp,"set xlabel \"Recall\"\n"); if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n"); else fprintf(fp,"set ylabel \"Orientation Similarity\"\n"); obj_type[0] = toupper(obj_type[0]); fprintf(fp,"set title \"%s\"\n",obj_type.c_str()); // line width int32_t lw = 5; if (j==0) lw = 3; // plot error curve fprintf(fp,"plot "); fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw); // close file fclose(fp); // run gnuplot => create png + eps sprintf(command,"cd %s; gnuplot %s",dir_name.c_str(),(file_name + ".gp").c_str()); system(command); } // create pdf and crop sprintf(command,"cd %s; ps2pdf %s.eps %s_large.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; pdfcrop %s_large.pdf %s.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; rm %s_large.pdf",dir_name.c_str(),file_name.c_str()); system(command); } vector getEvalIndices(const string& result_dir) { DIR* dir; dirent* entity; dir = opendir(result_dir.c_str()); if (dir) { while (entity = readdir(dir)) { string path(entity->d_name); int32_t len = path.size(); if (len < 10) continue; int32_t index = atoi(path.substr(len - 10, 10).c_str()); indices.push_back(index); } } return indices; } bool eval(string gt_dir, string result_dir, Mail* mail){ // set some global parameters initGlobals(); // ground truth and result directories // string gt_dir = 
"data/object/label_2"; // string result_dir = "results/" + result_sha; string plot_dir = result_dir + "/plot"; FILE* res_fp = fopen((result_dir + "/result.txt").c_str(), "w"); // create output directories system(("mkdir " + plot_dir).c_str()); // hold detections and ground truth in memory vector< vector > groundtruth; vector< vector > detections; // holds wether orientation similarity shall be computed (might be set to false while loading detections) // and which labels where provided by this submission bool compute_aos=true; vector eval_image(NUM_CLASS, false); vector eval_ground(NUM_CLASS, false); vector eval_3d(NUM_CLASS, false); // for all images read groundtruth and detections mail->msg("Loading detections..."); std::vector indices = getEvalIndices(result_dir + "/data/" ); printf("number of files for evaluation: %d\n", (int)indices.size()); fprintf(res_fp, "number of files for evaluation: %d\n", (int)indices.size()); for (int32_t i=0; i gt = loadGroundtruth(gt_dir + "/" + file_name,gt_success); vector det = loadDetections(result_dir + "/data/" + file_name, compute_aos, eval_image, eval_ground, eval_3d, det_success); groundtruth.push_back(gt); detections.push_back(det); // check for errors if (!gt_success) { mail->msg("ERROR: Couldn't read: %s of ground truth. 
Please write me an email!", file_name); return false; } if (!det_success) { mail->msg("ERROR: Couldn't read: %s", file_name); return false; } } mail->msg(" done."); // holds pointers for result files FILE *fp_det=0, *fp_ori=0; // eval image 2D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_image[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection.txt").c_str(), "w"); if(compute_aos) fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_orientation.txt").c_str(),"w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[0], aos[0], EASY, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[1], aos[1], MODERATE, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[2], aos[2], HARD, IMAGE)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection", CLASS_NAMES[c], precision, 0, res_fp); if(compute_aos){ saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_orientation", CLASS_NAMES[c], aos, 1, res_fp); fclose(fp_ori); } } } printf("Finished 2D bounding box eval.\n"); // don't evaluate AOS for birdview boxes and 3D boxes compute_aos = false; // eval bird's eye view bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_ground[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_ground.txt").c_str(), "w"); vector precision[3], aos[3]; printf("Going to eval ground for class: %s\n", CLASS_NAMES[c].c_str()); if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[0], aos[0], EASY, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[1], aos[1], MODERATE, GROUND) || 
!eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[2], aos[2], HARD, GROUND)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_ground", CLASS_NAMES[c], precision, 0, res_fp); } } printf("Finished Birdeye eval.\n"); // eval 3D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_3d[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_3d.txt").c_str(), "w"); vector precision[3], aos[3]; printf("Going to eval 3D box for class: %s\n", CLASS_NAMES[c].c_str()); if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[0], aos[0], EASY, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[1], aos[1], MODERATE, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[2], aos[2], HARD, BOX3D)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_3d", CLASS_NAMES[c], precision, 0, res_fp); } } printf("Finished 3D bounding box eval.\n"); fclose(res_fp); // success return true; } int32_t main (int32_t argc,char *argv[]) { // we need 2 or 4 arguments! 
if (argc!=3) { cout << "Usage: ./eval_detection_3d_offline gt_dir result_dir" << endl; return 1; } // read arguments string gt_dir = argv[1]; string result_dir = argv[2]; // init notification mail Mail *mail; mail = new Mail(); mail->msg("Thank you for participating in our evaluation!"); // run evaluation if (eval(gt_dir, result_dir, mail)) { mail->msg("Your evaluation results are available at:"); mail->msg(result_dir.c_str()); } else { system(("rm -r " + result_dir + "/plot").c_str()); mail->msg("An error occured while processing your results."); } // send mail and exit delete mail; return 0; } ================================================ FILE: disparity/eval/kitti/mail.h ================================================ #ifndef MAIL_H #define MAIL_H #include #include #include class Mail { public: Mail (std::string email = "") { if (email.compare("")) { mail = popen("/usr/lib/sendmail -t -f noreply@cvlibs.net","w"); fprintf(mail,"To: %s\n", email.c_str()); fprintf(mail,"From: noreply@cvlibs.net\n"); fprintf(mail,"Subject: KITTI Evaluation Benchmark\n"); fprintf(mail,"\n\n"); } else { mail = 0; } } ~Mail() { if (mail) { pclose(mail); } } void msg (const char *format, ...) { va_list args; va_start(args,format); if (mail) { vfprintf(mail,format,args); fprintf(mail,"\n"); } vprintf(format,args); printf("\n"); va_end(args); } private: FILE *mail; }; #endif ================================================ FILE: disparity/eval/kitti-object-eval-python/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: disparity/eval/kitti-object-eval-python/LICENSE ================================================ MIT License Copyright (c) 2018 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: disparity/eval/kitti-object-eval-python/README.md ================================================ # kitti-object-eval-python Fast KITTI object detection evaluation in Python (finishes in less than 10 seconds); supports 2d/bev/3d/aos and coco-style AP. If you use the command-line interface, numba needs some time to compile the jit functions. _WARNING_: The "coco" metric isn't an official metric. Only "AP (Average Precision)" is. ## Dependencies Only Python 3.6+ is supported; requires `numpy`, `skimage`, `numba`, `fire`, `scipy`. If you have Anaconda, just install `cudatoolkit` in Anaconda. Otherwise, please refer to this [page](https://github.com/numba/numba#custom-python-environments) to set up llvm and cuda for numba. * Install by conda: ``` conda install -c numba cudatoolkit=x.x (8.0, 9.0, 10.0, depend on your environment) ``` ## Usage * command-line interface: ``` python evaluate.py evaluate --label_path=/path/to/your_gt_label_folder --result_path=/path/to/your_result_folder --label_split_file=/path/to/val.txt --current_class=0 --coco=False ``` * Python interface: ```Python import kitti_common as kitti from eval import get_official_eval_result, get_coco_eval_result def _read_imageset_file(path): with open(path, 'r') as f: lines = f.readlines() return [int(line) for line in lines] det_path = "/path/to/your_result_folder" dt_annos = kitti.get_label_annos(det_path) gt_path = "/path/to/your_gt_label_folder" gt_split_file = "/path/to/val.txt" # from https://xiaozhichen.github.io/files/mv3d/imagesets.tar.gz val_image_ids = _read_imageset_file(gt_split_file) gt_annos = kitti.get_label_annos(gt_path, val_image_ids) print(get_official_eval_result(gt_annos, dt_annos, 0)) # 6s in my computer print(get_coco_eval_result(gt_annos, dt_annos, 0)) # 18s in my computer ``` ================================================ FILE: disparity/eval/kitti-object-eval-python/eval.py ================================================ import io as sysio import
time import numba import numpy as np from scipy.interpolate import interp1d from rotate_iou import rotate_iou_gpu_eval def get_mAP(prec): sums = 0 for i in range(0, len(prec), 4): sums += prec[i] return sums / 11 * 100 @numba.jit def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): scores.sort() scores = scores[::-1] current_recall = 0 thresholds = [] for i, score in enumerate(scores): l_recall = (i + 1) / num_gt if i < (len(scores) - 1): r_recall = (i + 2) / num_gt else: r_recall = l_recall if (((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1))): continue # recall = l_recall thresholds.append(score) current_recall += 1 / (num_sample_pts - 1.0) # print(len(thresholds), len(scores), num_gt) return thresholds def clean_data(gt_anno, dt_anno, current_class, difficulty): CLASS_NAMES = [ 'car', 'pedestrian', 'cyclist', 'van', 'person_sitting', 'car', 'tractor', 'trailer' ] MIN_HEIGHT = [40, 25, 25] MAX_OCCLUSION = [0, 1, 2] MAX_TRUNCATION = [0.15, 0.3, 0.5] dc_bboxes, ignored_gt, ignored_dt = [], [], [] current_cls_name = CLASS_NAMES[current_class].lower() num_gt = len(gt_anno["name"]) num_dt = len(dt_anno["name"]) num_valid_gt = 0 for i in range(num_gt): bbox = gt_anno["bbox"][i] gt_name = gt_anno["name"][i].lower() height = bbox[3] - bbox[1] valid_class = -1 if (gt_name == current_cls_name): valid_class = 1 elif (current_cls_name == "Pedestrian".lower() and "Person_sitting".lower() == gt_name): valid_class = 0 elif (current_cls_name == "Car".lower() and "Van".lower() == gt_name): valid_class = 0 else: valid_class = -1 ignore = False if ((gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty]) or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty]) or (height <= MIN_HEIGHT[difficulty])): # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1: ignore = True if valid_class == 1 and not ignore: ignored_gt.append(0) num_valid_gt += 1 elif (valid_class == 0 or (ignore and (valid_class == 1))): 
@numba.jit(nopython=True, parallel=True)
def d3_box_overlap_kernel(boxes,
                          qboxes,
                          rinc,
                          criterion=-1,
                          z_axis=1,
                          z_center=1.0):
    """Turn the pairwise values in ``rinc`` into 3D overlap ratios, in place.

    For every pair ``(i, j)`` with ``rinc[i, j] > 0`` the overlap of the two
    boxes along ``z_axis`` is computed, multiplied by ``rinc[i, j]`` and
    divided by a normaliser chosen by ``criterion``. Nothing is returned;
    ``rinc`` is overwritten.

    Args:
        boxes: 2D array, one row per box. Column ``z_axis`` is read as the
            coordinate along the height axis and column ``z_axis + 3`` as
            the box extent along that axis; columns 3, 4 and 5 are
            multiplied together as the box size.
            NOTE(review): presumably rows are [loc(3), dims(3), rot] --
            confirm against the caller.
        qboxes: 2D array of query boxes with the same column layout.
        rinc: 2D array indexed ``[i, j]`` over (boxes, qboxes).
            NOTE(review): presumably the bird's-eye-view intersection area
            produced by rotate_iou_gpu_eval -- confirm.
        criterion: -1 normalises by ``area1 + area2 - inc``, 0 by ``area1``,
            1 by ``area2``; any other value leaves the value unnormalised
            (divides by 1.0).
        z_axis: the z (height) axis.
        z_center: unified z (height) center of box. A box is treated as
            spanning from ``coord - extent * z_center`` up to
            ``coord + extent * (1 - z_center)`` along ``z_axis``.
    """
    N, K = boxes.shape[0], qboxes.shape[0]
    for i in range(N):
        for j in range(K):
            # Pairs whose incoming value is not positive are left untouched.
            if rinc[i, j] > 0:
                # Smaller of the two upper ends along the height axis.
                min_z = min(
                    boxes[i, z_axis] + boxes[i, z_axis + 3] * (1 - z_center),
                    qboxes[j, z_axis] + qboxes[j, z_axis + 3] * (1 - z_center))
                # Larger of the two lower ends along the height axis.
                max_z = max(
                    boxes[i, z_axis] - boxes[i, z_axis + 3] * z_center,
                    qboxes[j, z_axis] - qboxes[j, z_axis + 3] * z_center)
                # Length of the shared interval along the height axis.
                iw = min_z - max_z
                if iw > 0:
                    # Despite the names, these are products of three size
                    # columns (columns 3, 4, 5).
                    area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5]
                    area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5]
                    inc = iw * rinc[i, j]
                    if criterion == -1:
                        ua = (area1 + area2 - inc)
                    elif criterion == 0:
                        ua = area1
                    elif criterion == 1:
                        ua = area2
                    else:
                        ua = 1.0
                    rinc[i, j] = inc / ua
                else:
                    # No overlap along the height axis: no 3D overlap.
                    rinc[i, j] = 0.0
continue if (ignored_threshold[j]): continue overlap = overlaps[j, i] dt_score = dt_scores[j] if (not compute_fp and (overlap > min_overlap) and dt_score > valid_detection): det_idx = j valid_detection = dt_score elif (compute_fp and (overlap > min_overlap) and (overlap > max_overlap or assigned_ignored_det) and ignored_det[j] == 0): max_overlap = overlap det_idx = j valid_detection = 1 assigned_ignored_det = False elif (compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1): det_idx = j valid_detection = 1 assigned_ignored_det = True if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: fn += 1 elif ((valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): assigned_detection[det_idx] = True elif valid_detection != NO_DETECTION: # only a tp add a threshold. tp += 1 # thresholds.append(dt_scores[det_idx]) thresholds[thresh_idx] = dt_scores[det_idx] thresh_idx += 1 if compute_aos: # delta.append(gt_alphas[i] - dt_alphas[det_idx]) delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] delta_idx += 1 assigned_detection[det_idx] = True if compute_fp: for i in range(det_size): if (not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i])): fp += 1 nstuff = 0 if metric == 0: overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) for i in range(dc_bboxes.shape[0]): for j in range(det_size): if (assigned_detection[j]): continue if (ignored_det[j] == -1 or ignored_det[j] == 1): continue if (ignored_threshold[j]): continue if overlaps_dt_dc[j, i] > min_overlap: assigned_detection[j] = True nstuff += 1 fp -= nstuff if compute_aos: tmp = np.zeros((fp + delta_idx, )) # tmp = [0] * fp for i in range(delta_idx): tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 # tmp.append((1.0 + np.cos(delta[i])) / 2.0) # assert len(tmp) == fp + tp # assert len(delta) == tp if tp > 0 or fp > 0: similarity = np.sum(tmp) else: similarity = -1 return tp, fp, fn, similarity, 
def get_split_parts(num, num_part):
    """Split ``num`` examples into consecutive chunk sizes.

    Args:
        num: total number of examples to distribute.
        num_part: requested number of equally sized chunks.

    Returns:
        A list of chunk sizes whose sum is ``num``. Normally this is
        ``num_part`` chunks of ``num // num_part`` items, followed by one
        extra chunk holding the remainder when the division is not exact.
        When ``num`` is smaller than ``num_part`` a single chunk ``[num]``
        is returned instead.
    """
    same_part = num // num_part
    remain_num = num % num_part
    if same_part == 0:
        # Fewer examples than parts: the plain formula would yield
        # ``num_part`` empty chunks, and the callers in this file run
        # ``np.concatenate`` on each chunk, which fails on an empty list.
        return [num]
    if remain_num == 0:
        return [same_part] * num_part
    return [same_part] * num_part + [remain_num]
""" assert len(gt_annos) == len(dt_annos) total_dt_num = np.stack([len(a["name"]) for a in dt_annos], 0) total_gt_num = np.stack([len(a["name"]) for a in gt_annos], 0) num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) parted_overlaps = [] example_idx = 0 bev_axes = list(range(3)) bev_axes.pop(z_axis) for num_part in split_parts: gt_annos_part = gt_annos[example_idx:example_idx + num_part] dt_annos_part = dt_annos[example_idx:example_idx + num_part] if metric == 0: gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) overlap_part = image_box_overlap(gt_boxes, dt_boxes) elif metric == 1: loc = np.concatenate( [a["location"][:, bev_axes] for a in gt_annos_part], 0) dims = np.concatenate( [a["dimensions"][:, bev_axes] for a in gt_annos_part], 0) rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) loc = np.concatenate( [a["location"][:, bev_axes] for a in dt_annos_part], 0) dims = np.concatenate( [a["dimensions"][:, bev_axes] for a in dt_annos_part], 0) rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) overlap_part = bev_box_overlap(gt_boxes, dt_boxes).astype(np.float64) elif metric == 2: loc = np.concatenate([a["location"] for a in gt_annos_part], 0) dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) loc = np.concatenate([a["location"] for a in dt_annos_part], 0) dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) overlap_part = d3_box_overlap( gt_boxes, dt_boxes, z_axis=z_axis, 
def _prepare_data(gt_annos, dt_annos, current_class, difficulty):
    """Collect the per-image arrays the statistics routines consume.

    For every image, ``clean_data`` is run for the given class and
    difficulty, and its outputs are converted to arrays.

    Returns:
        A 7-tuple ``(gt_datas_list, dt_datas_list, ignored_gts,
        ignored_dets, dontcares, total_dc_num, total_num_valid_gt)``:

        * gt_datas_list: per image, ``bbox`` with ``alpha`` appended as a
          last column.
        * dt_datas_list: per image, ``bbox`` with ``alpha`` and ``score``
          appended as last columns.
        * ignored_gts / ignored_dets: per image int64 arrays of the ignore
          flags returned by ``clean_data``.
        * dontcares: per image float64 array of don't-care boxes, of shape
          ``(0, 4)`` when there are none.
        * total_dc_num: array with the number of don't-care boxes per image.
        * total_num_valid_gt: sum of the valid ground-truth counts.
    """
    gt_datas_list, dt_datas_list = [], []
    ignored_gts, ignored_dets, dontcares = [], [], []
    dc_counts = []
    total_num_valid_gt = 0

    for idx, gt_anno in enumerate(gt_annos):
        dt_anno = dt_annos[idx]
        num_valid_gt, ignored_gt, ignored_det, dc_bboxes = clean_data(
            gt_anno, dt_anno, current_class, difficulty)

        ignored_gts.append(np.array(ignored_gt, dtype=np.int64))
        ignored_dets.append(np.array(ignored_det, dtype=np.int64))

        # Keep a well-formed (0, 4) array when the image has no don't-care
        # boxes so that later concatenation still works.
        if len(dc_bboxes) == 0:
            dc_array = np.zeros((0, 4)).astype(np.float64)
        else:
            dc_array = np.stack(dc_bboxes, 0).astype(np.float64)
        dc_counts.append(dc_array.shape[0])
        dontcares.append(dc_array)
        total_num_valid_gt += num_valid_gt

        gt_columns = [gt_anno["bbox"], gt_anno["alpha"][..., np.newaxis]]
        dt_columns = [
            dt_anno["bbox"],
            dt_anno["alpha"][..., np.newaxis],
            dt_anno["score"][..., np.newaxis],
        ]
        gt_datas_list.append(np.concatenate(gt_columns, 1))
        dt_datas_list.append(np.concatenate(dt_columns, 1))

    total_dc_num = np.stack(dc_counts, axis=0)
    return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares,
            total_dc_num, total_num_valid_gt)
num_parts=50): """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. Args: gt_annos: dict, must from get_label_annos() in kitti_common.py dt_annos: dict, must from get_label_annos() in kitti_common.py current_class: int, 0: car, 1: pedestrian, 2: cyclist difficulty: int. eval difficulty, 0: easy, 1: normal, 2: hard metric: eval type. 0: bbox, 1: bev, 2: 3d min_overlap: float, min overlap. official: [[0.7, 0.5, 0.5], [0.7, 0.5, 0.5], [0.7, 0.5, 0.5]] format: [metric, class]. choose one from matrix above. num_parts: int. a parameter for fast calculate algorithm Returns: dict of recall, precision and aos """ assert len(gt_annos) == len(dt_annos) num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) rets = calculate_iou_partly( dt_annos, gt_annos, metric, num_parts, z_axis=z_axis, z_center=z_center) overlaps, parted_overlaps, total_dt_num, total_gt_num = rets N_SAMPLE_PTS = 41 num_minoverlap = len(min_overlaps) num_class = len(current_classes) num_difficulty = len(difficultys) precision = np.zeros( [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) recall = np.zeros( [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) all_thresholds = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) for m, current_class in enumerate(current_classes): for l, difficulty in enumerate(difficultys): rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) = rets for k, min_overlap in enumerate(min_overlaps[:, metric, m]): thresholdss = [] for i in range(len(gt_annos)): rets = compute_statistics_jit( overlaps[i], gt_datas_list[i], dt_datas_list[i], ignored_gts[i], ignored_dets[i], dontcares[i], metric, min_overlap=min_overlap, thresh=0.0, compute_fp=False) tp, fp, fn, similarity, thresholds = rets thresholdss += 
def get_mAP_v2(prec):
    """Return the sampled average precision along the last axis, in percent.

    Every fourth slice along the last axis of ``prec`` (indices 0, 4, 8,
    ...) is added up, the total is divided by 11 and scaled by 100. All
    leading axes are preserved.
    """
    last_axis_len = prec.shape[-1]
    sampled_total = sum(prec[..., idx] for idx in range(0, last_axis_len, 4))
    return sampled_total / 11 * 100
def do_coco_style_eval(gt_annos,
                       dt_annos,
                       current_classes,
                       overlap_ranges,
                       compute_aos,
                       z_axis=1,
                       z_center=1.0):
    """Evaluate over a sweep of overlap thresholds and average the results.

    Args:
        gt_annos: ground-truth annotations, forwarded to ``do_eval_v2``.
        dt_annos: detection annotations, forwarded to ``do_eval_v2``.
        current_classes: class ids to evaluate, forwarded to ``do_eval_v2``.
        overlap_ranges: array indexed ``[range, metric, num_class]`` where
            the first axis holds ``(start, stop, num)`` for
            ``np.linspace``.
        compute_aos: whether orientation similarity is evaluated.
        z_axis: forwarded to ``do_eval_v2``.
        z_center: forwarded to ``do_eval_v2``.

    Returns:
        ``(mAP_bbox, mAP_bev, mAP_3d, mAP_aos)``, each averaged over the
        threshold axis. ``mAP_aos`` is passed through unchanged when
        ``do_eval_v2`` returns ``None`` for it.
    """
    # overlap_ranges: [range, metric, num_class]
    # The sample count is stored in a float array, but np.linspace requires
    # an integer ``num`` (a float raises TypeError on current NumPy), so it
    # is converted explicitly.
    sample_counts = np.rint(overlap_ranges[2]).astype(np.int64)
    # Size the threshold axis from the input; fall back to the historical
    # value of 10 when there is no (metric, class) entry to read it from.
    num_overlaps = int(sample_counts.flat[0]) if sample_counts.size else 10
    min_overlaps = np.zeros([num_overlaps, *overlap_ranges.shape[1:]])
    for i in range(overlap_ranges.shape[1]):
        for j in range(overlap_ranges.shape[2]):
            min_overlaps[:, i, j] = np.linspace(
                overlap_ranges[0, i, j], overlap_ranges[1, i, j],
                int(sample_counts[i, j]))
    mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval_v2(
        gt_annos,
        dt_annos,
        current_classes,
        min_overlaps,
        compute_aos,
        z_axis=z_axis,
        z_center=z_center)
    # ret: [num_class, num_diff, num_minoverlap]
    mAP_bbox = mAP_bbox.mean(-1)
    mAP_bev = mAP_bev.mean(-1)
    mAP_3d = mAP_3d.mean(-1)
    if mAP_aos is not None:
        mAP_aos = mAP_aos.mean(-1)
    return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
location, dimensions, rotation_y, score] """ overlap_mod = np.array([[0.7, 0.5, 0.5, 0.7, 0.5, 0.7, 0.7, 0.7], [0.7, 0.5, 0.5, 0.7, 0.5, 0.7, 0.7, 0.7], [0.7, 0.5, 0.5, 0.7, 0.5, 0.7, 0.7, 0.7]]) overlap_easy = np.array([[0.5, 0.5, 0.5, 0.7, 0.5, 0.5, 0.5, 0.5], [0.5, 0.25, 0.25, 0.5, 0.25, 0.5, 0.5, 0.5], [0.5, 0.25, 0.25, 0.5, 0.25, 0.5, 0.5, 0.5]]) min_overlaps = np.stack([overlap_mod, overlap_easy], axis=0) # [2, 3, 5] class_to_name = { 0: 'Car', 1: 'Pedestrian', 2: 'Cyclist', 3: 'Van', 4: 'Person_sitting', 5: 'car', 6: 'tractor', 7: 'trailer', } name_to_class = {v: n for n, v in class_to_name.items()} if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] current_classes_int = [] for curcls in current_classes: if isinstance(curcls, str): current_classes_int.append(name_to_class[curcls]) else: current_classes_int.append(curcls) current_classes = current_classes_int min_overlaps = min_overlaps[:, :, current_classes] result = '' # check whether alpha is valid compute_aos = False for anno in dt_annos: if anno['alpha'].shape[0] != 0: if anno['alpha'][0] != -10: compute_aos = True break metrics = do_eval_v3( gt_annos, dt_annos, current_classes, min_overlaps, compute_aos, difficultys, z_axis=z_axis, z_center=z_center) for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] for i in range(min_overlaps.shape[0]): mAPbbox = get_mAP_v2(metrics["bbox"]["precision"][j, :, i]) mAPbbox = ", ".join(f"{v:.2f}" for v in mAPbbox) mAPbev = get_mAP_v2(metrics["bev"]["precision"][j, :, i]) mAPbev = ", ".join(f"{v:.2f}" for v in mAPbev) mAP3d = get_mAP_v2(metrics["3d"]["precision"][j, :, i]) mAP3d = ", ".join(f"{v:.2f}" for v in mAP3d) result += print_str( (f"{class_to_name[curcls]} " "AP(Average Precision)@{:.2f}, {:.2f}, {:.2f}:".format(*min_overlaps[i, :, j]))) result += print_str(f"bbox AP:{mAPbbox}") result += print_str(f"bev AP:{mAPbev}") result += 
print_str(f"3d AP:{mAP3d}") if compute_aos: mAPaos = get_mAP_v2(metrics["bbox"]["orientation"][j, :, i]) mAPaos = ", ".join(f"{v:.2f}" for v in mAPaos) result += print_str(f"aos AP:{mAPaos}") return result def get_coco_eval_result(gt_annos, dt_annos, current_classes, z_axis=1, z_center=1.0): class_to_name = { 0: 'Car', 1: 'Pedestrian', 2: 'Cyclist', 3: 'Van', 4: 'Person_sitting', 5: 'car', 6: 'tractor', 7: 'trailer', } class_to_range = { 0: [0.5, 1.0, 0.05], 1: [0.25, 0.75, 0.05], 2: [0.25, 0.75, 0.05], 3: [0.5, 1.0, 0.05], 4: [0.25, 0.75, 0.05], 5: [0.5, 1.0, 0.05], 6: [0.5, 1.0, 0.05], 7: [0.5, 1.0, 0.05], } class_to_range = { 0: [0.5, 0.95, 10], 1: [0.25, 0.7, 10], 2: [0.25, 0.7, 10], 3: [0.5, 0.95, 10], 4: [0.25, 0.7, 10], 5: [0.5, 0.95, 10], 6: [0.5, 0.95, 10], 7: [0.5, 0.95, 10], } name_to_class = {v: n for n, v in class_to_name.items()} if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] current_classes_int = [] for curcls in current_classes: if isinstance(curcls, str): current_classes_int.append(name_to_class[curcls]) else: current_classes_int.append(curcls) current_classes = current_classes_int overlap_ranges = np.zeros([3, 3, len(current_classes)]) for i, curcls in enumerate(current_classes): overlap_ranges[:, :, i] = np.array( class_to_range[curcls])[:, np.newaxis] result = '' # check whether alpha is valid compute_aos = False for anno in dt_annos: if anno['alpha'].shape[0] != 0: if anno['alpha'][0] != -10: compute_aos = True break mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos, z_axis=z_axis, z_center=z_center) for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] o_range = np.array(class_to_range[curcls])[[0, 2, 1]] o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) result += print_str((f"{class_to_name[curcls]} " "coco 
#!/bin/bash
# Run evaluate.py on a folder of KITTI-format detection results.
#
# Usage: eval.sh RESULT_PATH [CLASS]
#   RESULT_PATH  passed to evaluate.py as --result_path
#   CLASS        passed to evaluate.py as --current_class (default: 0)
#
# The label path below is machine-specific and must be edited for the
# local dataset location.

# Expansions are quoted so that paths containing spaces or glob characters
# reach evaluate.py as a single, unmodified argument.
echo "$1"
if [ -z "$2" ] ; then
    class="0"
else
    class=$2
fi
echo "$class"
python3 evaluate.py evaluate \
    --label_path=/mnt/home/ylchen/ylchen/dataset/KITTI_DATASET/kitti_detection/training/label_2/ \
    --result_path="$1" \
    --current_class="$class" --coco=False
def _read_imageset_file(path):
    """Read an image-set file and return the ids it lists as integers.

    Args:
        path: path of a text file holding one integer id per line.

    Returns:
        A list of ints, in file order.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    with open(path, 'r') as f:
        # Blank or whitespace-only lines (e.g. stray trailing newlines)
        # would make int() raise, so they are skipped.
        return [int(line) for line in f if line.strip()]
def _extend_matrix(mat):
    """Return ``mat`` with the row ``[0, 0, 0, 1]`` appended at the bottom.

    The input is not modified; a new array is returned.
    """
    homogeneous_row = np.array([[0., 0., 0., 1.]])
    return np.concatenate([mat, homogeneous_row], axis=0)
f.readlines() P0 = np.array( [float(info) for info in lines[0].split(' ')[1:13]]).reshape( [3, 4]) P1 = np.array( [float(info) for info in lines[1].split(' ')[1:13]]).reshape( [3, 4]) P2 = np.array( [float(info) for info in lines[2].split(' ')[1:13]]).reshape( [3, 4]) P3 = np.array( [float(info) for info in lines[3].split(' ')[1:13]]).reshape( [3, 4]) if extend_matrix: P0 = _extend_matrix(P0) P1 = _extend_matrix(P1) P2 = _extend_matrix(P2) P3 = _extend_matrix(P3) image_info['calib/P0'] = P0 image_info['calib/P1'] = P1 image_info['calib/P2'] = P2 image_info['calib/P3'] = P3 R0_rect = np.array([ float(info) for info in lines[4].split(' ')[1:10] ]).reshape([3, 3]) if extend_matrix: rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) rect_4x4[3, 3] = 1. rect_4x4[:3, :3] = R0_rect else: rect_4x4 = R0_rect image_info['calib/R0_rect'] = rect_4x4 Tr_velo_to_cam = np.array([ float(info) for info in lines[5].split(' ')[1:13] ]).reshape([3, 4]) Tr_imu_to_velo = np.array([ float(info) for info in lines[6].split(' ')[1:13] ]).reshape([3, 4]) if extend_matrix: Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo) image_info['calib/Tr_velo_to_cam'] = Tr_velo_to_cam image_info['calib/Tr_imu_to_velo'] = Tr_imu_to_velo if annotations is not None: image_info['annos'] = annotations add_difficulty_to_annos(image_info) return image_info with futures.ThreadPoolExecutor(num_worker) as executor: image_infos = executor.map(map_func, image_ids) return list(image_infos) def filter_kitti_anno(image_anno, used_classes, used_difficulty=None, dontcare_iou=None): if not isinstance(used_classes, (list, tuple)): used_classes = [used_classes] img_filtered_annotations = {} relevant_annotation_indices = [ i for i, x in enumerate(image_anno['name']) if x in used_classes ] for key in image_anno.keys(): img_filtered_annotations[key] = ( image_anno[key][relevant_annotation_indices]) if used_difficulty is not None: relevant_annotation_indices = [ i for i, x in 
def filter_annos_low_score(image_annos, thresh):
    """Drop every annotation whose score is below ``thresh``.

    Args:
        image_annos: iterable of annotation dicts. Each dict must have a
            ``'score'`` entry, and every value must support indexing with
            a list of integer positions.
        thresh: minimum score to keep (entries with ``score >= thresh``
            survive).

    Returns:
        A new list with one filtered dict per input dict; the inputs are
        not modified.
    """
    filtered_annos = []
    for anno in image_annos:
        keep = [
            idx for idx, score in enumerate(anno['score']) if score >= thresh
        ]
        filtered_annos.append(
            {key: values[keep] for key, values in anno.items()})
    return filtered_annos
def add_difficulty_to_annos(info):
    """Assign a difficulty level to every annotation in ``info['annos']``.

    A box passes a level when its occlusion and truncation do not exceed
    that level's maxima and its 2D box height is strictly greater than the
    level's minimum height. Each box is labelled with the first matching
    case: 0 if it passes the easy level, 1 if it passes exactly one of
    easy/moderate, 2 if it passes exactly one of moderate/hard, otherwise
    -1.

    Args:
        info: dict with an ``'annos'`` dict providing ``'dimensions'``,
            ``'bbox'`` (indexed as ``[:, 1]`` and ``[:, 3]`` for the height),
            ``'occluded'`` and ``'truncated'``.

    Returns:
        The list of difficulty labels. The same labels are also stored as
        an int32 array under ``info['annos']['difficulty']``.
    """
    min_height = [40, 25,
                  25]  # minimum height for evaluated groundtruth/detections
    max_occlusion = [
        0, 1, 2
    ]  # maximum occlusion level of the groundtruth used for evaluation
    max_trunc = [
        0.15, 0.3, 0.5
    ]  # maximum truncation level of the groundtruth used for evaluation
    annos = info['annos']
    dims = annos['dimensions']  # lhw format
    bbox = annos['bbox']
    height = bbox[:, 3] - bbox[:, 1]
    occlusion = annos['occluded']
    truncation = annos['truncated']
    num_boxes = len(dims)

    # One mask per level (easy, moderate, hard). The builtin ``bool`` is
    # used as dtype because the ``np.bool`` alias no longer exists in
    # current NumPy releases.
    level_masks = [np.ones((num_boxes, ), dtype=bool) for _ in range(3)]
    for i, (h, o, t) in enumerate(zip(height, occlusion, truncation)):
        for level, mask in enumerate(level_masks):
            if (o > max_occlusion[level] or h <= min_height[level]
                    or t > max_trunc[level]):
                mask[i] = False
    easy_mask, moderate_mask, hard_mask = level_masks

    is_easy = easy_mask
    is_moderate = np.logical_xor(easy_mask, moderate_mask)
    is_hard = np.logical_xor(hard_mask, moderate_mask)

    diff = []
    for i in range(num_boxes):
        if is_easy[i]:
            diff.append(0)
        elif is_moderate[i]:
            diff.append(1)
        elif is_hard[i]:
            diff.append(2)
        else:
            diff.append(-1)
    annos["difficulty"] = np.array(diff, np.int32)
    return diff
def get_label_annos(label_folder,
                    image_ids=None,
                    return_image_ids=False,
                    eval_dist=None):
    """Load the label files of a folder into a list of annotation dicts.

    Args:
        label_folder: directory containing ``<6-digit id>.txt`` label files.
        image_ids: ``None`` to discover ids from the matching file names in
            ``label_folder`` (sorted ascending), a list of ids to load, or
            any other value ``n``, which is expanded to ``range(n)``.
        return_image_ids: when true, also return the list of ids used.
        eval_dist: forwarded unchanged to ``get_label_anno``.

    Returns:
        The list of annotations, one per id, or ``(annos, image_ids)`` when
        ``return_image_ids`` is true.
    """
    if image_ids is None:
        name_pattern = re.compile(r'^\d{6}.txt$')
        image_ids = sorted(
            int(label_file.stem)
            for label_file in pathlib.Path(label_folder).glob('*.txt')
            if name_pattern.match(label_file.name))
    if not isinstance(image_ids, list):
        image_ids = list(range(image_ids))

    folder = pathlib.Path(label_folder)
    annos = [
        get_label_anno(
            folder / (get_image_index_str(idx) + '.txt'), eval_dist=eval_dist)
        for idx in image_ids
    ]
    return (annos, image_ids) if return_image_ids else annos
def intersection(boxes1, boxes2, add1=False):
    """Compute pairwise intersection areas between boxes.

    Args:
        boxes1: a numpy array with shape [N, 4] holding N boxes
        boxes2: a numpy array with shape [M, 4] holding M boxes
        add1: when true, one unit is added to every overlap extent before
            clamping at zero (inclusive pixel-coordinate convention).

    Returns:
        a float numpy array with shape [N, M] representing pairwise
        intersection area
    """
    [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
    [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)

    def _overlap(lo1, hi1, lo2, hi2):
        # Extent of the pairwise overlap along one axis, clamped at zero.
        pair_min_hi = np.minimum(hi1, np.transpose(hi2))
        pair_max_lo = np.maximum(lo1, np.transpose(lo2))
        if add1:
            # Out-of-place on purpose: ``+= 1.0`` raises a casting error
            # when the boxes are stored with an integer dtype.
            pair_min_hi = pair_min_hi + 1.0
        return np.maximum(
            np.zeros(pair_max_lo.shape), pair_min_hi - pair_max_lo)

    intersect_heights = _overlap(y_min1, y_max1, y_min2, y_max2)
    intersect_widths = _overlap(x_min1, x_max1, x_min2, x_max2)
    return intersect_heights * intersect_widths
""" intersect = intersection(boxes1, boxes2, add1) area1 = area(boxes1, add1) area2 = area(boxes2, add1) union = np.expand_dims( area1, axis=1) + np.expand_dims( area2, axis=0) - intersect return intersect / union ================================================ FILE: disparity/eval/kitti-object-eval-python/rotate_iou.py ================================================ ##################### # Based on https://github.com/hongzhenwang/RRPN-revise # Licensed under The MIT License # Author: yanyan, scrin@foxmail.com ##################### import math import numba import numpy as np from numba import cuda @numba.jit(nopython=True) def div_up(m, n): return m // n + (m % n > 0) @cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) def trangle_area(a, b, c): return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 @cuda.jit('(float32[:], int32)', device=True, inline=True) def area(int_pts, num_of_inter): area_val = 0.0 for i in range(num_of_inter - 2): area_val += abs( trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], int_pts[2 * i + 4:2 * i + 6])) return area_val @cuda.jit('(float32[:], int32)', device=True, inline=True) def sort_vertex_in_convex_polygon(int_pts, num_of_inter): if num_of_inter > 0: center = cuda.local.array((2, ), dtype=numba.float32) center[:] = 0.0 for i in range(num_of_inter): center[0] += int_pts[2 * i] center[1] += int_pts[2 * i + 1] center[0] /= num_of_inter center[1] /= num_of_inter v = cuda.local.array((2, ), dtype=numba.float32) vs = cuda.local.array((16, ), dtype=numba.float32) for i in range(num_of_inter): v[0] = int_pts[2 * i] - center[0] v[1] = int_pts[2 * i + 1] - center[1] d = math.sqrt(v[0] * v[0] + v[1] * v[1]) v[0] = v[0] / d v[1] = v[1] / d if v[1] < 0: v[0] = -2 - v[0] vs[i] = v[0] j = 0 temp = 0 for i in range(1, num_of_inter): if vs[i - 1] > vs[i]: temp = vs[i] tx = int_pts[2 * i] ty = int_pts[2 * i + 1] j = i while j > 0 and vs[j - 1] > temp: vs[j] = vs[j - 1] int_pts[j * 2] = int_pts[j 
@cuda.jit(
    '(float32[:], float32[:], int32, int32, float32[:])',
    device=True,
    inline=True)
def line_segment_intersection(pts1, pts2, i, j, temp_pts):
    """Intersect edge ``i`` of one quadrilateral with edge ``j`` of another.

    Args:
        pts1: flat array of 4 corners stored as x0, y0, x1, y1, ...; edge
            ``i`` runs from corner ``i`` to corner ``(i + 1) % 4``.
        pts2: second quadrilateral, same layout; edge ``j`` likewise.
        i: edge index into ``pts1``.
        j: edge index into ``pts2``.
        temp_pts: 2-element output; receives the (x, y) intersection point
            when the function returns True and is left untouched otherwise.

    Returns:
        True when the two segments cross, False otherwise. The orientation
        tests use strict ``>`` comparisons, so configurations where a cross
        product is exactly zero fall on the "not positive" side.
    """
    # Segment AB is the edge of pts1, segment CD the edge of pts2.
    A = cuda.local.array((2, ), dtype=numba.float32)
    B = cuda.local.array((2, ), dtype=numba.float32)
    C = cuda.local.array((2, ), dtype=numba.float32)
    D = cuda.local.array((2, ), dtype=numba.float32)

    A[0] = pts1[2 * i]
    A[1] = pts1[2 * i + 1]

    B[0] = pts1[2 * ((i + 1) % 4)]
    B[1] = pts1[2 * ((i + 1) % 4) + 1]

    C[0] = pts2[2 * j]
    C[1] = pts2[2 * j + 1]

    D[0] = pts2[2 * ((j + 1) % 4)]
    D[1] = pts2[2 * ((j + 1) % 4) + 1]
    BA0 = B[0] - A[0]
    BA1 = B[1] - A[1]
    DA0 = D[0] - A[0]
    CA0 = C[0] - A[0]
    DA1 = D[1] - A[1]
    CA1 = C[1] - A[1]
    # Sign of the cross product (C - A) x (D - A), resp. (C - B) x (D - B):
    # A and B must lie on different sides of line CD ...
    acd = DA1 * CA0 > CA1 * DA0
    bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0])
    if acd != bcd:
        # ... and C and D on different sides of line AB.
        abc = CA1 * BA0 > BA1 * CA0
        abd = DA1 * BA0 > BA1 * DA0
        if abc != abd:
            # Solve the two line equations; DH is the common denominator
            # (cross product of the two edge directions).
            DC0 = D[0] - C[0]
            DC1 = D[1] - C[1]
            ABBA = A[0] * B[1] - B[0] * A[1]
            CDDC = C[0] * D[1] - D[0] * C[1]
            DH = BA1 * DC0 - BA0 * DC1
            Dx = ABBA * DC0 - BA0 * CDDC
            Dy = ABBA * DC1 - BA1 * CDDC
            temp_pts[0] = Dx / DH
            temp_pts[1] = Dy / DH
            return True
    return False
@cuda.jit('(float32, float32, float32[:])', device=True, inline=True)
def point_in_quadrilateral(pt_x, pt_y, corners):
    """Test whether point P = (pt_x, pt_y) lies inside a quadrilateral.

    Args:
        pt_x: x coordinate of the point.
        pt_y: y coordinate of the point.
        corners: flat corner array x0, y0, x1, y1, ...; corner 0 is used as
            A, corner 1 as B and corner 3 as D.

    Returns:
        True when the projection of AP onto AB lies in [0, |AB|^2] and the
        projection of AP onto AD lies in [0, |AD|^2] (bounds inclusive).

    NOTE(review): this projection test is only an exact containment test
    when AB and AD are perpendicular. The corners built by
    ``rbbox_to_corners`` in this file are rotated rectangles, which
    presumably is the only intended input -- confirm before reusing it for
    general quadrilaterals.
    """
    # Edge vectors AB and AD, and the vector from A to the query point.
    ab0 = corners[2] - corners[0]
    ab1 = corners[3] - corners[1]

    ad0 = corners[6] - corners[0]
    ad1 = corners[7] - corners[1]

    ap0 = pt_x - corners[0]
    ap1 = pt_y - corners[1]

    # Dot products: squared edge lengths and projections of AP.
    abab = ab0 * ab0 + ab1 * ab1
    abap = ab0 * ap0 + ab1 * ap1
    adad = ad0 * ad0 + ad1 * ad1
    adap = ad0 * ap0 + ad1 * ap1

    return abab >= abap and abap >= 0 and adad >= adap and adap >= 0
@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True)
def devRotateIoUEval(rbox1, rbox2, criterion=-1):
    """Overlap measure between two rotated boxes (device function).

    Args:
        rbox1: rotated box; elements 2 and 3 are multiplied to get its area
            and the whole box is passed to ``inter``.
        rbox2: second rotated box, same layout.
        criterion: selects the normalisation of the intersection area:
            -1 -> intersection / union (IoU),
             0 -> intersection / area of ``rbox1``,
             1 -> intersection / area of ``rbox2``,
             2 -> raw intersection area.

    Returns:
        The selected overlap value. No branch handles any other
        ``criterion`` value, so callers must pass one of the four above.
    """
    area1 = rbox1[2] * rbox1[3]
    area2 = rbox2[2] * rbox2[3]
    area_inter = inter(rbox1, rbox2)
    if criterion == -1:
        return area_inter / (area1 + area2 - area_inter)
    elif criterion == 0:
        return area_inter / area1
    elif criterion == 1:
        return area_inter / area2
    elif criterion == 2:
        return area_inter
dev_query_boxes[dev_query_box_idx * 5 + 1] block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] if (tx < row_size): block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] cuda.syncthreads() tmp_boxes = cuda.local.array(shape=(5,), dtype=numba.float32) tmp_qboxes = cuda.local.array(shape=(5,), dtype=numba.float32) if tx < row_size: for i in range(col_size): offset = row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i tmp_boxes[0] = block_boxes[tx * 5] tmp_boxes[1] = block_boxes[tx * 5 + 1] tmp_boxes[2] = block_boxes[tx * 5 + 2] tmp_boxes[3] = block_boxes[tx * 5 + 3] tmp_boxes[4] = block_boxes[tx * 5 + 4] tmp_qboxes[0] = block_qboxes[i * 5] tmp_qboxes[1] = block_qboxes[i * 5 + 1] tmp_qboxes[2] = block_qboxes[i * 5 + 2] tmp_qboxes[3] = block_qboxes[i * 5 + 3] tmp_qboxes[4] = block_qboxes[i * 5 + 4] tmp_criterion = criterion if criterion == 3 or criterion == 4 or criterion == 5 or \ criterion == 9 or criterion == 10 or criterion == 11 or criterion == 12 or criterion == 18: tmp_criterion = -1 elif criterion == 6 or criterion == 7 or criterion == 8 or \ criterion == 13 or criterion == 14 or criterion == 15 or criterion == 16 or criterion == 19: tmp_criterion = 2 if criterion == 3 or criterion == 6: tmp_qboxes[0] = tmp_boxes[0] elif criterion == 5 or criterion == 8: tmp_qboxes[1] = tmp_boxes[1] elif criterion == 9 or criterion == 13: tmp_qboxes[2] = tmp_boxes[2] elif criterion == 11 or criterion == 15: tmp_qboxes[3] = tmp_boxes[3] elif criterion == 12 or criterion == 16: tmp_qboxes[4] = tmp_boxes[4] elif criterion == 18 or criterion == 19: # it's suppose not to fix (x, 
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Pairwise rotated-box overlap computed on the GPU with numba.cuda.

    Converted from [this project](
        https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
    The original author reports it to be far faster than a CPU version.

    Args:
        boxes (float array: [N, 5]): rotated boxes. format: centers, dims,
            angles(clockwise when positive)
        query_boxes (float array: [K, 5]): rotated boxes, same format.
        criterion (int, optional): Defaults to -1. Forwarded to
            ``rotate_iou_kernel_eval``: -1 gives IoU, 0 / 1 normalise the
            intersection by the area of the first / second box, 2 gives the
            raw intersection area; the kernel additionally remaps the codes
            3..19 onto those modes after copying selected fields from the
            box to the query box.
        device_id (int, optional): Defaults to 0. CUDA device selected with
            ``cuda.select_device`` before launching.

    Returns:
        float32 array of shape [N, K]; all zeros (and no GPU work) when
        either input is empty.
    """
    # NOTE(review): box_dtype is never read afterwards.
    box_dtype = boxes.dtype
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        return iou
    # Must match the 64-box tile size hard-coded inside the kernel.
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    # One block per 64 x 64 tile of the N x K result.
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))

    stream = cuda.stream()
    # Uploads, launch and download are queued on one stream; leaving the
    # ``with`` block synchronises it, so ``iou`` is complete afterwards.
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    # ``boxes`` was rebound to its float32 copy above, so this cast keeps
    # float32 rather than restoring the caller's dtype.
    return iou.astype(boxes.dtype)
import torch from .batch_norm import FrozenBatchNorm2d from .misc import Conv2d from .misc import ConvTranspose2d from .misc import BatchNorm2d from .misc import interpolate from .nms import nms from .roi_align import ROIAlign from .roi_align import roi_align from .roi_pool import ROIPool from .roi_pool import roi_pool from .smooth_l1_loss import smooth_l1_loss, l1_loss, l2_loss, ordinal_loss, dorn_encode, dorn_decode, bce_loss from .sigmoid_focal_loss import SigmoidFocalLoss from .iou_loss import IOULoss from .scale import Scale, ScaleShift from .build_cost_volume import BuildCostVolume __all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss", "IOULoss", "Scale", "BuildCostVolume"] ================================================ FILE: disparity/layers/_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
def _load_C_extensions():
    """JIT-build and load the C++/CUDA extension found in ``../csrc``.

    The top-level ``*.cpp`` files and the ``cpu/*.cpp`` files are always
    compiled. The ``cuda/*.cu`` files are added, together with the
    ``-DWITH_CUDA`` flag, only when a CUDA device is available and
    ``CUDA_HOME`` is set.

    Returns:
        The module object returned by ``torch.utils.cpp_extension.load``.
    """
    package_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    csrc_dir = os.path.join(package_root, "csrc")

    def _find(*pattern):
        # Glob relative to the csrc directory.
        return glob.glob(os.path.join(csrc_dir, *pattern))

    entry_points = _find("*.cpp")
    cpu_sources = _find("cpu", "*.cpp")
    cuda_sources = _find("cuda", "*.cu")

    sources = entry_points + cpu_sources
    cflags = []
    with_cuda = torch.cuda.is_available() and CUDA_HOME is not None
    if with_cuda:
        sources.extend(cuda_sources)
        cflags = ["-DWITH_CUDA"]

    sources = [os.path.join(csrc_dir, path) for path in sources]
    include_dirs = [csrc_dir]

    return load_ext(
        "torchvision",
        sources,
        extra_cflags=cflags,
        extra_include_paths=include_dirs,
    )
class FrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d with constant statistics and affine terms.

    ``weight``, ``bias``, ``running_mean`` and ``running_var`` are buffers,
    not parameters, so the optimizer never updates them. No epsilon is added
    to the variance.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        # (buffer name, initialiser), kept in the original registration order.
        buffers = (
            ("weight", torch.ones),
            ("bias", torch.zeros),
            ("running_mean", torch.zeros),
            ("running_var", torch.ones),
        )
        for name, make in buffers:
            self.register_buffer(name, make(n))

    def forward(self, x):
        # Fold normalisation and affine transform into one per-channel
        # multiply-add.
        gain = self.weight * self.running_var.rsqrt()
        offset = self.bias - self.running_mean * gain
        return x * gain.reshape(1, -1, 1, 1) + offset.reshape(1, -1, 1, 1)
class IOULoss(nn.Module):
    """Negative log IoU loss on boxes given as four per-side extents.

    Columns 0..3 of ``pred`` and ``target`` are read as left, top, right
    and bottom; box width is ``left + right`` and height ``top + bottom``
    (presumably distances from a shared reference point -- confirm against
    the caller).
    """

    def forward(self, pred, target, weight=None):
        """Return the (optionally weighted) mean of ``-log(IoU)``.

        One is added to both intersection and union before the ratio.
        When ``weight`` is given and sums to a positive value the result is
        the weighted average; otherwise the plain mean over a non-empty
        batch.
        """
        p_left, p_top, p_right, p_bottom = (pred[:, k] for k in range(4))
        t_left, t_top, t_right, t_bottom = (target[:, k] for k in range(4))

        t_area = (t_left + t_right) * (t_top + t_bottom)
        p_area = (p_left + p_right) * (p_top + p_bottom)

        # Overlap extents: per-side minimum of prediction and target.
        overlap_w = torch.min(p_left, t_left) + torch.min(p_right, t_right)
        overlap_h = torch.min(p_bottom, t_bottom) + torch.min(p_top, t_top)

        inter_area = overlap_w * overlap_h
        union_area = t_area + p_area - inter_area

        losses = -torch.log((inter_area + 1.0) / (union_area + 1.0))

        use_weight = weight is not None and weight.sum() > 0
        if not use_weight:
            assert losses.numel() != 0
            return losses.mean()
        return (losses * weight).sum() / weight.sum()
class ConvTranspose2d(torch.nn.ConvTranspose2d):
    """``torch.nn.ConvTranspose2d`` that also accepts empty inputs.

    Non-empty inputs go through the regular implementation. For an input
    with zero elements the output shape is computed analytically and an
    empty tensor of that shape is returned through ``_NewEmptyTensorOp``.
    """

    def forward(self, x):
        if x.numel() > 0:
            return super(ConvTranspose2d, self).forward(x)
        # get output shape
        spatial_shape = [
            (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op
            for i, p, di, k, d, op in zip(
                x.shape[-2:],
                self.padding,
                self.dilation,
                self.kernel_size,
                self.stride,
                self.output_padding,
            )
        ]
        # ``self.bias`` is None when the layer is built with bias=False, so
        # take the channel count from ``out_channels`` instead.
        output_shape = [x.shape[0], self.out_channels] + spatial_shape
        return _NewEmptyTensorOp.apply(x, output_shape)
isinstance(scale_factor, tuple) and len(scale_factor) != dim ): raise ValueError( "scale_factor shape must match input shape. " "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) ) def _output_size(dim): _check_size_scale_factor(dim) if size is not None: return size scale_factors = _ntuple(dim)(scale_factor) # math.floor might return float in py2.7 return [ int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) ] output_shape = tuple(_output_size(2)) output_shape = input.shape[:-2] + output_shape return _NewEmptyTensorOp.apply(input, output_shape) ================================================ FILE: disparity/layers/nms.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # from ._utils import _C from dsgn import _C nms = _C.nms # nms.__doc__ = """ # This function performs Non-maximum suppresion""" ================================================ FILE: disparity/layers/roi_align.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
class _ROIAlign(Function):
    """Autograd wrapper around the compiled ROIAlign forward/backward ops."""

    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
        """Run ``_C.roi_align_forward`` and stash what backward needs.

        ``output_size`` may be a single int or a (height, width) pair.
        """
        ctx.save_for_backward(roi)
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        ctx.input_shape = input.size()
        # Index the normalised pair: indexing the raw argument fails when
        # ``output_size`` is passed as an int.
        pooled_h, pooled_w = ctx.output_size
        output = _C.roi_align_forward(
            input, roi, spatial_scale, pooled_h, pooled_w, sampling_ratio
        )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Gradient w.r.t. ``input`` only; every other argument gets None."""
        rois, = ctx.saved_tensors
        output_size = ctx.output_size
        spatial_scale = ctx.spatial_scale
        sampling_ratio = ctx.sampling_ratio
        bs, ch, h, w = ctx.input_shape
        grad_input = _C.roi_align_backward(
            grad_output,
            rois,
            spatial_scale,
            output_size[0],
            output_size[1],
            bs,
            ch,
            h,
            w,
            sampling_ratio,
        )
        return grad_input, None, None, None, None
class _ROIPool(Function):
    """Autograd wrapper around the compiled ROIPool forward/backward ops."""

    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale):
        """Run ``_C.roi_pool_forward`` and stash what backward needs.

        ``output_size`` may be a single int or a (height, width) pair.
        """
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.input_shape = input.size()
        # Index the normalised pair: indexing the raw argument fails when
        # ``output_size`` is passed as an int.
        pooled_h, pooled_w = ctx.output_size
        output, argmax = _C.roi_pool_forward(
            input, roi, spatial_scale, pooled_h, pooled_w
        )
        ctx.save_for_backward(input, roi, argmax)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Gradient w.r.t. ``input`` only; every other argument gets None."""
        input, rois, argmax = ctx.saved_tensors
        output_size = ctx.output_size
        spatial_scale = ctx.spatial_scale
        bs, ch, h, w = ctx.input_shape
        grad_input = _C.roi_pool_backward(
            grad_output,
            input,
            rois,
            argmax,
            spatial_scale,
            output_size[0],
            output_size[1],
            bs,
            ch,
            h,
            w,
        )
        return grad_input, None, None, None
def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha):
    """Element-wise sigmoid focal loss in pure PyTorch.

    Args:
        logits: tensor indexed as [sample, class]; ``logits.shape[1]`` is
            the number of classes.
        targets: 1-D tensor of labels. Label ``c`` in ``1..num_classes``
            marks column ``c - 1`` as positive; label 0 makes every column
            negative; negative labels contribute zero loss.
        gamma: focusing exponent, either a plain number or a 1-element
            sequence/tensor.
        alpha: weight of the positive term (negatives get ``1 - alpha``),
            either a plain number or a 1-element sequence/tensor.

    Returns:
        Tensor with the shape of ``logits`` holding the un-reduced loss.
    """

    def _as_scalar(value):
        # Only 1-element containers used to work here, while
        # SigmoidFocalLoss hands over plain numbers. Accept both.
        try:
            return value[0]
        except (TypeError, IndexError):
            return value

    num_classes = logits.shape[1]
    gamma = _as_scalar(gamma)
    alpha = _as_scalar(alpha)
    dtype = targets.dtype
    device = targets.device
    class_range = torch.arange(
        1, num_classes + 1, dtype=dtype, device=device).unsqueeze(0)

    t = targets.unsqueeze(1)
    p = torch.sigmoid(logits)
    term1 = (1 - p) ** gamma * torch.log(p)
    term2 = p ** gamma * torch.log(1 - p)
    is_positive = (t == class_range).float()
    is_negative = ((t != class_range) & (t >= 0)).float()
    return -is_positive * term1 * alpha - is_negative * term2 * (1 - alpha)
def ordinal_loss(input, target):
    """Ordinal-regression log loss over per-bin probabilities.

    Args:
        input: tensor of shape [N, C]; entry ``[n, k]`` is used as the
            probability that sample ``n`` lies beyond bin ``k``.
        target: tensor of shape [N] with the ordinal label of each sample;
            bins ``k < target[n]`` are positives, the rest negatives.

    Returns:
        Scalar tensor: the summed negative log-likelihood divided by
        ``N * C``. ``1e-6`` is added inside every logarithm.
    """
    N, C = input.shape
    # Build the bin indices on the input's device instead of forcing CUDA,
    # so the loss also runs on CPU.
    ranges = torch.arange(C, dtype=torch.int32, device=input.device)
    mask = ranges[None, :] < target[:, None]

    # ``~mask`` replaces ``1 - mask``: subtraction is not defined for the
    # boolean tensors returned by comparisons.
    positive_term = torch.sum(torch.log(input[mask] + 1e-6))
    negative_term = torch.sum(torch.log(1. - input[~mask] + 1e-6))
    loss = -(positive_term + negative_term)

    loss = loss / N / C
    return loss
class Siamese_Tower(nn.Module):
    """Feature extractor applied to the left and the right image alike.

    Layout: 1 -> 32 channel 3x3 convolution, three ``ResNetBlock``s, three
    ``ConvolutionBlock``s built with ``stride=2``, and a closing 32 -> 32
    3x3 convolution.
    """

    def __init__(self):
        super(Siamese_Tower, self).__init__()

        self.conv_begin = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)

        # Full-resolution residual stage.
        self.residual_blocks = nn.ModuleList([
            ResNetBlock(32, 32, stride=1, downsample=None, pad=1, dilation=1)
            for _ in range(3)
        ])

        # Down-sampling stage.
        self.downsample = nn.ModuleList([
            ConvolutionBlock(
                32, 32, stride=2, downsample=None, pad=1, dilation=1)
            for _ in range(3)
        ])

        self.conv_end = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)

    def forward(self, rgb_img):
        """Return the feature map computed from ``rgb_img``."""
        features = self.conv_begin(rgb_img)
        for stage in (self.residual_blocks, self.downsample):
            for layer in stage:
                features = layer(features)
        return self.conv_end(features)
class disparityregression(nn.Module):
    """Soft-argmax over the disparity axis of a probability volume.

    Args:
        maxdisp: number of disparity hypotheses; index ``d`` gets weight
            ``float(d)``.

    The index tensor is built on the CPU and moved to the input's device
    inside ``forward``. The previous implementation called ``.cuda()`` in
    ``__init__``, which failed on machines without CUDA and always used the
    default GPU regardless of where the input lived.
    """

    def __init__(self, maxdisp):
        super(disparityregression, self).__init__()
        # Shape (1, maxdisp, 1, 1) so it broadcasts against (N, D, H, W).
        self.disp = torch.arange(maxdisp, dtype=torch.float32).view(1, maxdisp, 1, 1)

    def forward(self, x):
        """Return the expected disparity of ``x``.

        Args:
            x: tensor indexed as (batch, disparity, height, width); the code
                sums over dimension 1.

        Returns:
            Tensor with dimension 1 reduced: ``sum_d x[:, d] * d``.
        """
        # Broadcasting replaces the explicit ``repeat`` to the full input shape.
        disp = self.disp.to(x.device)
        return torch.sum(x * disp, 1)
def project_rect_to_image(pts_3d_rect, P):
    """Project 3-D points to 2-D image coordinates with a projection matrix.

    Args:
        pts_3d_rect: tensor of shape (n, 3) holding the 3-D points.
        P: 2-D projection matrix with 4 columns; the product with the
            homogeneous points is n x 3, as in the original code.

    Returns:
        Tensor of shape (n, 2): the first two projected components divided
        by the third (homogeneous) component.
    """
    n = pts_3d_rect.shape[0]
    # new_ones inherits dtype and device from the points. The previous
    # torch.ones(...).cuda() was always float32 on the default device, which
    # mismatched double/half inputs and inputs on a non-default GPU.
    ones = pts_3d_rect.new_ones((n, 1))
    pts_homogeneous = torch.cat([pts_3d_rect, ones], dim=1)
    pts_2d = torch.mm(pts_homogeneous, torch.transpose(P, 0, 1))  # nx3
    # One broadcast division instead of two in-place column updates.
    return pts_2d[:, 0:2] / pts_2d[:, 2:3]
nn.ReLU(inplace=True), nn.Conv3d(res_dim, 1, kernel_size=3, padding=1, stride=1, bias=False)) self.cat_disp = getattr(self.cfg, 'cat_disp', False) self.cat_img_feature = getattr(self.cfg, 'cat_img_feature', False) self.cat_right_img_feature = getattr(self.cfg, 'cat_right_img_feature', False) self.num_convs = getattr(self.cfg.RPN3D, 'NUM_CONVS', 4) self.num_3dconvs = getattr(self.cfg.RPN3D, 'NUM_3DCONVS', 1) assert self.num_3dconvs > 0 RPN3D_INPUT_DIM = 0 if self.PlaneSweepVolume: RPN3D_INPUT_DIM += res_dim if self.cat_disp: RPN3D_INPUT_DIM += 1 if self.cat_img_feature: RPN3D_INPUT_DIM += self.cfg.RPN_CONVDIM if self.cat_right_img_feature: RPN3D_INPUT_DIM += self.cfg.RPN_CONVDIM if self.cfg.RPN3D_ENABLE: conv3d_dim = getattr(self.cfg, 'conv3d_dim', 64) self.rpn3d_conv = nn.Sequential(convbn_3d(RPN3D_INPUT_DIM, conv3d_dim, self.rpn3d_conv_kernel, 1, 1 if self.rpn3d_conv_kernel == 3 else 0, gn=cfg.GN), nn.ReLU(inplace=True)) if self.num_3dconvs > 1: self.rpn_3dconv1 = nn.Sequential(convbn_3d(conv3d_dim, conv3d_dim, 3, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.num_3dconvs > 2: self.rpn_3dconv2 = nn.Sequential(convbn_3d(conv3d_dim, conv3d_dim, 3, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.num_3dconvs > 3: self.rpn_3dconv3 = nn.Sequential(convbn_3d(conv3d_dim, conv3d_dim, 3, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.hg_rpn_conv3d: self.hg_rpn3d_conv = hourglass(conv3d_dim, gn=cfg.GN) self.rpn3d_pool = torch.nn.AvgPool3d((1, 4, 1), stride=(1, 4, 1)) self.rpn3d_conv2 = nn.Sequential(convbn(conv3d_dim * 5, conv3d_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if not self.hg_rpn_conv: self.rpn3d_conv3 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) else: self.rpn3d_conv3 = hourglass2d(res_dim * 2, gn=cfg.GN) self.rpn3d_cls_convs = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) self.rpn3d_bbox_convs = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 
1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.num_convs > 1: self.rpn3d_cls_convs2 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) self.rpn3d_bbox_convs2 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.num_convs > 2: self.rpn3d_cls_convs3 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) self.rpn3d_bbox_convs3 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.num_convs > 3: self.rpn3d_cls_convs4 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) self.rpn3d_bbox_convs4 = nn.Sequential(convbn(res_dim * 2, res_dim * 2, 3, 1, 1, 1, gn=cfg.GN), nn.ReLU(inplace=True)) if self.class4angles: self.bbox_cls = nn.Conv2d(res_dim * 2, self.num_angles * self.num_classes, kernel_size=3, padding=1, stride=1) else: self.bbox_cls = nn.Conv2d(res_dim * 2, self.num_classes, kernel_size=3, padding=1, stride=1) centerness_dim = 1 centerness_dim *= self.num_angles if self.centerness4class: centerness_dim *= self.num_classes self.bbox_centerness = nn.Conv2d(res_dim * 2, centerness_dim, kernel_size=3, padding=1, stride=1) self.each_angle_dim = 1 self.hwl_dim = 3 self.xyz_dim = 3 # dx,dy,dz dh,dw,dl, [s,c, cls]xnum_angles self.bbox_reg = nn.Conv2d(res_dim * 2, self.num_classes * (self.xyz_dim + self.hwl_dim + self.num_angles * self.each_angle_dim), kernel_size=3, padding=1, stride=1) self.anchor_size = torch.as_tensor([cfg.RPN3D.ANCHORS_HEIGHT, cfg.RPN3D.ANCHORS_WIDTH, cfg.RPN3D.ANCHORS_LENGTH]).transpose(1, 0) for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.Conv3d): n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. 
/ n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() elif isinstance(m, nn.BatchNorm3d): m.weight.data.fill_(1) m.bias.data.zero_() elif isinstance(m, nn.Linear): m.bias.data.zero_() if self.cfg.RPN3D_ENABLE: torch.nn.init.normal_(self.bbox_cls.weight, std=0.1) torch.nn.init.constant_(self.bbox_cls.bias, 0) torch.nn.init.normal_(self.bbox_centerness.weight, std=0.1) torch.nn.init.constant_(self.bbox_centerness.bias, 0) torch.nn.init.normal_(self.bbox_reg.weight, std=0.02) torch.nn.init.constant_(self.bbox_reg.bias, 0) prior_prob = cfg.RPN3D.PRIOR_PROB bias_value = -math.log((1 - prior_prob) / prior_prob) torch.nn.init.constant_(self.bbox_cls.bias, bias_value) default_baseline = 0.54 default_fu = 721.5377 default_scale = default_baseline * default_fu self.default_scale = default_scale affine_mat = torch.as_tensor([[[1., 0., 0.], [0., 1., 0.]]]) affine_mat = affine_mat.repeat(self.maxdisp // self.downsample_disp, 1, 1) for i in range(self.maxdisp // self.downsample_disp): depth = ((i + 0.5) * self.downsample_disp + self.cfg.depth_min_intervals) * self.cfg.depth_interval affine_mat[self.maxdisp // self.downsample_disp - 1 - i, 0, 2] = default_scale / depth / self.downsample_disp self.affine_mat = affine_mat # depth: 2.0 -> 40.2 # interval 0.2m # disp : about 194.8 -> 9.69 depth = torch.zeros((self.maxdisp)) for i in range(self.maxdisp): depth[self.maxdisp - 1 - i] = (i+self.cfg.depth_min_intervals) * self.cfg.depth_interval self.depth = depth self.dispregression = disparityregression(self.maxdisp, cfg=self.cfg) self.CV_X_MIN, self.CV_Y_MIN, self.CV_Z_MIN = cfg.CV_X_MIN, cfg.CV_Y_MIN, cfg.CV_Z_MIN self.CV_X_MAX, self.CV_Y_MAX, self.CV_Z_MAX = cfg.CV_X_MAX, cfg.CV_Y_MAX, cfg.CV_Z_MAX self.X_MIN, self.Y_MIN, self.Z_MIN = cfg.X_MIN, cfg.Y_MIN, cfg.Z_MIN self.X_MAX, self.Y_MAX, self.Z_MAX = cfg.X_MAX, cfg.Y_MAX, cfg.Z_MAX self.VOXEL_X_SIZE, self.VOXEL_Y_SIZE, self.VOXEL_Z_SIZE = cfg.VOXEL_X_SIZE, cfg.VOXEL_Y_SIZE, cfg.VOXEL_Z_SIZE 
self.GRID_SIZE = cfg.GRID_SIZE zs = torch.arange(self.Z_MIN, self.Z_MAX, self.VOXEL_Z_SIZE) + self.VOXEL_Z_SIZE / 2. ys = torch.arange(self.Y_MIN, self.Y_MAX, self.VOXEL_Y_SIZE) + self.VOXEL_Y_SIZE / 2. xs = torch.arange(self.X_MIN, self.X_MAX, self.VOXEL_X_SIZE) + self.VOXEL_X_SIZE / 2. zs, ys, xs = torch.meshgrid(zs, ys, xs) coord_rect = torch.stack([xs, ys, zs], dim=-1) self.coord_rect = coord_rect def forward(self, left, right, calibs_fu, calibs_baseline, calibs_Proj, calibs_Proj_R=None): N = left.shape[0] refimg_fea, left_rpn_feature = self.feature_extraction(left) targetimg_fea, right_rpn_feature = self.feature_extraction(right) outputs = dict() if self.PlaneSweepVolume: affine_mat = self.affine_mat.cuda().clone().unsqueeze(0).repeat(N, 1, 1, 1) affine_mat[:, :, 0, 2] = affine_mat[:, :, 0, 2] * calibs_fu[:,None].cuda().float() * calibs_baseline[:,None].cuda().float() / self.default_scale cost = self.build_cost(refimg_fea, targetimg_fea, affine_mat[:,:,0,2]) cost = cost.contiguous() if not self.hg_firstconv: cost0 = self.dres0(cost) cost0 = self.dres1(cost0) + cost0 else: out0, pre0, post0 = self.dres0(cost, None, None) cost0 = out0 if self.hg_cv: out1, pre1, post1 = self.dres2(cost0, None, None) out1 = out1 + cost0 if self.loss_disp: cost1 = self.classif1(out1) else: cost1 = None out, cost = out1, cost1 else: out0 = cost0 if self.loss_disp: cost0 = self.classif1(out0) else: cost0 = None out, cost = out0, cost0 outputs['depth_preds'] = [] if self.PlaneSweepVolume and self.loss_disp: if self.hg_cv: cost1 = F.upsample(cost1, [self.maxdisp, left.size()[2], left.size()[3]], mode='trilinear', align_corners=self.cfg.align_corners) cost1 = torch.squeeze(cost1, 1) pred1_softmax = F.softmax(cost1, dim=1) pred1 = self.dispregression(pred1_softmax, depth=self.depth.cuda()) if self.training: outputs['depth_preds'].append( pred1 ) else: outputs['depth_preds'] = pred1 else: cost0 = F.upsample(cost0, [self.maxdisp, left.size()[2], left.size()[3]], mode='trilinear', 
align_corners=self.cfg.align_corners) cost0 = torch.squeeze(cost0, 1) pred1_softmax = F.softmax(cost0, dim=1) pred1 = self.dispregression(pred1_softmax, depth=self.depth.cuda()) if self.training: outputs['depth_preds'].append( pred1 ) else: outputs['depth_preds'] = pred1 if self.cfg.RPN3D_ENABLE: coord_rect = self.coord_rect.cuda() norm_coord_imgs = [] for i in range(N): coord_img = torch.as_tensor( project_rect_to_image( coord_rect.reshape(-1, 3), calibs_Proj[i].float().cuda() ).reshape(*self.coord_rect.shape[:3], 2), dtype=torch.float32) coord_img = torch.cat([coord_img, self.coord_rect[..., 2:]], dim=-1) norm_coord_img = (coord_img - torch.as_tensor([self.CV_X_MIN, self.CV_Y_MIN, self.CV_Z_MIN])[None, None, None, :]) / \ (torch.as_tensor([self.CV_X_MAX, self.CV_Y_MAX, self.CV_Z_MAX]) - torch.as_tensor([self.CV_X_MIN, self.CV_Y_MIN, self.CV_Z_MIN]))[None, None, None, :] norm_coord_img = norm_coord_img * 2. - 1. norm_coord_imgs.append(norm_coord_img) norm_coord_imgs = torch.stack(norm_coord_imgs, dim=0) norm_coord_imgs = norm_coord_imgs.cuda() outputs['norm_coord_imgs'] = norm_coord_imgs outputs['coord_rect'] = coord_rect valids = (norm_coord_imgs[..., 0] >= -1.) & (norm_coord_imgs[..., 0] <= 1.) & \ (norm_coord_imgs[..., 1] >= -1.) & (norm_coord_imgs[..., 1] <= 1.) & \ (norm_coord_imgs[..., 2] >= -1.) & (norm_coord_imgs[..., 2] <= 1.) 
outputs['valids'] = valids valids = valids.float() if self.PlaneSweepVolume: # Retrieve Voxel Feature from Cost Volume Feature if self.cat_disp: CV_feature = torch.cat([out, cost.detach()], dim= 1) else: CV_feature = out Voxel = F.grid_sample(CV_feature, norm_coord_imgs) Voxel = Voxel * valids[:, None, :, :, :] if (self.voxel_attentionbydisp or (self.img_feature_attentionbydisp and self.cat_img_feature)): pred_disp = F.grid_sample(pred1_softmax.detach()[:, None], norm_coord_imgs) pred_disp = pred_disp * valids[:, None, :, :, :] if self.voxel_attentionbydisp: Voxel = Voxel * pred_disp else: Voxel = None # Retrieve Voxel Feature from 2D Img Feature if self.cat_img_feature: RPN_feature = left_rpn_feature valids = (norm_coord_imgs[..., 0] >= -1.) & (norm_coord_imgs[..., 0] <= 1.) & \ (norm_coord_imgs[..., 1] >= -1.) & (norm_coord_imgs[..., 1] <= 1.) valids = valids.float() Voxel_2D = [] for i in range(N): RPN_feature_per_im = RPN_feature[i:i+1] for j in range(len(norm_coord_imgs[i])): Voxel_2D_feature = F.grid_sample(RPN_feature_per_im, norm_coord_imgs[i, j:j+1, :, :, :2]) Voxel_2D.append(Voxel_2D_feature) Voxel_2D = torch.cat(Voxel_2D, dim=0) Voxel_2D = Voxel_2D.reshape(N, self.GRID_SIZE[0], -1, self.GRID_SIZE[1], self.GRID_SIZE[2]).transpose(1,2) Voxel_2D = Voxel_2D * valids[:, None, :, :, :] if self.img_feature_attentionbydisp: Voxel_2D = Voxel_2D * pred_disp if Voxel is not None: Voxel = torch.cat([Voxel, Voxel_2D], dim=1) else: Voxel = Voxel_2D if self.cat_right_img_feature: RPN_feature = right_rpn_feature norm_coord_right_imgs = [] for i in range(N): coord_right_img = torch.as_tensor( project_rect_to_image( coord_rect.reshape(-1, 3), calibs_Proj_R[i].float().cuda() ).reshape(*self.coord_rect.shape[:3], 2), dtype=torch.float32) coord_right_img = torch.cat([coord_right_img, self.coord_rect[..., 2:]], dim=-1) norm_coord_img = (coord_right_img - torch.as_tensor([self.CV_X_MIN, self.CV_Y_MIN, self.CV_Z_MIN])[None, None, None, :]) / \ (torch.as_tensor([self.CV_X_MAX, 
self.CV_Y_MAX, self.CV_Z_MAX]) - torch.as_tensor([self.CV_X_MIN, self.CV_Y_MIN, self.CV_Z_MIN]))[None, None, None, :] norm_coord_img = norm_coord_img * 2. - 1. norm_coord_right_imgs.append(norm_coord_img) norm_coord_right_imgs = torch.stack(norm_coord_right_imgs, dim=0) norm_coord_right_imgs = norm_coord_right_imgs.cuda() valids_R = (norm_coord_right_imgs[..., 0] >= -1.) & (norm_coord_right_imgs[..., 0] <= 1.) & \ (norm_coord_right_imgs[..., 1] >= -1.) & (norm_coord_right_imgs[..., 1] <= 1.) valids_R = valids_R.float() Voxel_2D_R = [] for i in range(N): RPN_feature_per_im = RPN_feature[i:i+1] for j in range(len(norm_coord_right_imgs[i])): Voxel_2D_feature = F.grid_sample(RPN_feature_per_im, norm_coord_right_imgs[i, j:j+1, :, :, :2]) Voxel_2D_R.append(Voxel_2D_feature) Voxel_2D_R = torch.cat(Voxel_2D_R, dim=0) Voxel_2D_R = Voxel_2D_R.reshape(N, self.GRID_SIZE[0], -1, self.GRID_SIZE[1], self.GRID_SIZE[2]).transpose(1,2) Voxel_2D_R = Voxel_2D_R * valids_R[:, None, :, :, :] if self.img_feature_attentionbydisp: Voxel_2D_R = Voxel_2D_R * pred_disp if Voxel is not None: Voxel = torch.cat([Voxel, Voxel_2D_R], dim=1) else: Voxel = Voxel_2D_R # (64, 190, 20, 300) Voxel = self.rpn3d_conv(Voxel) # (64, 190, 20, 300) if self.num_3dconvs > 1: Voxel = self.rpn_3dconv1(Voxel) if self.num_3dconvs > 2: Voxel = self.rpn_3dconv2(Voxel) if self.num_3dconvs > 3: Voxel = self.rpn_3dconv3(Voxel) if self.hg_rpn_conv3d: Voxel1, pre_Voxel, post_Voxel = self.hg_rpn3d_conv(Voxel, None, None) Voxel = Voxel1 + Voxel Voxel = self.rpn3d_pool(Voxel) # (64, 190, 5, 300) Voxel = Voxel.permute(0, 1, 3, 2, 4).reshape(N, -1, self.GRID_SIZE[0], self.GRID_SIZE[2]).contiguous() Voxel_BEV = self.rpn3d_conv2(Voxel) if not self.hg_rpn_conv: Voxel_BEV = self.rpn3d_conv3(Voxel_BEV) else: Voxel_BEV1, pre_BEV, post_BEV = self.rpn3d_conv3(Voxel_BEV, None, None) Voxel_BEV = Voxel_BEV1 # some bug Voxel_BEV_cls = self.rpn3d_cls_convs(Voxel_BEV) Voxel_BEV_bbox = self.rpn3d_bbox_convs(Voxel_BEV) if self.num_convs > 1: 
Voxel_BEV_cls = self.rpn3d_cls_convs2(Voxel_BEV_cls) Voxel_BEV_bbox = self.rpn3d_bbox_convs2(Voxel_BEV_bbox) if self.num_convs > 2: Voxel_BEV_cls = self.rpn3d_cls_convs3(Voxel_BEV_cls) Voxel_BEV_bbox = self.rpn3d_bbox_convs3(Voxel_BEV_bbox) if self.num_convs > 3: Voxel_BEV_cls = self.rpn3d_cls_convs4(Voxel_BEV_cls) Voxel_BEV_bbox = self.rpn3d_bbox_convs4(Voxel_BEV_bbox) bbox_cls = self.bbox_cls(Voxel_BEV_cls) if not self.fix_centerness_bug: bbox_reg = self.bbox_reg(Voxel_BEV_cls) bbox_centerness = self.bbox_centerness(Voxel_BEV_bbox) else: bbox_reg = self.bbox_reg(Voxel_BEV_bbox) bbox_centerness = self.bbox_centerness(Voxel_BEV_bbox) # dx, dy, h, w, l, q1, q2, q3, q4, dz N, C, H, W = bbox_reg.shape dxyz, dhwl, angle_reg = torch.split(bbox_reg.reshape(N, self.num_classes, C // self.num_classes, H, W), \ [self.xyz_dim, self.hwl_dim, self.each_angle_dim * self.num_angles], dim=2) # angle / orientation angle_reg = angle_reg.permute(0, 3, 4, 2, 1).reshape(-1, self.each_angle_dim * self.num_angles, self.num_classes) angle_range = np.pi * 2 / self.num_angles q = angle_reg.tanh() * angle_range / 2. 
q = q + self.anchor_angles.cuda()[None, :, None] sin_d, cos_d = torch.sin(q), torch.cos(q) # XYZ dxyz = dxyz[:, None, :].repeat(1, self.num_angles, 1, 1, 1, 1) dhwl = dhwl.permute(0, 3, 4, 1, 2).reshape(-1, self.num_classes, self.hwl_dim) dhwl = dhwl[:, None, :, :].repeat(1, self.num_angles, 1, 1) hwl = self.anchor_size.cuda().reshape(1, 1, self.num_classes, 3) * torch.exp(dhwl) hwl = hwl.reshape(-1, self.num_angles, self.num_classes, 3) if not self.box_corner_parameters: hwl = hwl.reshape(N, H, W, self.num_angles, self.num_classes, 3) hwl = hwl.permute(0, 3, 4, 5, 1, 2) q = q.reshape(N, H, W, self.num_angles, self.num_classes) q = q.permute(0, 3, 4, 1, 2) # N, num_angles, num_classes, dim, H, W bbox_reg = torch.cat( [dxyz, hwl, q[:, :, :, None]], dim=3) bbox_reg = bbox_reg.reshape(N, self.num_angles * self.num_classes * 7, H, W) else: box_corners = compute_corners_sc( hwl.reshape(-1, 3), sin_d.reshape(-1), cos_d.reshape(-1) ).reshape(N, H, W, self.num_angles, self.num_classes, 3, 8) box_corners[:, :, :, :, :, 1, :] += hwl.reshape(N, H, W, self.num_angles, self.num_classes, 3)[:, :, :, :, :, 0:1] / 2. box_corners = box_corners.permute(0, 3, 4, 6, 5, 1, 2) # (N, num_classes, num_angles, 8, 3, H, W) # (N, num_classes, num_angles, ) bbox_reg = box_corners + dxyz[:, :, :, None] bbox_reg = bbox_reg.reshape(N, self.num_angles * self.num_classes * 24, H, W) outputs['bbox_cls'] = bbox_cls outputs['bbox_reg'] = bbox_reg outputs['bbox_centerness'] = bbox_centerness return outputs ================================================ FILE: disparity/models/stereonet_disp.py ================================================ # ------------------------------------------------------------------------------ # Copyright (c) NKU # Licensed under the MIT License. 
def convbn(in_channel, out_channel, kernel_size, stride, pad, dilation):
    """Build a ``Conv2d`` followed by ``BatchNorm2d``.

    Args:
        in_channel: number of input channels.
        out_channel: number of output channels (also the BatchNorm width).
        kernel_size: convolution kernel size.
        stride: convolution stride.
        pad: padding used when ``dilation`` is 1 or less.
        dilation: convolution dilation; when greater than 1 it is also used
            as the padding, overriding ``pad``.

    Returns:
        ``nn.Sequential(conv, batch_norm)``.
    """
    if dilation > 1:
        padding = dilation
    else:
        padding = pad
    layers = [
        nn.Conv2d(
            in_channel,
            out_channel,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation),
        nn.BatchNorm2d(out_channel),
    ]
    return nn.Sequential(*layers)
class EdgeAwareRefinement(nn.Module):
    """Refine an upsampled disparity map using the guiding image.

    The low-resolution disparity is bilinearly resized to the image size,
    concatenated with the image along the channel axis, passed through a
    conv+BN+LeakyReLU stem and six ``BasicBlock`` stages with dilations
    (1, 2, 4, 8, 1, 1), and projected to a one-channel residual that is
    added back to the upsampled disparity.

    Args:
        in_channel: channel count of the concatenated input (disparity
            channel plus image channels).
    """

    def __init__(self, in_channel):
        super().__init__()
        stem = convbn(in_channel, 32, kernel_size=3, stride=1, pad=1, dilation=1)
        self.conv2d_feature = nn.Sequential(
            stem, nn.LeakyReLU(negative_slope=0.2, inplace=True))
        self.residual_astrous_blocks = nn.ModuleList([
            BasicBlock(32, 32, stride=1, downsample=None, pad=1, dilation=rate)
            for rate in (1, 2, 4, 8, 1, 1)
        ])
        self.conv2d_out = nn.Conv2d(32, 1, kernel_size=3, stride=1, padding=1)

    def forward(self, low_disparity, corresponding_rgb):
        """Return the refined, non-negative disparity at image resolution.

        Args:
            low_disparity: disparity without a channel axis; a channel axis
                is inserted at dim 1 before resizing.
            corresponding_rgb: guiding image; its last two dims set the
                output size.
        """
        target_size = corresponding_rgb.size()[-2:]
        upsampled = F.interpolate(
            low_disparity.unsqueeze(1),
            size=target_size,
            mode='bilinear',
            align_corners=False)
        width_ratio = corresponding_rgb.size()[-1] / low_disparity.size()[-1]
        if width_ratio >= 1.5:
            # NOTE(review): the factor is fixed at 8 whatever the actual
            # width ratio is -- confirm this is intended when k != 3.
            upsampled.mul_(8)
        features = self.conv2d_feature(
            torch.cat([upsampled, corresponding_rgb], dim=1))
        for block in self.residual_astrous_blocks:
            features = block(features)
        residual = self.conv2d_out(features)
        return F.relu(torch.squeeze(upsampled + residual, dim=1), inplace=True)
class StereoNet(nn.Module):
    """Stereo disparity network: features -> cost volume -> refinement.

    Args:
        k: number of stride-2 downsampling steps in the feature extractor;
            the cost volume holds ``(maxdisp + 1) // 2**k`` disparity
            hypotheses.
        r: stored on the instance; not read anywhere in this class.
        maxdisp: maximum disparity at full resolution.

    ``forward`` allocates every temporary on the device of the extracted
    features. The previous version used ``torch.FloatTensor(...).cuda()``
    and built a CUDA-only ``disparityregression`` module on each call, so
    the model could not run on CPU or on a non-default GPU.
    """

    def __init__(self, k=3, r=3, maxdisp=192):
        super().__init__()
        self.maxdisp = maxdisp
        self.k = k
        self.r = r
        self.feature_extraction = FeatureExtraction(k)
        self.filter = nn.ModuleList()
        for _ in range(4):
            self.filter.append(
                nn.Sequential(
                    convbn_3d(32, 32, kernel_size=3, stride=1, pad=1),
                    nn.LeakyReLU(negative_slope=0.2, inplace=True)))
        self.conv3d_alone = nn.Conv3d(
            32, 1, kernel_size=3, stride=1, padding=1)
        self.edge_aware_refinements = nn.ModuleList([EdgeAwareRefinement(4)])

    def forward(self, left, right):
        """Predict disparity for the left image.

        Args:
            left: left image batch; also used as the refinement guide.
            right: right image batch.

        Returns:
            A two-element list ``[coarse, refined]``. ``coarse`` is the
            low-resolution soft-argmax disparity scaled by the width ratio
            and bilinearly resized to ``left``'s spatial size. ``refined``
            is the output of the edge-aware refinement module.
        """
        disp = (self.maxdisp + 1) // pow(2, self.k)
        refimg_feature = self.feature_extraction(left)
        targetimg_feature = self.feature_extraction(right)

        # Difference-based cost volume, allocated alongside the features so
        # it inherits their device and dtype.
        batch, channels, height, width = refimg_feature.size()
        cost = refimg_feature.new_zeros((batch, channels, disp, height, width))
        for i in range(disp):
            if i > 0:
                # Shift the right features by i pixels; columns with no
                # counterpart stay zero.
                cost[:, :, i, :, i:] = (
                    refimg_feature[:, :, :, i:] - targetimg_feature[:, :, :, :-i])
            else:
                cost[:, :, i, :, :] = refimg_feature - targetimg_feature

        for f in self.filter:
            cost = f(cost)
        cost = torch.squeeze(self.conv3d_alone(cost), 1)

        # Soft-argmax over the disparity axis.
        prob = F.softmax(cost, dim=1)
        disp_values = torch.arange(
            disp, dtype=prob.dtype, device=prob.device).view(1, disp, 1, 1)
        pred = torch.sum(prob * disp_values, 1)

        # The refinement consumes the unscaled low-resolution prediction.
        refined = self.edge_aware_refinements[0](pred, left)

        # Rescale disparity values by the width ratio, then resize.
        scale = left.size()[-1] / pred.size()[-1]
        coarse = torch.squeeze(
            F.interpolate(
                torch.unsqueeze(pred * scale, dim=1),
                size=left.size()[-2:],
                mode='bilinear',
                align_corners=False),
            dim=1)
        return [coarse, refined]
def convbn_3d(in_planes, out_planes, kernel_size, stride, pad, gn=False, groups=32):
    """Build a bias-free ``Conv3d`` followed by a normalization layer.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        pad: convolution padding.
        gn: when true use ``GroupNorm``; otherwise ``BatchNorm3d``.
        groups: number of groups for ``GroupNorm`` (ignored when ``gn`` is
            false).

    Returns:
        ``nn.Sequential(conv, norm)``.
    """
    conv = nn.Conv3d(
        in_planes, out_planes, kernel_size,
        stride=stride, padding=pad, bias=False)
    if gn:
        norm = nn.GroupNorm(groups, out_planes)
    else:
        norm = nn.BatchNorm3d(out_planes)
    return nn.Sequential(conv, norm)
class disparityregression(nn.Module):
    """Expectation of per-bin values under a probability volume.

    Args:
        maxdisp: number of bins; ``self.disp`` holds ``0 .. maxdisp - 1``
            as float32.
        cfg: accepted for interface compatibility; not read here.

    ``self.disp`` is never read by ``forward``. It used to be created with
    ``.cuda()``, which made the module impossible to construct on a machine
    without CUDA. It is now a plain CPU tensor, and the deprecated
    ``Variable`` wrapper is gone.
    """

    def __init__(self, maxdisp, cfg):
        super(disparityregression, self).__init__()
        self.disp = torch.arange(maxdisp, dtype=torch.float32)

    def forward(self, x, depth):
        """Return ``sum_d x[:, d] * depth[d]``.

        Args:
            x: tensor indexed as (batch, bin, height, width); dimension 1
                is reduced.
            depth: 1-D tensor of per-bin values, broadcast over the batch
                and spatial dimensions.
        """
        weights = depth[None, :, None, None]
        return torch.sum(x * weights, 1)
class feature_extraction(nn.Module):
    """Image feature extractor built from a strided stem, four stages of
    ``BasicBlock`` layers and an optional average-pooling pyramid.

    Settings are read from ``cfg``. ``RPN3D_ENABLE``, ``GN`` and
    ``align_corners`` are accessed directly; ``cat_img_feature``,
    ``RPN_ONEMORE_CONV``, ``RPN_ONEMORE_DIM``, ``img_feature_relu``,
    ``branch``, ``backbone`` and ``PlaneSweepVolume`` fall back to defaults
    when missing. ``RPN_CONVDIM`` is only read when the RPN head is built.
    """

    def __init__(self, cfg):
        super(feature_extraction, self).__init__()

        self.cfg = cfg
        self.RPN3D_ENABLE = self.cfg.RPN3D_ENABLE
        self.cat_img_feature = getattr(self.cfg, 'cat_img_feature', False)
        self.rpn_onemore_conv = getattr(self.cfg, 'RPN_ONEMORE_CONV', False)
        self.rpn_onemore_dim = getattr(self.cfg, 'RPN_ONEMORE_DIM', 256)
        self.img_feature_relu = getattr(self.cfg, 'img_feature_relu', True)
        self.branch = getattr(self.cfg, 'branch', True)
        self.backbone = getattr(self.cfg, 'backbone', 'reslike-det-small')

        # (name, first_dim, dims, nr_convs, branch_dim, lastconv_dim)
        presets = (
            ('reslike-det', 64, [64, 128, 192, 256], [3, 6, 12, 4], 32, [256, 32]),
            ('reslike-det-small', 64, [32, 64, 128, 192], [3, 6, 12, 4], 32, [256, 32]),
            ('reslike-det-small-fixfirst', 16, [32, 64, 128, 192], [3, 6, 12, 4], 32, [256, 32]),
            ('reslike50-det-small-fixfirst', 16, [32, 64, 128, 256], [3, 4, 6, 3], 32, [256, 32]),
            ('reslike50-det-tiny', 8, [16, 32, 64, 128], [3, 4, 6, 3], 32, [128, 32]),
        )
        for name, first_dim, dims, nr_convs, branch_dim, lastconv_dim in presets:
            if self.backbone == name:
                break
        else:
            raise ValueError('Invalid backbone {}.'.format(self.backbone))

        self.inplanes = first_dim

        # Stem: three 3x3 conv+norm+ReLU units, the first one with stride 2.
        # Group norm is only used when the stem is at least 32 channels wide.
        stem_gn = cfg.GN if first_dim >= 32 else False
        stem = []
        for in_dim, stride in ((3, 2), (first_dim, 1), (first_dim, 1)):
            stem.append(convbn(in_dim, first_dim, 3, stride, 1, 1, gn=stem_gn))
            stem.append(nn.ReLU(inplace=True))
        self.firstconv = nn.Sequential(*stem)

        # Stages: only layer2 is strided; layer4 uses dilation 2.
        layer1_gn = cfg.GN if dims[0] >= 32 else False
        self.layer1 = self._make_layer(BasicBlock, dims[0], nr_convs[0], 1, 1, 1, gn=layer1_gn)
        self.layer2 = self._make_layer(BasicBlock, dims[1], nr_convs[1], 2, 1, 1, gn=cfg.GN)
        self.layer3 = self._make_layer(BasicBlock, dims[2], nr_convs[2], 1, 1, 1, gn=cfg.GN)
        self.layer4 = self._make_layer(BasicBlock, dims[3], nr_convs[3], 1, 1, 2, gn=cfg.GN)

        # Pooling pyramid: branch1..branch4 with windows 64, 32, 16 and 8.
        if self.branch:
            for idx, window in enumerate((64, 32, 16, 8), start=1):
                pyramid_level = nn.Sequential(
                    nn.AvgPool2d((window, window), stride=(window, window)),
                    convbn(dims[3], branch_dim, 1, 1, 0, 1, gn=cfg.GN,
                           groups=min(32, branch_dim)),
                    nn.ReLU(inplace=True))
                setattr(self, 'branch{}'.format(idx), pyramid_level)

        # Channels of the concatenation assembled in forward().
        concat_dim = dims[1] + dims[3] + dims[2]
        if self.branch:
            concat_dim += branch_dim * 4

        self.PlaneSweepVolume = getattr(cfg, 'PlaneSweepVolume', True)
        if self.PlaneSweepVolume:
            self.lastconv = nn.Sequential(
                convbn(concat_dim, lastconv_dim[0], 3, 1, 1, 1, gn=cfg.GN),
                nn.ReLU(inplace=True),
                nn.Conv2d(lastconv_dim[0], lastconv_dim[1], kernel_size=1,
                          padding=0, stride=1, bias=False))

        if self.cfg.RPN3D_ENABLE and self.cat_img_feature:
            rpnconvs = []
            rpn_in = concat_dim
            if self.rpn_onemore_conv:
                # Optional extra conv in front of the RPN projection.
                rpnconvs.append(convbn(concat_dim, self.rpn_onemore_dim, 3, 1, 1, 1, gn=cfg.GN))
                rpnconvs.append(nn.ReLU(inplace=True))
                rpn_in = self.rpn_onemore_dim
            rpn_groups = 32 if self.cfg.RPN_CONVDIM % 32 == 0 else 16
            rpnconvs.append(convbn(rpn_in, self.cfg.RPN_CONVDIM, 3, 1, 1, 1,
                                   gn=cfg.GN, groups=rpn_groups))
            if self.img_feature_relu:
                rpnconvs.append(nn.ReLU(inplace=True))
            self.rpnconv = nn.Sequential(*rpnconvs)

    def _make_layer(self, block, planes, blocks, stride, pad, dilation, gn=False):
        """Stack ``blocks`` instances of ``block`` and update ``self.inplanes``.

        The first block receives ``stride`` and, when the stride is not 1 or
        the channel count changes, a 1x1 conv + norm projection for the
        shortcut; the remaining blocks use stride 1 and no projection.
        """
        out_planes = planes * block.expansion

        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            if gn:
                shortcut_norm = nn.GroupNorm(32, out_planes)
            else:
                shortcut_norm = nn.BatchNorm2d(out_planes)
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, bias=False),
                shortcut_norm)

        stage = [block(self.inplanes, planes, stride, shortcut, pad, dilation, gn=gn)]
        self.inplanes = out_planes
        stage.extend(block(self.inplanes, planes, 1, None, pad, dilation, gn=gn)
                     for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return ``(output_feature, rpn_feature)`` for the image batch ``x``.

        ``output_feature`` is ``None`` when ``PlaneSweepVolume`` is falsy;
        ``rpn_feature`` is ``None`` unless both ``RPN3D_ENABLE`` and
        ``cat_img_feature`` are truthy.
        """
        stem_out = self.firstconv(x)
        stage1 = self.layer1(stem_out)
        output_raw = self.layer2(stage1)
        output_mid = self.layer3(output_raw)
        output_skip = self.layer4(output_mid)

        fused = [output_raw, output_mid, output_skip]
        if self.branch:
            # Pool at each scale, then resize back to output_skip's spatial size.
            target_hw = (output_skip.size()[2], output_skip.size()[3])
            pyramid = []
            for level in (self.branch1, self.branch2, self.branch3, self.branch4):
                pooled = level(output_skip)
                pyramid.append(F.interpolate(pooled, target_hw, mode='bilinear',
                                             align_corners=self.cfg.align_corners))
            # Concatenation order is branch4, branch3, branch2, branch1.
            fused.extend(reversed(pyramid))
        concat_feature = torch.cat(tuple(fused), 1)

        rpn_feature = None
        if self.RPN3D_ENABLE and self.cat_img_feature:
            rpn_feature = self.rpnconv(concat_feature)

        output_feature = None
        if self.PlaneSweepVolume:
            output_feature = self.lastconv(concat_feature)

        return output_feature, rpn_feature
def setup_logger(filepath):
    """Configure and return the shared ``'example'`` logger.

    The logger gets one console handler and one file handler (named
    ``"file"``) appending to ``filepath``; both use the same timestamped
    format. Calling the function again does not stack duplicate handlers: the
    console handler is added once, and a file handler is added only for a
    path that is not already attached.

    Args:
        filepath: path of the log file. Its parent directory is created when
            it does not exist.

    Returns:
        The configured ``logging.Logger`` with level ``DEBUG``.
    """
    file_formatter = logging.Formatter(
        "[%(asctime)s %(filename)s:%(lineno)s] %(levelname)-8s %(message)s",
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    logger = logging.getLogger('example')

    # Exact type check on purpose: FileHandler is a StreamHandler subclass and
    # must not be mistaken for the console handler.
    has_console = any(type(h) is logging.StreamHandler for h in logger.handlers)
    if not has_console:
        handler = logging.StreamHandler()
        handler.setFormatter(file_formatter)
        logger.addHandler(handler)

    file_handle_name = "file"
    target = os.path.abspath(filepath)
    has_file = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == target
        for h in logger.handlers
    )
    if not has_file:
        log_dir = os.path.dirname(filepath)
        # An empty dirname means "current directory": nothing to create.
        if log_dir and not os.path.isdir(log_dir):
            os.makedirs(log_dir)
        file_handle = logging.FileHandler(filename=filepath, mode="a")
        file_handle.set_name(file_handle_name)
        file_handle.setFormatter(file_formatter)
        logger.addHandler(file_handle)

    logger.setLevel(logging.DEBUG)
    return logger
def scale_random_crop(input_size, scale_size=None, normalize=__imagenet_stats):
    """Build a transform that optionally rescales, then random-crops and normalises.

    Args:
        input_size: crop size passed to ``transforms.RandomCrop``.
        scale_size: size to rescale to before cropping. The rescale step is
            skipped when this is ``None`` or equal to ``input_size``.
        normalize: keyword arguments for ``transforms.Normalize``
            (``mean`` and ``std``).

    Returns:
        A ``transforms.Compose`` pipeline.
    """
    t_list = [
        transforms.RandomCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize(**normalize),
    ]
    if scale_size is not None and scale_size != input_size:
        # `Scale` is the legacy name of `Resize`; use whichever this
        # torchvision version provides.
        resize = getattr(transforms, 'Resize', None) or transforms.Scale
        t_list = [resize(scale_size)] + t_list
    # The pipeline must be returned; without it callers receive None.
    return transforms.Compose(t_list)
class Grayscale(object):
    """Convert a 3-channel tensor image to grayscale, replicated on all channels."""

    def __call__(self, img):
        """Return a grayscale copy of ``img``; the input is left untouched.

        Args:
            img: tensor indexed as ``img[channel]`` with at least three
                channels (presumably RGB in (C, H, W) layout - confirm
                against the caller).

        Returns:
            A clone of ``img`` whose first three channels all hold
            ``0.299 * c0 + 0.587 * c1 + 0.114 * c2``.
        """
        gs = img.clone()
        # Use the keyword `alpha` form; the positional (scalar, tensor)
        # overload of add_ is deprecated.
        gs[0].mul_(0.299).add_(gs[1], alpha=0.587).add_(gs[2], alpha=0.114)
        gs[1].copy_(gs[0])
        gs[2].copy_(gs[0])
        return gs
""" def __init__(self, transforms): self.transforms = transforms def __call__(self, img): if self.transforms is None: return img order = torch.randperm(len(self.transforms)) for i in order: img = self.transforms[i](img) return img class ColorJitter(RandomOrder): def __init__(self, brightness=0.4, contrast=0.4, saturation=0.4): self.transforms = [] if brightness != 0: self.transforms.append(Brightness(brightness)) if contrast != 0: self.transforms.append(Contrast(contrast)) if saturation != 0: self.transforms.append(Saturation(saturation)) def get_transform_unsym(left_img, right_img, size=[512, 960]): # photometric unsymmetric-augmentation random_brightness = np.random.uniform(0.95, 1.05,2) # random_gamma = np.random.uniform(0.8, 1.2,2) random_contrast = np.random.uniform(0.95, 1.05,2) left_img = torchvision.transforms.functional.adjust_brightness(left_img, random_brightness[0]) # left_img = torchvision.transforms.functional.adjust_gamma(left_img, random_gamma[0]) left_img = torchvision.transforms.functional.adjust_contrast(left_img, random_contrast[0]) right_img = torchvision.transforms.functional.adjust_brightness(right_img, random_brightness[1]) # right_img = torchvision.transforms.functional.adjust_gamma(right_img, random_gamma[1]) right_img = torchvision.transforms.functional.adjust_contrast(right_img, random_contrast[1]) right_img = np.asarray(right_img) left_img = np.asarray(left_img) return left_img, right_img ================================================ FILE: disparity/utils/readpfm.py ================================================ import re import numpy as np import sys def readPFM(file): file = open(file, 'rb') color = None width = None height = None scale = None endian = None header = file.readline().rstrip() if header == 'PF': color = True elif header == 'Pf': color = False else: raise Exception('Not a PFM file.') dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) if dim_match: width, height = map(int, dim_match.groups()) else: raise 
Exception('Malformed PFM header.') scale = float(file.readline().rstrip()) if scale < 0: # little-endian endian = '<' scale = -scale else: endian = '>' # big-endian data = np.fromfile(file, endian + 'f') shape = (height, width, 3) if color else (height, width) data = np.reshape(data, shape) data = np.flipud(data) return data, scale ================================================ FILE: disparity/utils/tensorboardx.py ================================================ from tensorboardX import SummaryWriter import numpy as np writer = SummaryWriter(log_dir='/disk1/hyj/DFAStereo/ver2.0/runs') for epoch in range(100): writer.add_scalar('/scalar/test',np.random.rand(),epoch) writer.add_scalars('/scalar/scalars_test',{'stage0 test':epoch*np.sin(epoch),'stage0 train':epoch*np.cos(epoch), 'stage1 test': epoch * np.sin(epoch)+20, 'stage1 train': epoch * np.cos(epoch)+20},epoch) writer.close() ================================================ FILE: disparity/utils/utils.py ================================================ # ------------------------------------------------------------------------------ # Copyright (c) NKU # Licensed under the MIT License. 
def smooth_L1_loss(GT, pred, args):
    """Mean absolute error between ``pred`` and ``GT`` over valid pixels.

    Despite the name, this computes a plain L1 (mean absolute) error, not the
    Huber-style smooth L1.

    Args:
        GT: ground-truth tensor.
        pred: predicted tensor, indexable with the same boolean mask as ``GT``.
        args: object exposing ``maxdisp``; only entries with
            ``GT < args.maxdisp`` contribute.

    Returns:
        Scalar tensor with the mean absolute error over the selected entries,
        or a zero that is still attached to the autograd graph when no entry
        is selected.
    """
    mask = GT < args.maxdisp
    abs_err = (pred[mask] - GT[mask]).abs()
    if abs_err.numel() == 0:
        # mean() of an empty tensor is NaN and would poison the gradients;
        # sum() of an empty tensor is a differentiable zero instead.
        return abs_err.sum()
    return abs_err.mean()
2) imgfov_pc_velo = pc_velo[fov_inds, :] imgfov_pts_2d = pts_2d[fov_inds, :] imgfov_pc_rect = calib.project_velo_to_rect(imgfov_pc_velo) depth_map = np.zeros((height, width)) - 1 imgfov_pts_2d = np.round(imgfov_pts_2d).astype(int) for i in range(imgfov_pts_2d.shape[0]): depth = imgfov_pc_rect[i, 2] depth_map[int(imgfov_pts_2d[i, 1]), int(imgfov_pts_2d[i, 0])] = depth if depth_as_disp: return depth_map disp_map = (calib.f_u * baseline) / depth_map return disp_map if __name__ == '__main__': parser = argparse.ArgumentParser(description='Generate Disparity') parser.add_argument('--data_path', type=str, default='~/Kitti/object/training/') parser.add_argument('--split_file', type=str, default='~/Kitti/object/train.txt') parser.add_argument('--right_calib', action='store_true', default=False) args = parser.parse_args() assert os.path.isdir(args.data_path) lidar_dir = args.data_path + '/velodyne/' calib_dir = args.data_path + '/calib/' image_dir = args.data_path + '/image_2/' if DEPTH_AS_DISP: disparity_dir = args.data_path + '/depth/' else: disparity_dir = args.data_path + '/disparity/' assert os.path.isdir(lidar_dir) assert os.path.isdir(calib_dir) assert os.path.isdir(image_dir) if not os.path.isdir(disparity_dir): os.makedirs(disparity_dir) lidar_files = [x for x in os.listdir(lidar_dir) if x[-3:] == 'bin'] lidar_files = sorted(lidar_files) assert os.path.isfile(args.split_file) with open(args.split_file, 'r') as f: file_names = [x.strip() for x in f.readlines()] for fn in lidar_files: predix = fn[:-4] if predix not in file_names: continue calib_file = '{}/{}.txt'.format(calib_dir, predix) calib = kitti_util.Calibration(calib_file, right_calib=args.right_calib) # load point cloud lidar = np.fromfile(lidar_dir + '/' + fn, dtype=np.float32).reshape((-1, 4))[:, :3] image_file = '{}/{}.png'.format(image_dir, predix) image = imageio.imread(image_file) height, width = image.shape[:2] print('calib baseline {}'.format(calib.baseline)) disp = generate_dispariy_from_velo(lidar, 
def project_disp_to_depth(calib, disp, max_high, baseline=0.54):
    """Turn a disparity map into a filtered point cloud.

    Only pixels with strictly positive disparity are used. For those pixels
    depth is ``calib.f_u * baseline / disparity``; each pixel is handed to
    ``calib.project_image_to_velo`` as a ``(column, row, depth)`` triple. The
    input array is not modified.

    Args:
        calib: object exposing ``f_u`` and ``project_image_to_velo(points)``.
        disp: 2-D disparity array; non-positive entries are treated as invalid.
        max_high: exclusive upper bound on the third coordinate of the
            projected points.
        baseline: stereo baseline used in the depth formula.

    Returns:
        The rows of the projected cloud whose first coordinate is ``>= 0`` and
        whose third coordinate is ``< max_high``.
    """
    disp = np.asarray(disp)
    mask = disp > 0

    # np.nonzero walks the mask in row-major order, the same order used by
    # boolean indexing, so rows/cols line up with disp[mask].
    rows, cols = np.nonzero(mask)
    # Depth is computed for valid pixels only, so invalid pixels need no
    # placeholder denominator and the caller's array is never written to.
    depth = calib.f_u * baseline / disp[mask]

    points = np.stack([cols, rows, depth], axis=1)
    cloud = calib.project_image_to_velo(points)
    valid = (cloud[:, 0] >= 0) & (cloud[:, 2] < max_high)
    return cloud[valid]
lidar = project_disp_to_depth(calib, disp_map, args.max_high) # pad 1 in the indensity dimension lidar = np.concatenate([lidar, np.ones((lidar.shape[0], 1))], 1) lidar = lidar.astype(np.float32) lidar.tofile('{}/{}.bin'.format(args.save_dir, predix)) print('Finish Depth {}'.format(predix)) ================================================ FILE: preprocessing/kitti_util.py ================================================ """ Helper methods for loading and parsing KITTI data. Author: Charles R. Qi Date: September 2017 """ from __future__ import print_function import numpy as np class Calibration(object): ''' Calibration matrices and utils 3d XYZ in