Repository: baowenbo/DAIN Branch: master Commit: 7c727aca5676 Files: 123 Total size: 560.1 KB Directory structure: gitextract_7t87l58_/ ├── .gitignore ├── AverageMeter.py ├── Colab_DAIN.ipynb ├── LICENSE ├── MegaDepth/ │ ├── LICENSE │ ├── MegaDepth_model.py │ ├── README.md │ ├── SDR_compute.py │ ├── __init__.py │ ├── data/ │ │ ├── __init__.py │ │ ├── aligned_data_loader.py │ │ ├── base_data_loader.py │ │ ├── data_loader.py │ │ └── image_folder.py │ ├── models/ │ │ ├── HG_model.py │ │ ├── __init__.py │ │ ├── base_model.py │ │ └── models.py │ ├── options/ │ │ ├── __init__.py │ │ ├── base_options.py │ │ ├── test_options.py │ │ └── train_options.py │ ├── pytorch_DIW_scratch.py │ ├── rmse_error_main.py │ └── util/ │ ├── __init__.py │ ├── html.py │ ├── image_pool.py │ ├── png.py │ ├── util.py │ └── visualizer.py ├── PWCNet/ │ ├── PWCNet.py │ ├── __init__.py │ ├── correlation_package_pytorch1_0/ │ │ ├── __init__.py │ │ ├── build.sh │ │ ├── clean.sh │ │ ├── correlation.py │ │ ├── correlation_cuda.cc │ │ ├── correlation_cuda_kernel.cu │ │ ├── correlation_cuda_kernel.cuh │ │ └── setup.py │ └── models/ │ ├── PWCNet.py │ └── __init__.py ├── README.md ├── Resblock/ │ ├── BasicBlock.py │ └── __init__.py ├── S2D_models/ │ ├── S2DF.py │ └── __init__.py ├── Stack.py ├── balancedsampler.py ├── colab_interpolate.py ├── datasets/ │ ├── Vimeo_90K_interp.py │ ├── __init__.py │ └── listdatasets.py ├── demo_MiddleBury.py ├── demo_MiddleBury_slowmotion.py ├── environment.yaml ├── loss_function.py ├── lr_scheduler.py ├── my_args.py ├── my_package/ │ ├── DepthFlowProjection/ │ │ ├── DepthFlowProjectionLayer.py │ │ ├── DepthFlowProjectionModule.py │ │ ├── __init__.py │ │ ├── depthflowprojection_cuda.cc │ │ ├── depthflowprojection_cuda_kernel.cu │ │ ├── depthflowprojection_cuda_kernel.cuh │ │ └── setup.py │ ├── FilterInterpolation/ │ │ ├── FilterInterpolationLayer.py │ │ ├── FilterInterpolationModule.py │ │ ├── __init__.py │ │ ├── filterinterpolation_cuda.cc │ │ ├── 
filterinterpolation_cuda_kernel.cu │ │ ├── filterinterpolation_cuda_kernel.cuh │ │ └── setup.py │ ├── FlowProjection/ │ │ ├── FlowProjectionLayer.py │ │ ├── FlowProjectionModule.py │ │ ├── __init__.py │ │ ├── flowprojection_cuda.cc │ │ ├── flowprojection_cuda_kernel.cu │ │ ├── flowprojection_cuda_kernel.cuh │ │ └── setup.py │ ├── Interpolation/ │ │ ├── InterpolationLayer.py │ │ ├── InterpolationModule.py │ │ ├── __init__.py │ │ ├── interpolation_cuda.cc │ │ ├── interpolation_cuda_kernel.cu │ │ ├── interpolation_cuda_kernel.cuh │ │ └── setup.py │ ├── InterpolationCh/ │ │ ├── InterpolationChLayer.py │ │ ├── InterpolationChModule.py │ │ ├── __init__.py │ │ ├── interpolationch_cuda.cc │ │ ├── interpolationch_cuda_kernel.cu │ │ ├── interpolationch_cuda_kernel.cuh │ │ └── setup.py │ ├── MinDepthFlowProjection/ │ │ ├── __init__.py │ │ ├── minDepthFlowProjectionLayer.py │ │ ├── minDepthFlowProjectionModule.py │ │ ├── mindepthflowprojection_cuda.cc │ │ ├── mindepthflowprojection_cuda_kernel.cu │ │ ├── mindepthflowprojection_cuda_kernel.cuh │ │ └── setup.py │ ├── SeparableConv/ │ │ ├── SeparableConvLayer.py │ │ ├── SeparableConvModule.py │ │ ├── __init__.py │ │ ├── separableconv_cuda.cc │ │ ├── separableconv_cuda_kernel.cu │ │ ├── separableconv_cuda_kernel.cuh │ │ └── setup.py │ ├── SeparableConvFlow/ │ │ ├── SeparableConvFlowLayer.py │ │ ├── SeparableConvFlowModule.py │ │ ├── __init__.py │ │ ├── separableconvflow_cuda.cc │ │ ├── separableconvflow_cuda_kernel.cu │ │ ├── separableconvflow_cuda_kernel.cuh │ │ └── setup.py │ ├── build.sh │ ├── clean.sh │ ├── compiler_args.py │ └── test_module.py ├── networks/ │ ├── DAIN.py │ ├── DAIN_slowmotion.py │ └── __init__.py └── train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Ignore Git here .git # But not these files... 
# !.gitignore checkpoints/test_local/opt.txt PWCNet/pwc_net.pth.tar MegaDepth/checkpoints/* model_weights/* MiddleBurySet/* .nfs* # Created by .ignore support plugin (hsz.mobi) ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject ### VirtualEnv template # Virtualenv # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ .Python [Bb]in [Ii]nclude [Ll]ib [Ll]ib64 [Ll]ocal [Ss]cripts pyvenv.cfg .venv pip-selfcheck.json ### JetBrains template # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff: .idea/workspace.xml .idea/tasks.xml .idea/dictionaries .idea/vcs.xml .idea/jsLibraryMappings.xml # Sensitive or high-churn files: .idea/dataSources.ids .idea/dataSources.xml .idea/dataSources.local.xml .idea/sqlDataSources.xml .idea/dynamic.xml .idea/uiDesigner.xml # Gradle: .idea/gradle.xml .idea/libraries # Mongo Explorer plugin: 
class AverageMeter(object):
    """Track the most recent value and the running weighted average.

    Attributes (all set to 0 by ``reset``):
        val: the value passed to the latest ``update`` call.
        sum: running total of ``val * n`` over all updates.
        count: running total of the weights ``n``.
        avg: ``sum / count``, or 0 while ``count`` is still 0.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear every statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the average.

        Args:
            val: the newly observed value.
            n: weight of the observation (e.g. a batch size). Defaults to 1.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. update(x, n=0) right after reset) used
        # to raise ZeroDivisionError; keep the reset value of 0 instead.
        self.avg = self.sum / self.count if self.count else 0
It's rather meant as a joke.\n", "\n", "[Styler00Dollar's fork](https://github.com/styler00dollar/DAIN) / [Alpha's fork](https://github.com/AlphaGit/DAIN)\n", "\n", "A simple guide:\n", "- Upload this ` .ipynb` file to your Google Colab.\n", "- Create a folder inside of Google Drive named \"DAIN\"\n", "- Change the configurations in the next cell\n", "- Run cells one by one\n", "\n", "Stuff that should be improved:\n", "- Alpha channel will be removed automatically and won't be added back. Anything related to alpha will be converted to black.\n", "- Adding configuration to select speed\n", "- Detect scenes to avoid interpolating scene-changes\n", "- Auto-resume\n", "- Copy `start_frame` - `end_frame` audio from original input to final output\n" ] }, { "cell_type": "code", "metadata": { "id": "enKoi0TR2fOD", "cellView": "form" }, "source": [ "################# Required Configurations ############################\n", "\n", "#@markdown # Required Configuration\n", "#@markdown Use the values in here to configure what you'd like DAIN to do.\n", "\n", "#@markdown ## Input file\n", "#@markdown Path (relative to the root of your Google Drive) to the input file. For instance, if you save your `example.mkv` file in your Google Drive, inside a `videos` folder, the path would be: `videos/example.mkv`. Currenly videos and gifs are supported.\n", "INPUT_FILEPATH = \"DAIN/input.mp4\" #@param{type:\"string\"}\n", "\n", "#@markdown ## Output file\n", "#@markdown Output file path: path (relative to the root of your Google Drive) for the output file. It will also determine the filetype in the destination. 
`.mp4` is recommended for video input, `.gif` for gif inputs.\n", "OUTPUT_FILE_PATH = \"DAIN/output.mp4\" #@param{type:\"string\"}\n", "\n", "################# Optional configurations ############################\n", "\n", "#@markdown # Optional Configuration\n", "#@markdown Parameters below can be left with their defaults, but feel free to adapt them to your needs.\n", "\n", "#@markdown ## Target FPS\n", "#@markdown how many frames per second should the result have. This will determine how many intermediate images are interpolated.\n", "TARGET_FPS = 60 #@param{type:\"number\"}\n", "\n", "#@markdown ## Frame input directory\n", "#@markdown A path, relative to your GDrive root, where you already have the list of frames in the format 00001.png, 00002.png, etc.\n", "FRAME_INPUT_DIR = '/content/DAIN/input_frames' #@param{type:\"string\"}\n", "\n", "#@markdown ## Frame output directory\n", "#@markdown A path, relative to your GDrive root, where you want the generated frame.\n", "FRAME_OUTPUT_DIR = '/content/DAIN/output_frames' #@param{type:\"string\"}\n", "\n", "#@markdown ## Start Frame\n", "#@markdown First frame to consider from the video when processing.\n", "START_FRAME = 1 #@param{type:\"number\"}\n", "\n", "#@markdown ## End Frame\n", "#@markdown Last frame to consider from the video when processing. To use the whole video use `-1`.\n", "END_FRAME = -1 #@param{type:\"number\"}\n", "\n", "#@markdown ## Seamless playback\n", "#@markdown Creates a seamless loop by using the first frame as last one as well. Set this to True this if loop is intended.\n", "SEAMLESS = False #@param{type:\"boolean\"}\n", "\n", "#@markdown ## Auto-remove PNG directory\n", "#@markdown Auto-delete output PNG dir after ffmpeg video creation. 
Set this to `False` if you want to keep the PNG files.\n", "AUTO_REMOVE = True #@param{type:\"boolean\"}" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "N9cGwalNeyk9", "cellView": "form" }, "source": [ "#@title Connect Google Drive\n", "from google.colab import drive\n", "drive.mount('/content/gdrive')\n", "print('Google Drive connected.')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "irzjv1x4e3S4", "cellView": "form" }, "source": [ "#@title Check your current GPU\n", "# If you are lucky, you get 16GB VRAM. If you are not lucky, you get less. VRAM is important. The more VRAM, the higher the maximum resolution will go.\n", "\n", "# 16GB: Can handle 720p. 1080p will procude an out-of-memory error. \n", "# 8GB: Can handle 480p. 720p will produce an out-of-memory error.\n", "\n", "!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "UYHTTP91oMvh" }, "source": [ "# Install dependencies.\n", "\n", "This next step may take somewhere between 15-20 minutes. Run this only once at startup.\n", "\n", "Look for the \"Finished installing dependencies\" message." ] }, { "cell_type": "code", "metadata": { "id": "e5AHGetTRacZ", "cellView": "form" }, "source": [ "#@title Setup everything. This takes a while. 
Just wait ~20 minutes in total.\n", "\n", "# Install old pytorch to avoid faulty output\n", "%cd /content/\n", "!wget -c https://repo.anaconda.com/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n", "!chmod +x Miniconda3-4.5.4-Linux-x86_64.sh\n", "!bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local\n", "!conda install pytorch==1.1 cudatoolkit torchvision -c pytorch -y\n", "!conda install ipykernel -y\n", "\n", "!pip install scipy==1.1.0\n", "!pip install imageio\n", "!CUDA_VISIBLE_DEVICES=0\n", "!sudo apt-get install imagemagick imagemagick-doc\n", "print(\"Finished installing dependencies.\")\n", "\n", "# Clone DAIN sources\n", "%cd /content\n", "!git clone -b master --depth 1 https://github.com/baowenbo/DAIN /content/DAIN\n", "%cd /content/DAIN\n", "!git log -1\n", "\n", "# Building DAIN\n", "%cd /content/DAIN/my_package/\n", "!./build.sh\n", "print(\"Building #1 done.\")\n", "\n", "# Building DAIN PyTorch correlation package.\n", "%cd /content/DAIN/PWCNet/correlation_package_pytorch1_0\n", "!./build.sh\n", "print(\"Building #2 done.\")\n", "\n", "# Downloading pre-trained model\n", "%cd /content/DAIN\n", "!mkdir model_weights\n", "!wget -O model_weights/best.pth http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zm5kn6vTncL4", "cellView": "form" }, "source": [ "#@title Detecting FPS of input file.\n", "%shell yes | cp -f /content/gdrive/My\\ Drive/{INPUT_FILEPATH} /content/DAIN/\n", "\n", "import os\n", "filename = os.path.basename(INPUT_FILEPATH)\n", "\n", "import cv2\n", "cap = cv2.VideoCapture(f'/content/DAIN/{filename}')\n", "\n", "fps = cap.get(cv2.CAP_PROP_FPS)\n", "print(f\"Input file has {fps} fps\")\n", "\n", "if(fps/TARGET_FPS>0.5):\n", " print(\"Define a higher fps, because there is not enough time for new frames. (Old FPS)/(New FPS) should be lower than 0.5. 
{
  "cell_type": "code",
  "metadata": {
    "id": "9YNva-GuKq4Y",
    "cellView": "form"
  },
  "source": [
    "#@title ffmpeg extract - Generating individual frame PNGs from the source file.\n",
    "%shell rm -rf '{FRAME_INPUT_DIR}'\n",
    "%shell mkdir -p '{FRAME_INPUT_DIR}'\n",
    "\n",
    "if (END_FRAME==-1):\n",
    "    %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=gte(n\\,{START_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\n",
    "else:\n",
    "    %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=between(n\\,{START_FRAME}\\,{END_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\n",
    "\n",
    "from IPython.display import clear_output\n",
    "clear_output()\n",
    "\n",
    "png_generated_count_command_result = %shell ls '{FRAME_INPUT_DIR}' | wc -l\n",
    "frame_count = int(png_generated_count_command_result.output.strip())\n",
    "\n",
    "import shutil\n",
    "if SEAMLESS:\n",
    "    frame_count += 1\n",
    "    first_frame = f\"{FRAME_INPUT_DIR}/00001.png\"\n",
    "    # frame_count is an int, so zero-pad it with a format spec (int has no .zfill()).\n",
    "    new_last_frame = f\"{FRAME_INPUT_DIR}/{frame_count:05d}.png\"\n",
    "    shutil.copyfile(first_frame, new_last_frame)\n",
    "\n",
    "print(f\"{frame_count} frame PNGs generated.\")\n",
    "\n",
    "#Checking if PNGs do have alpha\n",
    "import subprocess as sp\n",
    "%cd {FRAME_INPUT_DIR}\n",
    "channels = sp.getoutput('identify -format %[channels] 00001.png')\n",
    "print (f\"{channels} detected\")\n",
    "\n",
    "# Removing alpha if detected\n",
    "if \"a\" in channels:\n",
    "    print(\"Alpha channel detected and will be removed.\")\n",
    "    print(sp.getoutput('find . -name \"*.png\" -exec convert \"{}\" -alpha off PNG24:\"{}\" \\;'))"
  ],
  "execution_count": null,
  "outputs": []
}
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MegaDepth/LICENSE ================================================ MIT License Copyright (c) 2018 Zhengqi Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
def HourGlass(pretrained=None):
    """Create a model through ``create_model`` and return its ``netG`` network.

    Args:
        pretrained: forwarded unchanged as the second argument of
            ``create_model``. Presumably selects pretrained weights to
            load -- TODO confirm against ``create_model``.

    Returns:
        The ``netG`` attribute of the object built by ``create_model``.
    """
    options = TrainOptions().parse()
    wrapper = create_model(options, pretrained)
    # Hand back netG (the actual nn.Module), not the wrapper that owns it.
    return wrapper.netG
#### Single-view depth prediction on any Internet photo: * Download pretrained model from: http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth and put it in "checkpoints/test_local/best_generalization_net_G.pth" * In python file "models/HG_model.py", in init function, change to "model_parameters = self.load_network(model, 'G', 'best_generalization')" * Run the demo code ```bash python demo.py ``` You should see an inverse depth prediction saved as demo.png from an original photo demo.jpg. If you want to use RGB maps for visualization, like the figures in our paper, you have to install/run semantic segmentation from https://github.com/kazuto1011/pspnet-pytorch trained on ADE20K to mask out sky, because inconsistent depth prediction of unmasked sky will not make RGB visualization reasonable. #### Evaluation on the MegaDepth test splits: * Download MegaDepth V1 dataset from project website: http://www.cs.cornell.edu/projects/megadepth/. * Download pretrained model (specific for MD dataset) from http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_vanila_net_G.pth and put it in "checkpoints/test_local/best_vanila_net_G.pth" * Download test list files from http://www.cs.cornell.edu/projects/megadepth/dataset/data_lists/test_lists.tar.gz; it should include two folders corresponding to images with landscape and portrait orientations.
import time
import torch
import sys
from options.train_options import TrainOptions

# Options are parsed before the data/model modules are imported, exactly as
# in the original statement order.
opt = TrainOptions().parse()  # set CUDA_VISIBLE_DEVICES before import torch
from data.data_loader import CreateDataLoader_TEST
from models.models import create_model

dataset_root = "/phoenix/S6/zl548/"

# Landscape test split.
test_list_dir_l = dataset_root + '/MegaDpeth_code/test_list/landscape/'
input_height = 240
input_width = 320
test_data_loader_l = CreateDataLoader_TEST(dataset_root, test_list_dir_l, input_height, input_width)
test_dataset_l = test_data_loader_l.load_data()
test_dataset_size_l = len(test_data_loader_l)
print('========================= test L images = %d' % test_dataset_size_l)

# Portrait test split (height and width swapped).
test_list_dir_p = dataset_root + '/MegaDpeth_code/test_list/portrait/'
input_height = 320
input_width = 240
test_data_loader_p = CreateDataLoader_TEST(dataset_root, test_list_dir_p, input_height, input_width)
test_dataset_p = test_data_loader_p.load_data()
test_dataset_size_p = len(test_data_loader_p)
print('========================= test P images = %d' % test_dataset_size_p)

model = create_model(opt)

batch_size = 32
diw_index = 0
total_steps = 0
best_loss = 100

# Running totals shared by both splits; indices 0, 1 and 2 are reported
# below as the "equal", "unequal" and overall figures respectively.
error_list = [0] * 3
total_list = [0] * 3

list_l = range(test_dataset_size_l)
list_p = range(test_dataset_size_p)


def _accumulate_SDR(model, dataset):
    """Evaluate each batch of ``dataset`` and add to the global totals.

    After every batch the running ratios (accumulated over everything seen
    so far, across splits) are printed.
    """
    for batch in dataset:
        error, samples = model.evaluate_SDR(batch['img_1'], batch['target_1'])

        for k in range(3):
            error_list[k] += error[k]
            total_list[k] += samples[k]

        print("EQUAL ", error_list[0]/float(total_list[0]))
        print("INEQUAL ", error_list[1]/float(total_list[1]))
        print("TOTAL ",error_list[2]/float(total_list[2]))


def test_SDR(model):
    """Run the SDR evaluation on the landscape then the portrait split.

    Puts ``model`` into evaluation mode, accumulates error/sample counts
    into the module-level ``error_list`` / ``total_list`` and prints a final
    summary of the three ratios.
    """
    print("============================= TEST SDR============================")
    model.switch_to_eval()

    for split in (test_dataset_l, test_dataset_p):
        _accumulate_SDR(model, split)

    print("=========================================================SDR Summary =====================")
    print("Equal SDR:\t" , float(error_list[0])/ float(total_list[0]))
    print("Unequal SDR:\t" , float(error_list[1])/ float(total_list[1]))
    print("SDR:\t" , float(error_list[2])/ float(total_list[2]))


print("WE ARE TESTING SDR!!!!")
test_SDR(model)
class PairedData(object):
    """Iterator that re-packages ``(image, target)`` batches as dicts.

    Each item produced is ``{'img_1': image, 'target_1': target}``.
    ``self.iter`` counts the ``__next__`` calls made in the current pass.
    """

    def __init__(self, data_loader, flip):
        self.data_loader = data_loader
        self.flip = flip
        self._restart()

    def _restart(self):
        """Start a fresh pass over the wrapped loader and zero the counter."""
        self.data_loader_iter = iter(self.data_loader)
        self.iter = 0

    def __iter__(self):
        self._restart()
        return self

    def __next__(self):
        # The counter is bumped before fetching, so it is incremented even
        # on the call that ends the pass with StopIteration.
        self.iter += 1
        image, target = next(self.data_loader_iter)
        return dict(img_1=image, target_1=target)


class AlignedDataLoader(BaseDataLoader):
    """Wraps an ``ImageFolder`` dataset in a 16-per-batch ``DataLoader``."""

    def __init__(self,_root, _list_dir, _input_height, _input_width, _is_flip, _shuffle):
        self.dataset = ImageFolder(
            root=_root,
            list_dir=_list_dir,
            input_height=_input_height,
            input_width=_input_width,
            transform=None,
            is_flip=_is_flip,
        )
        batches = torch.utils.data.DataLoader(
            self.dataset, batch_size=16, shuffle=_shuffle, num_workers=3)
        # PairedData is always created with flip disabled here.
        self.paired_data = PairedData(batches, False)

    def name(self):
        """Return the identifier string of this loader."""
        return 'RMSEDataLoader'

    def load_data(self):
        """Return the ``PairedData`` iterator over the batches."""
        return self.paired_data

    def __len__(self):
        """Return the number of samples in the underlying dataset."""
        return len(self.dataset)


class AlignedDataLoader_TEST(BaseDataLoader):
    """Wraps an ``ImageFolder_TEST`` dataset in an unshuffled, batch-size-1
    ``DataLoader``."""

    def __init__(self,_root, _list_dir, _input_height, _input_width):
        self.dataset = ImageFolder_TEST(
            root=_root,
            list_dir=_list_dir,
            _input_height=_input_height,
            _input_width=_input_width,
        )
        batches = torch.utils.data.DataLoader(
            self.dataset, batch_size=1, shuffle=False, num_workers=3)
        # PairedData is always created with flip disabled here.
        self.paired_data = PairedData(batches, False)

    def name(self):
        """Return the identifier string of this loader."""
        return 'TestSDRDataLoader'

    def load_data(self):
        """Return the ``PairedData`` iterator over the batches."""
        return self.paired_data

    def __len__(self):
        """Return the number of samples in the underlying dataset."""
        return len(self.dataset)
class BaseDataLoader():
    """Minimal base class for the data-loader wrappers.

    ``load_data`` is a placeholder that concrete loaders are meant to
    override.
    """

    def __init__(self):
        pass

    @staticmethod
    def load_data():
        """Return ``None`` (placeholder implementation).

        Declared static because the original signature took no ``self``:
        it worked as ``BaseDataLoader.load_data()`` but raised TypeError
        when called on an instance. As a staticmethod both forms work.
        """
        return None


def CreateDataLoader(_root, _list_dir, _input_height, _input_width, is_flip = True, shuffle = True):
    """Build an ``AlignedDataLoader`` from the given arguments.

    All arguments are forwarded positionally, in the order
    ``(_root, _list_dir, _input_height, _input_width, is_flip, shuffle)``.
    """
    # Kept as a function-local import, as in the original.
    from data.aligned_data_loader import AlignedDataLoader
    return AlignedDataLoader(_root, _list_dir, _input_height, _input_width, is_flip, shuffle)


def CreateDataLoader_TEST(_root, _list_dir, _input_height, _input_width):
    """Build an ``AlignedDataLoader_TEST`` from the given arguments.

    All arguments are forwarded positionally, in the order
    ``(_root, _list_dir, _input_height, _input_width)``.
    """
    # Kept as a function-local import, as in the original.
    from data.aligned_data_loader import AlignedDataLoader_TEST
    return AlignedDataLoader_TEST(_root, _list_dir, _input_height, _input_width)


def make_dataset(list_dir):
    """Load the pickled image and target lists found under ``list_dir``.

    ``list_dir`` is used as a plain string prefix (no separator is
    inserted), so it must end with a path separator to address files
    inside a directory.

    Args:
        list_dir: prefix of ``imgs_MD.p`` and ``targets_MD.p``.

    Returns:
        Tuple ``(images_list, targets_list)`` exactly as unpickled.
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at list files from a trusted source.
    # ``with`` closes each file even when unpickling raises.
    with open(list_dir + "imgs_MD.p", "rb") as images_file:
        images_list = pickle.load(images_file)

    with open(list_dir + "targets_MD.p", "rb") as targets_file:
        targets_list = pickle.load(targets_file)

    return images_list, targets_list
ImageFolder(data.Dataset): def __init__(self, root, list_dir, input_height, input_width, transform=None, loader=None, is_flip = True): # load image list from hdf5 img_list , targets_list = make_dataset(list_dir) if len(img_list) == 0: raise(RuntimeError("Found 0 images in: " + root + "\n" "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))) # img_list_1, img_list_2 = selfshuffle_dataset(img_list) self.root = root self.list_dir = list_dir self.img_list = img_list self.targets_list = targets_list self.transform = transform # self.loader = loader self.input_height = input_height self.input_width = input_width self.is_flip = is_flip def load_MD(self, img_path, depth_path): MD_img = np.float32(io.imread(img_path))/255.0 hdf5_file_read = h5py.File(depth_path,'r') gt = hdf5_file_read.get('/depth') gt = np.array(gt) assert(gt.shape[0] == MD_img.shape[0]) assert(gt.shape[1] == MD_img.shape[1]) color_rgb = np.zeros((self.input_height,self.input_width,3)) MD_img = resize(MD_img, (self.input_height, self.input_width), order = 1) if len(MD_img.shape) == 2: color_rgb[:,:,0] = MD_img.copy() color_rgb[:,:,1] = MD_img.copy() color_rgb[:,:,2] = MD_img.copy() else: color_rgb = MD_img.copy() if np.sum(gt > 1e-8) > 10: gt[ gt > np.percentile(gt[gt > 1e-8], 98)] = 0 gt[ gt < np.percentile(gt[gt > 1e-8], 1)] = 0 max_depth = np.max(gt) + 1e-9 gt = gt/max_depth gt = resize(gt, (self.input_height, self.input_width), order = 0) gt = gt*max_depth mask = np.float32(gt > 1e-8) color_rgb = np.ascontiguousarray(color_rgb) gt = np.ascontiguousarray(gt) mask = np.ascontiguousarray(mask) hdf5_file_read.close() return color_rgb, gt, mask def __getitem__(self, index): # 00xx/1/ targets_1 = {} # targets_1['L'] = [] targets_1['path'] = [] img_path_suff = self.img_list[index] targets_path_suff = self.targets_list[index] img_path = self.root + "/MegaDepth_v1/" + img_path_suff depth_path = self.root + "/MegaDepth_v1/" + targets_path_suff img, gt, mask = self.load_MD(img_path, depth_path) gt[mask 
class ImageFolder_TEST(data.Dataset):
    """Dataset yielding images with sparse SfM ordinal pairs (SDR test).

    Each item is ``(image, targets)``. ``targets['has_SfM_feature']`` says
    whether the ``sdr_*`` lists carry point pairs for that image.
    """

    def __init__(self, root, list_dir, _input_height, _input_width):
        """Read the image list and remember the target input size.

        Args:
            root: dataset root directory; images are looked up under
                ``root + "/MegaDepth_v1/"`` and features under
                ``root + "/sparse_features/"``.
            list_dir: passed to ``make_dataset`` to obtain the image list.
            _input_height: height every image is resized to.
            _input_width: width every image is resized to.

        Raises:
            RuntimeError: if ``make_dataset`` returns no images.
        """
        # load image list from hdf5
        img_list, targets_list = make_dataset(list_dir)
        if len(img_list) == 0:
            raise(RuntimeError("Found 0 images in: " + root + "\n"
                               "Supported image extensions are: " + ",".join(IMG_EXTENSIONS)))
        self.root = root
        self.list_dir = list_dir
        self.img_list = img_list
        self.input_height = _input_height
        self.input_width = _input_width
        self.half_window = 1

    def load_SfM_ORD(self, img_path, targets_path):
        """Load a resized image and its SfM point pairs.

        Args:
            img_path: path of the image file.
            targets_path: path of an HDF5 file with a ``/SfM_features``
                dataset. Rows 0 and 2 are scaled by the input height,
                rows 1 and 3 by the input width, and row 4 is returned
                unchanged as the ordinal label.

        Returns:
            ``(color_rgb, y_A, x_A, y_B, x_B, ord_)``. For a 4-channel
            image the five trailing values are the integer ``0`` and the
            feature file is not read.
        """
        sfm_image = np.float32(io.imread(img_path)) / 255.0
        resized_sfm_img = resize(sfm_image, (self.input_height, self.input_width), order=1)

        color_rgb = np.zeros((self.input_height, self.input_width, 3))
        if len(sfm_image.shape) == 2:
            # Grayscale input: replicate the single channel three times.
            color_rgb[:, :, 0] = resized_sfm_img.copy()
            color_rgb[:, :, 1] = resized_sfm_img.copy()
            color_rgb[:, :, 2] = resized_sfm_img.copy()
        else:
            color_rgb = resized_sfm_img.copy()

        if color_rgb.shape[2] == 4:
            # Images with an alpha channel are reported with placeholders.
            return color_rgb, 0, 0, 0, 0, 0

        # Context manager closes the HDF5 handle even if reading fails.
        with h5py.File(targets_path, 'r') as hdf5_file_read:
            gt = np.array(hdf5_file_read.get('/SfM_features'))

        y_A = np.round(gt[0, :] * float(self.input_height))
        x_A = np.round(gt[1, :] * float(self.input_width))
        y_B = np.round(gt[2, :] * float(self.input_height))
        x_B = np.round(gt[3, :] * float(self.input_width))
        ord_ = gt[4, :]

        return color_rgb, y_A, x_A, y_B, x_B, ord_

    def __getitem__(self, index):
        """Return ``(final_img, targets)`` for the image at ``index``.

        ``final_img`` is a float tensor in channel-first order. When the
        image or its feature file is missing, or the loader returned
        placeholders, the image is all zeros, the ``sdr_*`` lists stay
        empty and ``targets['has_SfM_feature']`` is False.
        """
        # 00xx/1/
        targets_1 = {}
        # targets_1['L'] = []
        targets_1['path'] = []
        targets_1['sdr_xA'] = []
        targets_1['sdr_yA'] = []
        targets_1['sdr_xB'] = []
        targets_1['sdr_yB'] = []
        targets_1['sdr_gt'] = []

        img_path_suff = self.img_list[index]
        img_path = self.root + "/MegaDepth_v1/" + img_path_suff

        path_parts = img_path_suff.split('/')
        folder_name = path_parts[-4]
        img_name = path_parts[-1]
        sparse_sift_path = self.root + "/sparse_features/" + folder_name + "/" + img_name + ".h5"

        features = None
        if os.path.isfile(sparse_sift_path) and os.path.isfile(img_path):
            features = self.load_SfM_ORD(img_path, sparse_sift_path)

        # load_SfM_ORD signals an unusable (4-channel) image with scalar
        # placeholders; torch.from_numpy would raise TypeError on them, so
        # such samples are handled like the "no sift features" case.
        if features is not None and isinstance(features[5], np.ndarray):
            img, y_A, x_A, y_B, x_B, ordinal = features
            targets_1['sdr_xA'].append(torch.from_numpy(x_A).long())
            targets_1['sdr_yA'].append(torch.from_numpy(y_A).long())
            targets_1['sdr_xB'].append(torch.from_numpy(x_B).long())
            targets_1['sdr_yB'].append(torch.from_numpy(y_B).long())
            targets_1['sdr_gt'].append(torch.from_numpy(ordinal).float())
            targets_1['has_SfM_feature'] = True
        else:
            # no sift features
            img = np.zeros((self.input_height, self.input_width, 3))
            targets_1['has_SfM_feature'] = False

        final_img = torch.from_numpy(np.transpose(img, (2, 0, 1))).contiguous().float()

        return final_img, targets_1

    def __len__(self):
        """Return the number of listed images."""
        return len(self.img_list)
def batch_classify(self, z_A_arr, z_B_arr, ground_truth):
    """Compare predicted ordinal depth relations with ground-truth labels.

    A pair is labelled 1 when ``z_A / z_B`` exceeds 1.1, -1 when it is
    below ``1 / 1.1`` and 0 otherwise; a label counts as an error when it
    differs from the ground truth.

    Args:
        z_A_arr: predicted depths at the first point of every pair.
        z_B_arr: predicted depths at the second point of every pair.
        ground_truth: ordinal labels; entries equal to 0 are counted as
            "equal" pairs, all others as "inequal" pairs. The tensor is
            left unmodified.

    Returns:
        ``(error_list, count_list)`` where each list holds the
        ``[equal, inequal, total]`` numbers of errors and of samples.
    """
    threashold = 1.1

    depth_ratio = torch.div(z_A_arr, z_B_arr).cpu()

    estimated_labels = torch.zeros(depth_ratio.size(0))
    estimated_labels[depth_ratio > threashold] = 1
    estimated_labels[depth_ratio < (1 / threashold)] = -1

    # Mask of "inequal" pairs. Counting through the mask instead of
    # overwriting ground_truth keeps the caller's labels intact, so the
    # same targets can be evaluated more than once.
    is_inequal = ground_truth != 0

    diff = estimated_labels - ground_truth
    diff[diff != 0] = 1

    # error
    inequal_error_count = torch.sum(diff[is_inequal])
    error_count = torch.sum(diff)
    equal_error_count = error_count - inequal_error_count

    # total
    total_count = depth_ratio.size(0)
    inequal_count_total = torch.sum(is_inequal.type_as(ground_truth))
    equal_total_count = total_count - inequal_count_total

    error_list = [equal_error_count, inequal_error_count, error_count]
    count_list = [equal_total_count, inequal_count_total, total_count]

    return error_list, count_list
def rmse_Loss(self, log_prediction_d, mask, log_gt):
    """Return the masked, mean-removed RMSE between two log-depth maps.

    Computes ``sqrt(sum(d^2) / N - (sum(d))^2 / N^2)`` where ``d`` is the
    masked difference ``(log_prediction_d - log_gt) * mask`` and ``N`` is
    the sum of ``mask``.

    Args:
        log_prediction_d: predicted log depth.
        mask: weights multiplied into the difference; its sum is N.
        log_gt: ground-truth log depth.

    Returns:
        A scalar tensor holding the loss. It is NaN when ``mask`` sums to
        zero, because the terms are divided by N.
    """
    N = torch.sum(mask)
    log_d_diff = torch.mul(log_prediction_d - log_gt, mask)

    s1 = torch.sum(torch.pow(log_d_diff, 2)) / N
    s2 = torch.pow(torch.sum(log_d_diff), 2) / (N * N)

    # s1 - s2 is a variance and cannot be negative mathematically, but
    # rounding can push it slightly below zero when the difference is
    # nearly constant; clamp so sqrt does not return NaN in that case.
    data_loss = torch.sqrt(torch.clamp(s1 - s2, min=0))

    return data_loss
class BaseModel():
    """Base class for models: stores options and offers checkpoint helpers.

    Most methods are no-op hooks meant to be overridden by subclasses.
    """

    def name(self):
        """Return the model's name."""
        return 'BaseModel'

    def initialize(self, opt):
        """Copy the settings this class needs from ``opt``.

        Args:
            opt: options object providing ``gpu_ids``, ``isTrain``,
                ``checkpoints_dir`` and ``name``.
        """
        self.opt = opt
        self.gpu_ids = opt.gpu_ids
        self.isTrain = opt.isTrain
        # CUDA tensor type when any GPU id is configured, CPU type otherwise.
        self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor
        self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)

    def set_input(self, input):
        """Store ``input`` for later use."""
        self.input = input

    def forward(self):
        """Hook: run the forward pass (no-op here)."""
        pass

    # used in test time, no backprop
    def test(self):
        """Hook: run inference (no-op here)."""
        pass

    def get_image_paths(self):
        """Hook: return the paths of the current images (no-op here)."""
        pass

    def optimize_parameters(self):
        """Hook: run one optimisation step (no-op here)."""
        pass

    def get_current_visuals(self):
        """Return the stored input."""
        return self.input

    def get_current_errors(self):
        """Return the current errors; empty in the base class."""
        return {}

    def save(self, label):
        """Hook: save the model under ``label`` (no-op here)."""
        pass

    # helper saving function that can be used by subclasses
    def save_network(self, network, network_label, epoch_label, gpu_ids):
        """Save ``network``'s state dict into ``self.save_dir``.

        The network is moved to the CPU for saving and moved back to the
        first GPU in ``gpu_ids`` when CUDA is available.

        NOTE(review): the file name written here starts with an
        underscore while load_network builds a name without one --
        confirm whether that difference is intended.
        """
        save_filename = '_%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        torch.save(network.cpu().state_dict(), save_path)
        if len(gpu_ids) and torch.cuda.is_available():
            # Positional device argument: the ``device_id`` keyword is not
            # accepted by current PyTorch releases.
            network.cuda(gpu_ids[0])

    # helper loading function that can be used by subclasses
    def load_network(self, network, network_label, epoch_label):
        """Load and return the object stored for the given labels.

        ``network`` itself is not modified; the caller is expected to
        apply the returned object.
        """
        save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        print(save_path)
        model = torch.load(save_path)
        return model
        # network.load_state_dict(torch.load(save_path))

    def update_learning_rate(self):
        """Hook: update the learning rate (no-op here)."""
        pass
class BaseOptions():
    """Command-line options shared by the train and test configurations.

    Subclasses extend ``initialize`` with their own arguments and set
    ``self.isTrain``, which ``parse`` copies onto the parsed options.
    """

    def __init__(self):
        self.parser = argparse.ArgumentParser()
        self.initialized = False

    def initialize(self):
        """Register the shared arguments on ``self.parser``."""
        # self.parser.add_argument('--dataroot', required=True, help='path to images (should have subfolders trainA, trainB, valA, valB, etc)')
        self.parser.add_argument('--batchSize', type=int, default=1, help='input batch size')
        self.parser.add_argument('--loadSize', type=int, default=286, help='scale images to this size')
        self.parser.add_argument('--fineSize', type=int, default=256, help='then crop to this size')
        self.parser.add_argument('--input_nc', type=int, default=3, help='# of input image channels')
        self.parser.add_argument('--output_nc', type=int, default=3, help='# of output image channels')
        self.parser.add_argument('--ngf', type=int, default=64, help='# of gen filters in first conv layer')
        self.parser.add_argument('--ndf', type=int, default=64, help='# of discrim filters in first conv layer')
        # self.parser.add_argument('--which_model_netD', type=str, default='basic', help='selects model to use for netD')
        self.parser.add_argument('--which_model_netG', type=str, default='unet_256', help='selects model to use for netG')
        # self.parser.add_argument('--n_layers_D', type=int, default=3, help='only used if which_model_netD==n_layers')
        self.parser.add_argument('--gpu_ids', type=str, default='0,1', help='gpu ids: e.g. 0 0,1,2, 0,2')
        self.parser.add_argument('--name', type=str, default='test_local', help='name of the experiment. It decides where to store samples and models')
        # self.parser.add_argument('--align_data', action='store_true',
        #                          help='if True, the datasets are loaded from "test" and "train" directories and the data pairs are aligned')
        self.parser.add_argument('--model', type=str, default='pix2pix', help='chooses which model to use. cycle_gan, one_direction_test, pix2pix, ...')
        # self.parser.add_argument('--which_direction', type=str, default='AtoB', help='AtoB or BtoA')
        self.parser.add_argument('--nThreads', default=2, type=int, help='# threads for loading data')
        self.parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints/', help='models are saved here')
        self.parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization')
        self.parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly')
        self.parser.add_argument('--display_winsize', type=int, default=256, help='display window size')
        self.parser.add_argument('--display_id', type=int, default=1, help='window id of the web display')
        self.parser.add_argument('--identity', type=float, default=0.0, help='use identity mapping. Setting identity other than 1 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set optidentity = 0.1')
        self.parser.add_argument('--use_dropout', action='store_true', help='use dropout for the generator')
        self.parser.add_argument('--max_dataset_size', type=int, default=float("inf"), help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.')
        self.initialized = True

    @staticmethod
    def _parse_gpu_ids(spec):
        """Turn a comma-separated id string into a list of ints >= 0.

        Blank entries (for example from a trailing comma) are skipped and
        negative ids are dropped.
        """
        ids = (int(token) for token in spec.split(',') if token.strip())
        return [gpu_id for gpu_id in ids if gpu_id >= 0]

    def parse(self):
        """Parse the command line, record the options and return them.

        Unknown arguments are ignored. ``gpu_ids`` is converted from its
        string form into a list of ints, and every option is written to
        ``<checkpoints_dir>/<name>/opt.txt``.

        Returns:
            The parsed options namespace (also stored as ``self.opt``).
        """
        if not self.initialized:
            self.initialize()
        self.opt = self.parser.parse_known_args()[0]  # parse_args()
        self.opt.isTrain = self.isTrain  # train or test

        self.opt.gpu_ids = self._parse_gpu_ids(self.opt.gpu_ids)

        args = vars(self.opt)

        # print('------------ Options -------------')
        # for k, v in sorted(args.items()):
        #     print('%s: %s' % (str(k), str(v)))
        # print('-------------- End ----------------')

        # save to the disk
        expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.name)
        util.mkdirs(expr_dir)
        file_name = os.path.join(expr_dir, 'opt.txt')
        with open(file_name, 'wt') as opt_file:
            opt_file.write('------------ Options -------------\n')
            for k, v in sorted(args.items()):
                opt_file.write('%s: %s\n' % (str(k), str(v)))
            opt_file.write('-------------- End ----------------\n')
        return self.opt
set to latest to use latest cached model') self.parser.add_argument('--how_many', type=int, default=50, help='how many test images to run') self.isTrain = False ================================================ FILE: MegaDepth/options/train_options.py ================================================ from .base_options import BaseOptions class TrainOptions(BaseOptions): def initialize(self): BaseOptions.initialize(self) self.parser.add_argument('--display_freq', type=int, default=100, help='frequency of showing training results on screen') self.parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console') self.parser.add_argument('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results') self.parser.add_argument('--save_epoch_freq', type=int, default=5, help='frequency of saving checkpoints at the end of epochs') self.parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') self.parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc') self.parser.add_argument('--which_epoch', type=str, default='latest', help='which epoch to load? 
class LambdaBase(nn.Sequential):
    """Sequential container that also carries a function to apply.

    The child modules are treated as parallel branches: each one receives
    the same input, and subclasses decide how ``lambda_func`` is applied
    to the collected branch outputs.
    """

    def __init__(self, fn, *args):
        super(LambdaBase, self).__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        """Run every child module on ``input`` and collect the results.

        Returns the list of branch outputs, or ``input`` itself when the
        container has no child modules.
        """
        branch_outputs = [branch(input) for branch in self._modules.values()]
        if not branch_outputs:
            return input
        return branch_outputs
class LambdaReduce(LambdaBase):
    """Fold the outputs of all child modules with ``lambda_func``."""

    def forward(self, input):
        """Return ``reduce(lambda_func, ...)`` over the prepared outputs."""
        branch_outputs = self.forward_prepare(input)
        combine = self.lambda_func
        return reduce(combine, branch_outputs)
LambdaMap(lambda x: x, # ConcatTable, nn.Sequential( # Sequential, nn.MaxPool2d((2, 2),(2, 2)), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), nn.Sequential( # Sequential, LambdaMap(lambda x: x, # ConcatTable, nn.Sequential( # Sequential, LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), 
nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), ), nn.Sequential( # Sequential, nn.AvgPool2d((2, 2),(2, 2)), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), 
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), nn.Sequential( # Sequential, LambdaMap(lambda x: x, # ConcatTable, nn.Sequential( # Sequential, LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), 
nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), ), nn.Sequential( # Sequential, nn.AvgPool2d((2, 2),(2, 2)), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), 
nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), nn.UpsamplingNearest2d(scale_factor=2), ), ), LambdaReduce(lambda x,y: x+y), # CAddTable, ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), 
nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), nn.UpsamplingNearest2d(scale_factor=2), ), ), LambdaReduce(lambda x,y: x+y), # CAddTable, ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(256,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), ), nn.UpsamplingNearest2d(scale_factor=2), ), nn.Sequential( # Sequential, LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), 
nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,32,(11, 11),(1, 1),(5, 5)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), ), ), ), LambdaReduce(lambda x,y: x+y), # CAddTable, ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,32,(5, 5),(1, 1),(2, 2)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # 
Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), ), ), LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,16,(1, 1)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,16,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,16,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,32,(1, 1)), nn.BatchNorm2d(32,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(32,16,(11, 11),(1, 1),(5, 5)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), ), nn.UpsamplingNearest2d(scale_factor=2), ), nn.Sequential( # Sequential, LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat, nn.Sequential( # Sequential, nn.Conv2d(128,16,(1, 1)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,16,(3, 3),(1, 1),(1, 1)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,16,(7, 7),(1, 1),(3, 3)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), nn.Sequential( # Sequential, nn.Conv2d(128,64,(1, 1)), nn.BatchNorm2d(64,1e-05,0.1,False), nn.ReLU(), nn.Conv2d(64,16,(11, 11),(1, 1),(5, 5)), nn.BatchNorm2d(16,1e-05,0.1,False), nn.ReLU(), ), ), ), ), LambdaReduce(lambda x,y: x+y), # CAddTable, ), nn.Conv2d(64,1,(3, 3),(1, 1),(1, 1)), ) ================================================ FILE: MegaDepth/rmse_error_main.py ================================================ import time import torch import 
def test(model):
    """Run the scale-invariant RMSE evaluation over both test sets.

    The model is switched to eval mode, then every batch of the landscape
    loader (``test_dataset_l``) followed by every batch of the portrait
    loader (``test_dataset_p``) is passed to ``model.evaluate_sc_inv``.
    The loss and count it returns are accumulated across *both* sets, and
    the running ratio is printed as evaluation proceeds.

    Args:
        model: object exposing ``switch_to_eval()`` and
            ``evaluate_sc_inv(images, targets) -> (loss, count)``.
    """
    print("============================= TEST ============================")
    model.switch_to_eval()

    loss_sum = 0
    count_sum = 0

    # Landscape first, then portrait; the accumulators are deliberately not
    # reset in between, so the final figure covers both sets.
    for dataset in (test_dataset_l, test_dataset_p):
        for batch in dataset:
            batch_loss, batch_count = model.evaluate_sc_inv(batch['img_1'], batch['target_1'])
            loss_sum += batch_loss
            count_sum += batch_count
            # NOTE(review): the running value is printed once per batch; the
            # flattened source does not preserve indentation, so confirm this
            # print belongs inside the loop.
            print('RMSE loss is', loss_sum/float(count_sum))

    print('average RMSE loss is', loss_sum/float(count_sum))
class HTML:
    """Small helper that assembles a static image-gallery page with ``dominate``.

    The page is rendered to ``<web_dir>/index.html``; image links and sources
    are written relative to an ``images`` sub-directory of ``web_dir``.
    """

    def __init__(self, web_dir, title, reflesh=0):
        """Create the output directories and an empty document.

        Args:
            web_dir: directory that receives ``index.html``.
            title: title of the generated document.
            reflesh: when greater than 0, a ``<meta http-equiv="refresh">``
                tag with this value as its content is added to the head.
                The keyword keeps its historical misspelling because
                callers pass it by name.
        """
        self.title = title
        self.web_dir = web_dir
        self.img_dir = os.path.join(self.web_dir, 'images')
        if not os.path.exists(self.web_dir):
            os.makedirs(self.web_dir)
        if not os.path.exists(self.img_dir):
            os.makedirs(self.img_dir)
        # print(self.img_dir)

        self.doc = dominate.document(title=title)
        if reflesh > 0:
            with self.doc.head:
                # The attribute value must be spelled "refresh"; the former
                # "reflesh" is not a recognised http-equiv keyword, so the
                # page never reloaded.
                meta(http_equiv="refresh", content=str(reflesh))

    def get_image_dir(self):
        """Return the directory in which images for this page are stored."""
        return self.img_dir

    def add_header(self, str):
        """Append an ``<h3>`` heading containing ``str`` to the document."""
        with self.doc:
            h3(str)

    def add_table(self, border=1):
        """Append a new fixed-layout table to the document and keep it in ``self.t``."""
        self.t = table(border=border, style="table-layout: fixed;")
        self.doc.add(self.t)

    def add_images(self, ims, txts, links, width=400):
        """Add one table row of captioned, linked images.

        Args:
            ims: image file names (relative to the ``images`` directory).
            txts: caption text for each image.
            links: link targets (relative to the ``images`` directory).
            width: display width of each image in pixels.

        The three sequences are zipped, so the shortest one decides how many
        cells are produced.
        """
        self.add_table()
        with self.t:
            with tr():
                for im, txt, link in zip(ims, txts, links):
                    with td(style="word-wrap: break-word;", halign="center", valign="top"):
                        with p():
                            with a(href=os.path.join('images', link)):
                                img(style="width:%dpx" % width, src=os.path.join('images', im))
                            br()
                            p(txt)

    def save(self):
        """Render the document and write it to ``<web_dir>/index.html``."""
        html_file = '%s/index.html' % self.web_dir
        # The context manager closes the file even if rendering/writing fails.
        with open(html_file, 'wt') as f:
            f.write(self.doc.render())
def encode(buf, width, height):
    """Serialise a raw RGB buffer as an 8-bit RGB PNG and return its bytes.

    buf: must be bytes or a bytearray in py3, a regular string in py2,
    formatted RGBRGB... with exactly ``width * height * 3`` elements.
    Rows are emitted in reverse vertical order (the last row of ``buf``
    becomes the first scanline of the PNG).
    """
    assert (width * height * 3 == len(buf))
    bytes_per_pixel = 3
    stride = width * bytes_per_pixel

    def make_chunk(tag, payload):
        # PNG chunk layout: big-endian length, tag, payload, CRC over tag+payload.
        crc = zlib.crc32(payload, zlib.crc32(tag)) & 0xFFFFFFFF
        return b''.join((struct.pack("!I", len(payload)), tag, payload, struct.pack("!I", crc)))

    # Walk the rows from the bottom of the buffer upwards, prefixing each
    # scanline with a null byte.
    scanlines = []
    for offset in range((height - 1) * stride, -1, -stride):
        scanlines.append(b'\x00')
        scanlines.append(buf[offset:offset + stride])
    pixel_stream = zlib.compress(b''.join(scanlines), 9)

    # Header fields: width, height, bit depth 8, colour type 2 (RGB), then
    # three zero bytes.
    header = struct.pack("!2I5B", width, height, 8, 2, 0, 0, 0)

    return b''.join((
        b'\x89PNG\r\n\x1a\n',
        make_chunk(b'IHDR', header),
        make_chunk(b'IDAT', pixel_stream),
        make_chunk(b'IEND', b''),
    ))
def info(object, spacing=10, collapse=1):
    """Print methods and doc strings.

    Takes module, class, list, dictionary, or string.

    Args:
        object: anything ``dir()`` accepts; each callable attribute is listed.
        spacing: column width the attribute name is left-justified to.
        collapse: when truthy, runs of whitespace in each docstring are
            squeezed to single spaces so every entry fits on one line.
    """
    def squeeze(text):
        return " ".join(text.split())

    def keep(text):
        return text

    process = squeeze if collapse else keep

    # The builtin callable() replaces collections.Callable, which no longer
    # exists in the collections namespace on current Python versions.
    method_names = [name for name in dir(object) if callable(getattr(object, name))]

    print("\n".join(
        "%s %s" % (name.ljust(spacing), process(str(getattr(object, name).__doc__)))
        for name in method_names
    ))
class Visualizer():
    """Shows and stores intermediate results via visdom and/or an HTML page.

    Behaviour is driven by the options object handed to the constructor:
    ``display_id > 0`` enables visdom output, and ``isTrain and not no_html``
    enables writing images plus an index page under
    ``<checkpoints_dir>/<name>/web``.
    """

    def __init__(self, opt):
        # self.opt = opt
        self.display_id = opt.display_id
        self.use_html = opt.isTrain and not opt.no_html
        self.win_size = opt.display_winsize
        self.name = opt.name

        if self.display_id > 0:
            # Imported lazily so visdom is only required when it is used.
            import visdom
            self.vis = visdom.Visdom()

        if self.use_html:
            self.web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web')
            self.img_dir = os.path.join(self.web_dir, 'images')
            print('create web directory %s...' % self.web_dir)
            util.mkdirs([self.web_dir, self.img_dir])

    def display_current_results(self, visuals, epoch):
        """Display and/or save the images in ``visuals`` for ``epoch``.

        |visuals|: dictionary of images to display or save (label -> array).
        """
        if self.display_id > 0:
            # show images in the browser, one visdom window per label
            for offset, (label, image_numpy) in enumerate(visuals.items(), 1):
                #image_numpy = np.flipud(image_numpy)
                self.vis.image(image_numpy.transpose([2, 0, 1]),
                               opts=dict(title=label),
                               win=self.display_id + offset)

        if not self.use_html:
            return

        # save images to a html file
        for label, image_numpy in visuals.items():
            target = os.path.join(self.img_dir, 'epoch%.3d_%s.png' % (epoch, label))
            util.save_image(image_numpy, target)

        # update website: newest epoch first, one header + image row per epoch
        webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, reflesh=1)
        labels = list(visuals.keys())
        for n in range(epoch, 0, -1):
            webpage.add_header('epoch [%d]' % n)
            file_names = ['epoch%.3d_%s.png' % (n, label) for label in labels]
            webpage.add_images(file_names, list(labels), list(file_names), width=self.win_size)
        webpage.save()

    def plot_current_errors(self, epoch, counter_ratio, opt, errors):
        """Append one point per error to the loss plot and redraw it in visdom.

        errors: dictionary of error labels and values. The legend is fixed by
        the keys of the first call; ``opt`` is accepted but not used here.
        """
        if not hasattr(self, 'plot_data'):
            self.plot_data = {'X': [], 'Y': [], 'legend': list(errors.keys())}

        series = self.plot_data
        legend = series['legend']
        series['X'].append(epoch + counter_ratio)
        series['Y'].append([errors[key] for key in legend])

        x_axis = np.array(series['X'])
        plot_opts = {
            'title': self.name + ' loss over time',
            'legend': legend,
            'xlabel': 'epoch',
            'ylabel': 'loss'}
        self.vis.line(
            X=np.stack([x_axis] * len(legend), 1),
            Y=np.array(series['Y']),
            opts=plot_opts,
            win=self.display_id)

    def print_current_errors(self, epoch, i, errors, t):
        """Print one status line; ``errors`` has the same format as in plot_current_errors."""
        header = '(epoch: %d, iters: %d, time: %.3f) ' % (epoch, i, t)
        body = ''.join('%s: %.3f ' % (label, value) for label, value in errors.items())
        print(header + body)

    def save_images(self, webpage, visuals, image_path):
        """Save every image in ``visuals`` to disk and add them to ``webpage``.

        File names are ``<stem>_<label>.png`` where the stem is the base name
        of ``image_path[0]`` without its extension.
        """
        image_dir = webpage.get_image_dir()
        name = os.path.splitext(ntpath.basename(image_path[0]))[0]
        webpage.add_header(name)

        ims, txts, links = [], [], []
        for label, image_numpy in visuals.items():
            image_name = '%s_%s.png' % (name, label)
            util.save_image(image_numpy, os.path.join(image_dir, image_name))
            ims.append(image_name)
            txts.append(label)
            links.append(image_name)

        webpage.add_images(ims, txts, links, width=self.win_size)
class PWCDCNet(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    The network builds a six-level feature pyramid for each of the two input
    images, then estimates flow coarse-to-fine from level 6 down to level 2:
    at every level the second image's features are warped with the upsampled
    flow, correlated with the first image's features, and fed through a
    densely connected stack of convolutions. A dilated-convolution context
    network refines the level-2 flow.
    """

    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warping

        NOTE(review): the sampling grid below is allocated with ``.cuda()``,
        so constructing this module requires a CUDA device.
        """
        super(PWCDCNet, self).__init__()

        # Feature pyramid: each level halves the resolution (stride 2) and is
        # followed by two stride-1 convolutions.
        self.conv1a = conv(3, 16, kernel_size=3, stride=2)
        self.conv1aa = conv(16, 16, kernel_size=3, stride=1)
        self.conv1b = conv(16, 16, kernel_size=3, stride=1)
        self.conv2a = conv(16, 32, kernel_size=3, stride=2)
        self.conv2aa = conv(32, 32, kernel_size=3, stride=1)
        self.conv2b = conv(32, 32, kernel_size=3, stride=1)
        self.conv3a = conv(32, 64, kernel_size=3, stride=2)
        self.conv3aa = conv(64, 64, kernel_size=3, stride=1)
        self.conv3b = conv(64, 64, kernel_size=3, stride=1)
        self.conv4a = conv(64, 96, kernel_size=3, stride=2)
        self.conv4aa = conv(96, 96, kernel_size=3, stride=1)
        self.conv4b = conv(96, 96, kernel_size=3, stride=1)
        self.conv5a = conv(96, 128, kernel_size=3, stride=2)
        self.conv5aa = conv(128, 128, kernel_size=3, stride=1)
        self.conv5b = conv(128, 128, kernel_size=3, stride=1)
        self.conv6aa = conv(128, 196, kernel_size=3, stride=2)
        self.conv6a = conv(196, 196, kernel_size=3, stride=1)
        self.conv6b = conv(196, 196, kernel_size=3, stride=1)

        self.corr = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # Number of correlation channels: one per displacement in a
        # (2*md+1) x (2*md+1) window.
        nd = (2*md+1)**2
        # Cumulative channel growth of the dense blocks, as plain Python ints.
        # (The former ``.astype(np.int)`` relied on an alias that newer NumPy
        # releases no longer provide and was redundant with this conversion.)
        dd = [int(d) for d in np.cumsum([128, 128, 96, 64, 32], dtype=np.int32)]

        # Level 6 estimator: input is the correlation volume only.
        od = nd
        self.conv6_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0], 128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1], 96, kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2], 64, kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # Levels 5..2: correlation + pyramid features of image 1 + upsampled
        # flow (2 channels) + upsampled features (2 channels).
        od = nd+128+4
        self.conv5_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0], 128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1], 96, kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2], 64, kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0], 128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1], 96, kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2], 64, kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0], 128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1], 96, kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2], 64, kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0], 128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1], 96, kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2], 64, kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        # Not used in forward(); kept so the parameter set (and therefore
        # saved checkpoints) stays unchanged.
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # Context network: dilated convolutions refining the level-2 flow.
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1, dilation=1)
        self.dc_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2)
        self.dc_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4)
        self.dc_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8)
        self.dc_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1)
        self.dc_conv7 = predict_flow(32)

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()

        # Pre-compute a pixel-coordinate grid once, for saving time on
        # allocating a grid in forward. warp() slices it down to the actual
        # batch/height/width and asserts the input fits these maxima.
        W_MAX = 2048
        H_MAX = 1024
        B_MAX = 3
        xx = torch.arange(0, W_MAX).view(1, -1).cuda().repeat(H_MAX, 1)
        yy = torch.arange(0, H_MAX).view(-1, 1).cuda().repeat(1, W_MAX)
        xx = xx.view(1, 1, H_MAX, W_MAX).repeat(B_MAX, 1, 1, 1)
        yy = yy.view(1, 1, H_MAX, W_MAX).repeat(B_MAX, 1, 1, 1)
        grid = torch.cat((xx, yy), 1).float()
        self.W_MAX = W_MAX
        self.H_MAX = H_MAX
        self.B_MAX = B_MAX
        self.grid = Variable(grid, requires_grad=False)

    @staticmethod
    def _dense(x, layers):
        """Apply ``layers`` in order, concatenating each output in front of its input."""
        for layer in layers:
            x = torch.cat((layer(x), x), 1)
        return x

    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        Returns the warped tensor multiplied by a binary validity mask, so
        positions whose sample falls (partly) outside ``x`` are zeroed.
        Raises AssertionError when B/H/W exceed the pre-allocated grid.
        """
        B, C, H, W = x.size()
        assert(B <= self.B_MAX and H <= self.H_MAX and W <= self.W_MAX)
        vgrid = self.grid[:B, :, :H, :W] + flo

        # scale grid to [-1,1]
        # NOTE(review): this normalisation divides by (size - 1); confirm it
        # matches the grid_sample corner convention of the torch version in use.
        vgrid[:, 0, :, :] = 2.0*vgrid[:, 0, :, :].clone()/max(W-1, 1)-1.0
        vgrid[:, 1, :, :] = 2.0*vgrid[:, 1, :, :].clone()/max(H-1, 1)-1.0

        vgrid = vgrid.permute(0, 2, 3, 1)
        output = nn.functional.grid_sample(x, vgrid)

        # Warp an all-ones tensor the same way; anything that did not sample
        # (almost) exactly 1 touched the border and is masked out.
        mask = torch.autograd.Variable(torch.cuda.FloatTensor().resize_(x.size()).zero_() + 1, requires_grad=False)
        mask = nn.functional.grid_sample(mask, vgrid)
        mask[mask < 0.9999] = 0
        mask[mask > 0] = 1

        return output*mask

    def forward(self, x, output_more=False):
        """Estimate optical flow for an image pair stacked along the channel axis.

        Args:
            x: tensor whose first 3 channels are image 1 and whose remaining
                channels are image 2.
            output_more: when true, return the flows of all levels.

        Returns:
            ``flow2`` alone, or ``[flow2, flow3, flow4, flow5, flow6]`` when
            ``output_more`` is true.
        """
        im1 = x[:, :3, :, :]
        im2 = x[:, 3:, :, :]

        # Feature pyramids for both images (shared weights).
        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))
        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))
        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))
        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))
        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))
        c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))
        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))
        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))
        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))
        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))
        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))
        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))

        # Level 6: no flow yet, correlate the raw features.
        corr6 = self.leakyRELU(self.corr(c16, c26))
        x = self._dense(corr6, (self.conv6_0, self.conv6_1, self.conv6_2, self.conv6_3, self.conv6_4))
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        # NOTE(review): the per-level factors (0.625, 1.25, 2.5, 5.0) presumably
        # convert the network's flow units to pixels at each pyramid level —
        # confirm against the training setup.
        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.leakyRELU(self.corr(c15, warp5))
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = self._dense(x, (self.conv5_0, self.conv5_1, self.conv5_2, self.conv5_3, self.conv5_4))
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.leakyRELU(self.corr(c14, warp4))
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = self._dense(x, (self.conv4_0, self.conv4_1, self.conv4_2, self.conv4_3, self.conv4_4))
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.leakyRELU(self.corr(c13, warp3))
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = self._dense(x, (self.conv3_0, self.conv3_1, self.conv3_2, self.conv3_3, self.conv3_4))
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.leakyRELU(self.corr(c12, warp2))
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = self._dense(x, (self.conv2_0, self.conv2_1, self.conv2_2, self.conv2_3, self.conv2_4))
        flow2 = self.predict_flow2(x)

        # Context network: residual refinement of the finest flow.
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        # we don't have the gt for flow, we just fine tune it on flownets
        if not output_more:
            return flow2
        else:
            return [flow2, flow3, flow4, flow5, flow6]
predict_flow(od+dd[4]) self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) od = nd+96+4 self.conv4_0 = conv(od, 128, kernel_size=3, stride=1) self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1) self.conv4_2 = conv(od+dd[1],96, kernel_size=3, stride=1) self.conv4_3 = conv(od+dd[2],64, kernel_size=3, stride=1) self.conv4_4 = conv(od+dd[3],32, kernel_size=3, stride=1) self.predict_flow4 = predict_flow(od+dd[4]) self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) od = nd+64+4 self.conv3_0 = conv(od, 128, kernel_size=3, stride=1) self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1) self.conv3_2 = conv(od+dd[1],96, kernel_size=3, stride=1) self.conv3_3 = conv(od+dd[2],64, kernel_size=3, stride=1) self.conv3_4 = conv(od+dd[3],32, kernel_size=3, stride=1) self.predict_flow3 = predict_flow(od+dd[4]) self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) od = nd+32+4 self.conv2_0 = conv(od, 128, kernel_size=3, stride=1) self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1) self.conv2_2 = conv(od+dd[1],96, kernel_size=3, stride=1) self.conv2_3 = conv(od+dd[2],64, kernel_size=3, stride=1) self.conv2_4 = conv(od+dd[3],32, kernel_size=3, stride=1) self.predict_flow2 = predict_flow(od+dd[4]) self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8) self.dc_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) 
self.dc_conv7 = predict_flow(32) for m in self.modules(): if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): nn.init.kaiming_normal(m.weight.data, mode='fan_in') if m.bias is not None: m.bias.data.zero_() def warp(self, x, flo): """ warp an image/tensor (im2) back to im1, according to the optical flow x: [B, C, H, W] (im2) flo: [B, 2, H, W] flow """ B, C, H, W = x.size() # mesh grid xx = torch.arange(0, W).view(1,-1).repeat(H,1) yy = torch.arange(0, H).view(-1,1).repeat(1,W) xx = xx.view(1,1,H,W).repeat(B,1,1,1) yy = yy.view(1,1,H,W).repeat(B,1,1,1) grid = torch.cat((xx,yy),1).float() if x.is_cuda: grid = grid.cuda() vgrid = Variable(grid) + flo # scale grid to [-1,1] vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0 vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0 vgrid = vgrid.permute(0,2,3,1) output = nn.functional.grid_sample(x, vgrid) mask = torch.autograd.Variable(torch.ones(x.size())).cuda() mask = nn.functional.grid_sample(mask, vgrid) mask[mask<0.999] = 0 mask[mask>0] = 1 return output*mask def forward(self,x): im1 = x[:,:3,:,:] im2 = x[:,3:,:,:] c11 = self.conv1b(self.conv1a(im1)) c21 = self.conv1b(self.conv1a(im2)) c12 = self.conv2b(self.conv2a(c11)) c22 = self.conv2b(self.conv2a(c21)) c13 = self.conv3b(self.conv3a(c12)) c23 = self.conv3b(self.conv3a(c22)) c14 = self.conv4b(self.conv4a(c13)) c24 = self.conv4b(self.conv4a(c23)) c15 = self.conv5b(self.conv5a(c14)) c25 = self.conv5b(self.conv5a(c24)) c16 = self.conv6b(self.conv6a(c15)) c26 = self.conv6b(self.conv6a(c25)) corr6 = self.corr(c16, c26) corr6 = self.leakyRELU(corr6) x = torch.cat((corr6, self.conv6_0(corr6)),1) x = torch.cat((self.conv6_1(x), x),1) x = torch.cat((x, self.conv6_2(x)),1) x = torch.cat((x, self.conv6_3(x)),1) x = torch.cat((x, self.conv6_4(x)),1) flow6 = self.predict_flow6(x) up_flow6 = self.deconv6(flow6) up_feat6 = self.upfeat6(x) warp5 = self.warp(c25, up_flow6*0.625) corr5 = self.corr(c15, warp5) corr5 = self.leakyRELU(corr5) x = torch.cat((corr5, c15, 
def pwc_dc_net(path=None):
    """Build a ``PWCDCNet`` and optionally load weights from ``path``.

    The checkpoint may either be a bare state dict or a mapping that stores
    the state dict under the ``'state_dict'`` key.

    NOTE(review): ``torch.load`` unpickles the file — only load checkpoints
    from trusted sources.
    """
    model = PWCDCNet()
    if path is None:
        return model

    data = torch.load(path)
    weights = data['state_dict'] if 'state_dict' in data.keys() else data
    model.load_state_dict(weights)
    return model
class CorrelationFunction(Function):
    """Instance-style autograd function around the ``correlation_cuda`` extension.

    The correlation hyper-parameters are kept on the instance and handed
    unchanged to the extension's ``forward`` and ``backward`` entry points.
    """

    def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        super(CorrelationFunction, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        """Return the correlation volume of ``input1`` and ``input2``."""
        self.save_for_backward(input1, input2)

        with torch.cuda.device_of(input1):
            # Empty placeholders: the extension resizes and fills the two
            # rearranged inputs and the result tensor itself.
            rearranged1, rearranged2 = input1.new(), input2.new()
            result = input1.new()

            correlation_cuda.forward(
                input1, input2, rearranged1, rearranged2, result,
                self.pad_size, self.kernel_size, self.max_displacement,
                self.stride1, self.stride2, self.corr_multiply)

        return result

    def backward(self, grad_output):
        """Return the gradients with respect to ``input1`` and ``input2``."""
        input1, input2 = self.saved_tensors

        with torch.cuda.device_of(input1):
            # Scratch and gradient tensors are allocated empty and sized by
            # the extension.
            rearranged1, rearranged2 = input1.new(), input2.new()
            grad_input1, grad_input2 = input1.new(), input2.new()

            correlation_cuda.backward(
                input1, input2, rearranged1, rearranged2, grad_output,
                grad_input1, grad_input2,
                self.pad_size, self.kernel_size, self.max_displacement,
                self.stride1, self.stride2, self.corr_multiply)

        return grad_input1, grad_input2


class Correlation(Module):
    """``nn.Module`` wrapper that applies ``CorrelationFunction`` to two inputs."""

    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):
        super(Correlation, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        # A new function object is built on every call, carrying this
        # module's hyper-parameters.
        correlate = CorrelationFunction(
            self.pad_size, self.kernel_size, self.max_displacement,
            self.stride1, self.stride2, self.corr_multiply)
        return correlate(input1, input2)
nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1); int outputHeight = ceil(static_cast(paddedInputHeight - 2 * border_radius) / static_cast(stride1)); int outputwidth = ceil(static_cast(paddedInputWidth - 2 * border_radius) / static_cast(stride1)); rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels}); rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels}); output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth}); rInput1.fill_(0); rInput2.fill_(0); output.fill_(0); int success = correlation_forward_cuda_kernel( output, output.size(0), output.size(1), output.size(2), output.size(3), output.stride(0), output.stride(1), output.stride(2), output.stride(3), input1, input1.size(1), input1.size(2), input1.size(3), input1.stride(0), input1.stride(1), input1.stride(2), input1.stride(3), input2, input2.size(1), input2.stride(0), input2.stride(1), input2.stride(2), input2.stride(3), rInput1, rInput2, pad_size, kernel_size, max_displacement, stride1, stride2, corr_type_multiply, // at::globalContext().getCurrentCUDAStream() //works for 0.4.1 at::cuda::getCurrentCUDAStream() //works for 1.0.0 ); //check for errors if (!success) { AT_ERROR("CUDA call failed"); } return 1; } int correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput, at::Tensor& gradInput1, at::Tensor& gradInput2, int pad_size, int kernel_size, int max_displacement, int stride1, int stride2, int corr_type_multiply) { int batchSize = input1.size(0); int nInputChannels = input1.size(1); int paddedInputHeight = input1.size(2)+ 2 * pad_size; int paddedInputWidth = input1.size(3)+ 2 * pad_size; int height = input1.size(2); int width = input1.size(3); rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels}); rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels}); 
gradInput1.resize_({batchSize, nInputChannels, height, width}); gradInput2.resize_({batchSize, nInputChannels, height, width}); rInput1.fill_(0); rInput2.fill_(0); gradInput1.fill_(0); gradInput2.fill_(0); int success = correlation_backward_cuda_kernel(gradOutput, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2), gradOutput.size(3), gradOutput.stride(0), gradOutput.stride(1), gradOutput.stride(2), gradOutput.stride(3), input1, input1.size(1), input1.size(2), input1.size(3), input1.stride(0), input1.stride(1), input1.stride(2), input1.stride(3), input2, input2.stride(0), input2.stride(1), input2.stride(2), input2.stride(3), gradInput1, gradInput1.stride(0), gradInput1.stride(1), gradInput1.stride(2), gradInput1.stride(3), gradInput2, gradInput2.size(1), gradInput2.stride(0), gradInput2.stride(1), gradInput2.stride(2), gradInput2.stride(3), rInput1, rInput2, pad_size, kernel_size, max_displacement, stride1, stride2, corr_type_multiply, // at::globalContext().getCurrentCUDAStream() //works for 0.4.1 at::cuda::getCurrentCUDAStream() //works for 1.0.0 ); if (!success) { AT_ERROR("CUDA call failed"); } return 1; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &correlation_forward_cuda, "Correlation forward (CUDA)"); m.def("backward", &correlation_backward_cuda, "Correlation backward (CUDA)"); } ================================================ FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cu ================================================ #include #include "correlation_cuda_kernel.cuh" #define CUDA_NUM_THREADS 1024 #define THREADS_PER_BLOCK 32 #define FULL_MASK 0xffffffff #include #include #include #include using at::Half; template __forceinline__ __device__ scalar_t warpReduceSum(scalar_t val) { for (int offset = 16; offset > 0; offset /= 2) val += __shfl_down_sync(FULL_MASK, val, offset); return val; } template __forceinline__ __device__ scalar_t blockReduceSum(scalar_t val) { static __shared__ scalar_t shared[32]; int lane 
// Forward correlation kernel.
//
// Launch layout (as read from the index arithmetic below):
//   blockIdx.x -> batch item, blockIdx.y -> output row, blockIdx.z -> output
//   column; the threads of a block stride over the input channels.
//
// rInput1 / rInput2 are indexed as (batch, paddedHeight, paddedWidth,
// channels); output is indexed as (batch, outChannel, outputHeight,
// outputWidth).  For every displacement (tj, ti) the block accumulates the
// channel-wise product of the kernel_size x kernel_size patch around
// (y1, x1) in rInput1 and the patch around the displaced position in
// rInput2, reduces the partial sums across threads, and thread 0 stores the
// sum divided by kernel_size * kernel_size * nInputChannels.
//
// NOTE(review): there are no bounds checks; the kernel presumably relies on
// pad_size being large enough that every displaced patch stays inside the
// padded tensors -- confirm against the callers.
template <typename scalar_t>
__global__ void correlation_forward(scalar_t* __restrict__ output,
                                    const int nOutputChannels,
                                    const int outputHeight,
                                    const int outputWidth,
                                    const scalar_t* __restrict__ rInput1,
                                    const int nInputChannels,
                                    const int inputHeight,
                                    const int inputWidth,
                                    const scalar_t* __restrict__ rInput2,
                                    const int pad_size,
                                    const int kernel_size,
                                    const int max_displacement,
                                    const int stride1,
                                    const int stride2)
{
  int32_t pInputWidth = inputWidth + 2 * pad_size;
  int32_t pInputHeight = inputHeight + 2 * pad_size;

  int32_t kernel_rad = (kernel_size - 1) / 2;

  int32_t displacement_rad = max_displacement / stride2;
  int32_t displacement_size = 2 * displacement_rad + 1;

  // Position of this block's reference pixel in the padded frame.
  int32_t n = blockIdx.x;
  int32_t y1 = blockIdx.y * stride1 + max_displacement;
  int32_t x1 = blockIdx.z * stride1 + max_displacement;
  int32_t c = threadIdx.x;

  // Strides of the padded, channel-last inputs.
  int32_t pdimyxc = pInputHeight * pInputWidth * nInputChannels;
  int32_t pdimxc = pInputWidth * nInputChannels;
  int32_t pdimc = nInputChannels;

  // Strides of the output volume.
  int32_t tdimcyx = nOutputChannels * outputHeight * outputWidth;
  int32_t tdimyx = outputHeight * outputWidth;
  int32_t tdimx = outputWidth;

  // Normalisation factor: number of products summed per output value.
  int32_t nelems = kernel_size * kernel_size * pdimc;

  // element-wise product along channel axis
  for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
    for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
      int x2 = x1 + ti * stride2;
      int y2 = y1 + tj * stride2;

      // Partial sums are kept in float regardless of scalar_t.
      float acc0 = 0.0f;

      for (int j = -kernel_rad; j <= kernel_rad; ++j) {
        for (int i = -kernel_rad; i <= kernel_rad; ++i) {
          // THREADS_PER_BLOCK
          #pragma unroll
          for (int ch = c; ch < pdimc; ch += blockDim.x) {
            int indx1 = n * pdimyxc + (y1 + j) * pdimxc + (x1 + i) * pdimc + ch;
            int indx2 = n * pdimyxc + (y2 + j) * pdimxc + (x2 + i) * pdimc + ch;
            acc0 += static_cast<float>(rInput1[indx1] * rInput2[indx2]);
          }
        }
      }

      // A single-warp block only needs a warp shuffle reduction; larger
      // blocks go through the shared-memory block reduction.
      if (blockDim.x == warpSize) {
        __syncwarp();
        acc0 = warpReduceSum(acc0);
      } else {
        __syncthreads();
        acc0 = blockReduceSum(acc0);
      }

      if (threadIdx.x == 0) {
        // Output channel index enumerates displacements row-major.
        int tc = (tj + displacement_rad) * displacement_size + (ti + displacement_rad);
        const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx + blockIdx.z;
        output[tindx] = static_cast<scalar_t>(acc0 / nelems);
      }
    }
  }
}
// Backward kernel for the gradient with respect to the second input.
//
// Launch layout (as read from the index arithmetic below):
//   blockIdx.x -> input row, blockIdx.y -> input column, blockIdx.z ->
//   input channel; `item` selects the batch element.  Each thread strides
//   over the output (displacement) channels in steps of THREADS_PER_BLOCK
//   and accumulates into its own slot of a THREADS_PER_BLOCK-sized shared
//   array, so the kernel must be launched with THREADS_PER_BLOCK threads.
//
// rInput1 is indexed as (batch, paddedHeight, paddedWidth, channels);
// gradOutput as (batch, outChannel, outputHeight, outputWidth); gradInput2
// as (batch, channel, inputHeight, inputWidth).
//
// gradInput2 must be pre-allocated and zero filled: displacements whose
// window falls outside the output are skipped without writing anything for
// them.
//
// NOTE(review): the write index uses (y - pad_size) = blockIdx.x * stride1,
// which only stays inside the input for stride1 == 1 with the launch grid
// used in this file -- confirm before using other strides.
template <typename scalar_t>
__global__ void correlation_backward_input2(int item,
                                            scalar_t* gradInput2,
                                            int nInputChannels,
                                            int inputHeight,
                                            int inputWidth,
                                            const scalar_t* __restrict__ gradOutput,
                                            int nOutputChannels,
                                            int outputHeight,
                                            int outputWidth,
                                            const scalar_t* __restrict__ rInput1,
                                            int pad_size,
                                            int kernel_size,
                                            int max_displacement,
                                            int stride1,
                                            int stride2)
{
  // n (batch size), c (num of channels), y (height), x (width)
  int n = item;
  int y = blockIdx.x * stride1 + pad_size;
  int x = blockIdx.y * stride1 + pad_size;
  int c = blockIdx.z;

  int tch_off = threadIdx.x;

  int kernel_rad = (kernel_size - 1) / 2;
  int displacement_rad = max_displacement / stride2;
  int displacement_size = 2 * displacement_rad + 1;

  // Strides of the padded, channel-last input copy.
  int pInputWidth = inputWidth + 2 * pad_size;
  int pInputHeight = inputHeight + 2 * pad_size;

  int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
  int pdimxc = pInputWidth * nInputChannels;
  int pdimc = nInputChannels;

  // Strides of gradOutput.
  int tdimcyx = nOutputChannels * outputHeight * outputWidth;
  int tdimyx = outputHeight * outputWidth;
  int tdimx = outputWidth;

  // Strides of gradInput2.
  int odimcyx = nInputChannels * inputHeight* inputWidth;
  int odimyx = inputHeight * inputWidth;
  int odimx = inputWidth;

  // Same normalisation factor the forward pass divides by.
  scalar_t nelems = kernel_size * kernel_size * nInputChannels;

  __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
  prod_sum[tch_off] = 0;

  for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
    // Displacement encoded by output channel tc.
    int i2 = (tc % displacement_size - displacement_rad) * stride2;
    int j2 = (tc / displacement_size - displacement_rad) * stride2;

    // Range of output positions whose window touched (y, x) for this
    // displacement.
    int xmin = (x - kernel_rad - max_displacement - i2) / stride1;
    int ymin = (y - kernel_rad - max_displacement - j2) / stride1;

    int xmax = (x + kernel_rad - max_displacement - i2) / stride1;
    int ymax = (y + kernel_rad - max_displacement - j2) / stride1;

    if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
      // assumes gradInput2 is pre-allocated and zero filled
      continue;
    }

    if (xmin > xmax || ymin > ymax) {
      // assumes gradInput2 is pre-allocated and zero filled
      continue;
    }

    xmin = max(0,xmin);
    xmax = min(outputWidth-1,xmax);

    ymin = max(0,ymin);
    ymax = min(outputHeight-1,ymax);

    // The first-input value that was multiplied with this pixel.
    int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;
    scalar_t val1 = rInput1[indx1];

    for (int j = ymin; j <= ymax; ++j) {
      for (int i = xmin; i <= xmax; ++i) {
        int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
        prod_sum[tch_off] += gradOutput[tindx] * val1;
      }
    }
  }

  // All partial sums must be visible before thread 0 folds them.
  __syncthreads();

  if(tch_off == 0) {
    scalar_t reduce_sum = 0;
    for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
      reduce_sum += prod_sum[idx];
    }
    const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
    gradInput2[indx2] = reduce_sum / nelems;
  }
}
// Launches the CUDA kernels of the correlation backward pass on `stream`.
//
// Steps:
//   1. Rearrange input1 / input2 into the padded scratch tensors rInput1 /
//      rInput2 with channels_first.
//   2. For every batch item, launch correlation_backward_input1 (reads
//      rInput2, writes gradInput1).
//   3. For every batch item, launch correlation_backward_input2 (reads
//      rInput1, writes gradInput2).
//
// Only gob, goc, goh, gow (gradOutput sizes) and ic, ih, iw (input sizes)
// are read from the integer arguments; the stride arguments and
// corr_type_multiply are accepted for interface compatibility but unused.
//
// Returns 1 on success, or 0 after printing the CUDA error string when
// cudaGetLastError reports a failure.
int correlation_backward_cuda_kernel(
                                    at::Tensor& gradOutput,
                                    int gob,
                                    int goc,
                                    int goh,
                                    int gow,
                                    int gosb,
                                    int gosc,
                                    int gosh,
                                    int gosw,

                                    at::Tensor& input1,
                                    int ic,
                                    int ih,
                                    int iw,
                                    int isb,
                                    int isc,
                                    int ish,
                                    int isw,

                                    at::Tensor& input2,
                                    int gsb,
                                    int gsc,
                                    int gsh,
                                    int gsw,

                                    at::Tensor& gradInput1,
                                    int gisb,
                                    int gisc,
                                    int gish,
                                    int gisw,

                                    at::Tensor& gradInput2,
                                    int ggc,
                                    int ggsb,
                                    int ggsc,
                                    int ggsh,
                                    int ggsw,

                                    at::Tensor& rInput1,
                                    at::Tensor& rInput2,
                                    int pad_size,
                                    int kernel_size,
                                    int max_displacement,
                                    int stride1,
                                    int stride2,
                                    int corr_type_multiply,
                                    cudaStream_t stream)
{
  int batchSize = gob;

  int nInputChannels = ic;
  int inputWidth = iw;
  int inputHeight = ih;

  int nOutputChannels = goc;
  int outputWidth = gow;
  int outputHeight = goh;

  // One block per (batch, row, column); threads stride over channels.
  dim3 blocks_grid(batchSize, inputHeight, inputWidth);
  dim3 threads_block(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_bwd_1", ([&] {
    channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(
        input1.data<scalar_t>(),
        rInput1.data<scalar_t>(),
        nInputChannels,
        inputHeight,
        inputWidth,
        pad_size
    );
  }));

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_bwd_2", ([&] {
    channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(
        input2.data<scalar_t>(),
        rInput2.data<scalar_t>(),
        nInputChannels,
        inputHeight,
        inputWidth,
        pad_size
    );
  }));

  // One block per (row, column, channel) of the input gradient; the batch
  // item is passed explicitly, one launch per item.
  dim3 threadsPerBlock(THREADS_PER_BLOCK);
  dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);

  for (int n = 0; n < batchSize; ++n) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "correlation_backward_input1", ([&] {
      correlation_backward_input1<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>>(
          n, gradInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
          gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
          rInput2.data<scalar_t>(),
          pad_size,
          kernel_size,
          max_displacement,
          stride1,
          stride2);
    }));
  }

  for (int n = 0; n < batchSize; n++) {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), "correlation_backward_input2", ([&] {
      correlation_backward_input2<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>>(
          n, gradInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
          gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
          rInput1.data<scalar_t>(),
          pad_size,
          kernel_size,
          max_displacement,
          stride1,
          stride2);
    }));
  }

  // check for errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in correlation_backward_cuda_kernel: %s\n", cudaGetErrorString(err));
    return 0;
  }

  return 1;
}
class PWCDCNet(nn.Module):
    """PWC-DC net: PWC-Net with dilated convolutions and DenseNet connections.

    The network builds a six-level feature pyramid for each of the two input
    images, then estimates flow coarse-to-fine: at every level the second
    image's features are warped with the upsampled flow of the coarser
    level, correlated with the first image's features and fed through a
    densely connected decoder.  A dilated-convolution context network
    refines the finest (level 2) flow.
    """

    def __init__(self, md=4):
        """
        Args:
            md: maximum displacement (for correlation, default: 4), applied
                after warping.
        """
        super(PWCDCNet, self).__init__()

        # Feature pyramid extractor: three convolutions per level, the
        # first of each level with stride 2.
        self.conv1a = conv(3, 16, kernel_size=3, stride=2)
        self.conv1aa = conv(16, 16, kernel_size=3, stride=1)
        self.conv1b = conv(16, 16, kernel_size=3, stride=1)
        self.conv2a = conv(16, 32, kernel_size=3, stride=2)
        self.conv2aa = conv(32, 32, kernel_size=3, stride=1)
        self.conv2b = conv(32, 32, kernel_size=3, stride=1)
        self.conv3a = conv(32, 64, kernel_size=3, stride=2)
        self.conv3aa = conv(64, 64, kernel_size=3, stride=1)
        self.conv3b = conv(64, 64, kernel_size=3, stride=1)
        self.conv4a = conv(64, 96, kernel_size=3, stride=2)
        self.conv4aa = conv(96, 96, kernel_size=3, stride=1)
        self.conv4b = conv(96, 96, kernel_size=3, stride=1)
        self.conv5a = conv(96, 128, kernel_size=3, stride=2)
        self.conv5aa = conv(128, 128, kernel_size=3, stride=1)
        self.conv5b = conv(128, 128, kernel_size=3, stride=1)
        # NOTE: at level 6 the stride-2 layer is the one named ``conv6aa``.
        self.conv6aa = conv(128, 196, kernel_size=3, stride=2)
        self.conv6a = conv(196, 196, kernel_size=3, stride=1)
        self.conv6b = conv(196, 196, kernel_size=3, stride=1)

        self.corr = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # Number of correlation channels.
        nd = (2 * md + 1) ** 2
        # Cumulative channel growth of the dense decoder blocks, as plain
        # Python ints (``np.int`` no longer exists in current NumPy).
        dd = [int(d) for d in np.cumsum([128, 128, 96, 64, 32])]

        od = nd
        self.conv6_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od + dd[0], 128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od + dd[1], 96, kernel_size=3, stride=1)
        self.conv6_3 = conv(od + dd[2], 64, kernel_size=3, stride=1)
        self.conv6_4 = conv(od + dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od + dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od + dd[4], 2, kernel_size=4, stride=2, padding=1)

        # Decoder input below level 6: correlation + level features +
        # upsampled flow (2) + upsampled decoder features (2).
        od = nd + 128 + 4
        self.conv5_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od + dd[0], 128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od + dd[1], 96, kernel_size=3, stride=1)
        self.conv5_3 = conv(od + dd[2], 64, kernel_size=3, stride=1)
        self.conv5_4 = conv(od + dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od + dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od + dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd + 96 + 4
        self.conv4_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od + dd[0], 128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od + dd[1], 96, kernel_size=3, stride=1)
        self.conv4_3 = conv(od + dd[2], 64, kernel_size=3, stride=1)
        self.conv4_4 = conv(od + dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od + dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od + dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd + 64 + 4
        self.conv3_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od + dd[0], 128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od + dd[1], 96, kernel_size=3, stride=1)
        self.conv3_3 = conv(od + dd[2], 64, kernel_size=3, stride=1)
        self.conv3_4 = conv(od + dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od + dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od + dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd + 32 + 4
        self.conv2_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od + dd[0], 128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od + dd[1], 96, kernel_size=3, stride=1)
        self.conv2_3 = conv(od + dd[2], 64, kernel_size=3, stride=1)
        self.conv2_4 = conv(od + dd[3], 32, kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od + dd[4])
        # Not used by forward(); kept so the parameter set (and therefore
        # checkpoint keys) stays unchanged.
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # Context network: dilated convolutions refining the level-2 flow.
        self.dc_conv1 = conv(od + dd[4], 128, kernel_size=3, stride=1, padding=1, dilation=1)
        self.dc_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2)
        self.dc_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4)
        self.dc_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8)
        self.dc_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1)
        self.dc_conv7 = predict_flow(32)

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                # In-place initialiser; the un-suffixed name is deprecated.
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()

    def warp(self, x, flo):
        """Warp an image/tensor (im2) back to im1 according to the optical flow.

        Args:
            x: [B, C, H, W] tensor to sample from (im2).
            flo: [B, 2, H, W] flow; channel 0 is the x displacement and
                channel 1 the y displacement, in pixels.

        Returns:
            [B, C, H, W] warped tensor in which every position whose
            sampling weight from inside ``x`` is below 0.9999 is zeroed.
        """
        B, C, H, W = x.size()

        # Pixel-coordinate mesh grid, created directly on x's device so the
        # method works on CPU and on any GPU index.
        xx = torch.arange(0, W, device=x.device).view(1, -1).repeat(H, 1)
        yy = torch.arange(0, H, device=x.device).view(-1, 1).repeat(1, W)
        xx = xx.view(1, 1, H, W).repeat(B, 1, 1, 1)
        yy = yy.view(1, 1, H, W).repeat(B, 1, 1, 1)
        grid = torch.cat((xx, yy), 1).float()

        vgrid = grid + flo

        # Scale the sampling positions to [-1, 1] and move the coordinate
        # pair to the last axis, as grid_sample expects.
        # NOTE(review): dividing by (W-1)/(H-1) matches grid_sample's
        # align_corners=True convention, while newer PyTorch releases
        # default to align_corners=False -- confirm which behaviour the
        # pretrained weights expect before changing the call.
        vgrid_x = 2.0 * vgrid[:, 0, :, :] / max(W - 1, 1) - 1.0
        vgrid_y = 2.0 * vgrid[:, 1, :, :] / max(H - 1, 1) - 1.0
        vgrid = torch.stack((vgrid_x, vgrid_y), 3)

        output = nn.functional.grid_sample(x, vgrid)

        # Sampling an all-ones tensor gives the fraction of each sample that
        # came from inside the image; binarise it into a validity mask.
        mask = nn.functional.grid_sample(torch.ones_like(x), vgrid)
        mask[mask < 0.9999] = 0
        mask[mask > 0] = 1

        return output * mask

    def _feature_pyramid(self, im):
        """Return the level-2 .. level-6 feature maps of one image."""
        c1 = self.conv1b(self.conv1aa(self.conv1a(im)))
        c2 = self.conv2b(self.conv2aa(self.conv2a(c1)))
        c3 = self.conv3b(self.conv3aa(self.conv3a(c2)))
        c4 = self.conv4b(self.conv4aa(self.conv4a(c3)))
        c5 = self.conv5b(self.conv5aa(self.conv5a(c4)))
        c6 = self.conv6b(self.conv6a(self.conv6aa(c5)))
        return c2, c3, c4, c5, c6

    @staticmethod
    def _dense_block(x, layers):
        """Run ``layers`` densely: each output is prepended to its input."""
        for layer in layers:
            x = torch.cat((layer(x), x), 1)
        return x

    def _decoder_input(self, feat1, feat2, up_flow, up_feat, flow_scale):
        """Build the decoder input of one level below the coarsest one."""
        warped = self.warp(feat2, up_flow * flow_scale)
        corr = self.leakyRELU(self.corr(feat1, warped))
        return torch.cat((corr, feat1, up_flow, up_feat), 1)

    def forward(self, x):
        """Estimate the flow from the first to the second image.

        Args:
            x: [B, 6, H, W] tensor; channels 0-2 are the first image and
                the remaining channels the second image.

        Returns:
            In training mode the tuple ``(flow2, flow3, flow4, flow5,
            flow6)``; in eval mode only ``flow2``.
        """
        im1 = x[:, :3, :, :]
        im2 = x[:, 3:, :, :]

        c12, c13, c14, c15, c16 = self._feature_pyramid(im1)
        c22, c23, c24, c25, c26 = self._feature_pyramid(im2)

        # Level 6: no flow estimate yet, so correlate the raw features.
        corr6 = self.leakyRELU(self.corr(c16, c26))
        x = self._dense_block(
            corr6, (self.conv6_0, self.conv6_1, self.conv6_2, self.conv6_3, self.conv6_4))
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        # The per-level factors (0.625, 1.25, 2.5, 5.0) rescale the
        # upsampled flow before warping; presumably 20 / 2**level -- verify
        # against the training flow scaling.
        x = self._decoder_input(c15, c25, up_flow6, up_feat6, 0.625)
        x = self._dense_block(
            x, (self.conv5_0, self.conv5_1, self.conv5_2, self.conv5_3, self.conv5_4))
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        x = self._decoder_input(c14, c24, up_flow5, up_feat5, 1.25)
        x = self._dense_block(
            x, (self.conv4_0, self.conv4_1, self.conv4_2, self.conv4_3, self.conv4_4))
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        x = self._decoder_input(c13, c23, up_flow4, up_feat4, 2.5)
        x = self._dense_block(
            x, (self.conv3_0, self.conv3_1, self.conv3_2, self.conv3_3, self.conv3_4))
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        x = self._decoder_input(c12, c22, up_flow3, up_feat3, 5.0)
        x = self._dense_block(
            x, (self.conv2_0, self.conv2_1, self.conv2_2, self.conv2_3, self.conv2_4))
        flow2 = self.predict_flow2(x)

        # Context network: residual refinement of the finest flow.
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 = flow2 + self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        if self.training:
            return flow2, flow3, flow4, flow5, flow6
        return flow2
predict_flow(od+dd[4]) self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) od = nd+96+4 self.conv4_0 = conv(od, 128, kernel_size=3, stride=1) self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1) self.conv4_2 = conv(od+dd[1],96, kernel_size=3, stride=1) self.conv4_3 = conv(od+dd[2],64, kernel_size=3, stride=1) self.conv4_4 = conv(od+dd[3],32, kernel_size=3, stride=1) self.predict_flow4 = predict_flow(od+dd[4]) self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) od = nd+64+4 self.conv3_0 = conv(od, 128, kernel_size=3, stride=1) self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1) self.conv3_2 = conv(od+dd[1],96, kernel_size=3, stride=1) self.conv3_3 = conv(od+dd[2],64, kernel_size=3, stride=1) self.conv3_4 = conv(od+dd[3],32, kernel_size=3, stride=1) self.predict_flow3 = predict_flow(od+dd[4]) self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) od = nd+32+4 self.conv2_0 = conv(od, 128, kernel_size=3, stride=1) self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1) self.conv2_2 = conv(od+dd[1],96, kernel_size=3, stride=1) self.conv2_3 = conv(od+dd[2],64, kernel_size=3, stride=1) self.conv2_4 = conv(od+dd[3],32, kernel_size=3, stride=1) self.predict_flow2 = predict_flow(od+dd[4]) self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1) self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1, dilation=1) self.dc_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2) self.dc_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4) self.dc_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8) self.dc_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16) self.dc_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1) 
self.dc_conv7 = predict_flow(32) for m in self.modules(): if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): nn.init.kaiming_normal(m.weight.data, mode='fan_in') if m.bias is not None: m.bias.data.zero_() def warp(self, x, flo): """ warp an image/tensor (im2) back to im1, according to the optical flow x: [B, C, H, W] (im2) flo: [B, 2, H, W] flow """ B, C, H, W = x.size() # mesh grid xx = torch.arange(0, W).view(1,-1).repeat(H,1) yy = torch.arange(0, H).view(-1,1).repeat(1,W) xx = xx.view(1,1,H,W).repeat(B,1,1,1) yy = yy.view(1,1,H,W).repeat(B,1,1,1) grid = torch.cat((xx,yy),1).float() if x.is_cuda: grid = grid.cuda() vgrid = Variable(grid) + flo # scale grid to [-1,1] vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0 vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0 vgrid = vgrid.permute(0,2,3,1) output = nn.functional.grid_sample(x, vgrid) mask = torch.autograd.Variable(torch.ones(x.size())).cuda() mask = nn.functional.grid_sample(mask, vgrid) mask[mask<0.999] = 0 mask[mask>0] = 1 return output*mask def forward(self,x): im1 = x[:,:3,:,:] im2 = x[:,3:,:,:] c11 = self.conv1b(self.conv1a(im1)) c21 = self.conv1b(self.conv1a(im2)) c12 = self.conv2b(self.conv2a(c11)) c22 = self.conv2b(self.conv2a(c21)) c13 = self.conv3b(self.conv3a(c12)) c23 = self.conv3b(self.conv3a(c22)) c14 = self.conv4b(self.conv4a(c13)) c24 = self.conv4b(self.conv4a(c23)) c15 = self.conv5b(self.conv5a(c14)) c25 = self.conv5b(self.conv5a(c24)) c16 = self.conv6b(self.conv6a(c15)) c26 = self.conv6b(self.conv6a(c25)) corr6 = self.corr(c16, c26) corr6 = self.leakyRELU(corr6) x = torch.cat((corr6, self.conv6_0(corr6)),1) x = torch.cat((self.conv6_1(x), x),1) x = torch.cat((x, self.conv6_2(x)),1) x = torch.cat((x, self.conv6_3(x)),1) x = torch.cat((x, self.conv6_4(x)),1) flow6 = self.predict_flow6(x) up_flow6 = self.deconv6(flow6) up_feat6 = self.upfeat6(x) warp5 = self.warp(c25, up_flow6*0.625) corr5 = self.corr(c15, warp5) corr5 = self.leakyRELU(corr5) x = torch.cat((corr5, c15, 
up_flow6, up_feat6), 1) x = torch.cat((x, self.conv5_0(x)),1) x = torch.cat((self.conv5_1(x), x),1) x = torch.cat((x, self.conv5_2(x)),1) x = torch.cat((x, self.conv5_3(x)),1) x = torch.cat((x, self.conv5_4(x)),1) flow5 = self.predict_flow5(x) up_flow5 = self.deconv5(flow5) up_feat5 = self.upfeat5(x) warp4 = self.warp(c24, up_flow5*1.25) corr4 = self.corr(c14, warp4) corr4 = self.leakyRELU(corr4) x = torch.cat((corr4, c14, up_flow5, up_feat5), 1) x = torch.cat((x, self.conv4_0(x)),1) x = torch.cat((self.conv4_1(x), x),1) x = torch.cat((x, self.conv4_2(x)),1) x = torch.cat((x, self.conv4_3(x)),1) x = torch.cat((x, self.conv4_4(x)),1) flow4 = self.predict_flow4(x) up_flow4 = self.deconv4(flow4) up_feat4 = self.upfeat4(x) warp3 = self.warp(c23, up_flow4*2.5) corr3 = self.corr(c13, warp3) corr3 = self.leakyRELU(corr3) x = torch.cat((corr3, c13, up_flow4, up_feat4), 1) x = torch.cat((x, self.conv3_0(x)),1) x = torch.cat((self.conv3_1(x), x),1) x = torch.cat((x, self.conv3_2(x)),1) x = torch.cat((x, self.conv3_3(x)),1) x = torch.cat((x, self.conv3_4(x)),1) flow3 = self.predict_flow3(x) up_flow3 = self.deconv3(flow3) up_feat3 = self.upfeat3(x) warp2 = self.warp(c22, up_flow3*5.0) corr2 = self.corr(c12, warp2) corr2 = self.leakyRELU(corr2) x = torch.cat((corr2, c12, up_flow3, up_feat3), 1) x = torch.cat((x, self.conv2_0(x)),1) x = torch.cat((self.conv2_1(x), x),1) x = torch.cat((x, self.conv2_2(x)),1) x = torch.cat((x, self.conv2_3(x)),1) x = torch.cat((x, self.conv2_4(x)),1) flow2 = self.predict_flow2(x) x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x)))) flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x))) if self.training: return flow2,flow3,flow4,flow5,flow6 else: return flow2 def pwc_dc_net(path=None): model = PWCDCNet() if path is not None: data = torch.load(path) if 'state_dict' in data.keys(): model.load_state_dict(data['state_dict']) else: model.load_state_dict(data) return model def pwc_dc_net_old(path=None): model = PWCDCNet_old() if path is not 
None: data = torch.load(path) if 'state_dict' in data.keys(): model.load_state_dict(data['state_dict']) else: model.load_state_dict(data) return model ================================================ FILE: PWCNet/models/__init__.py ================================================ from .PWCNet import * ================================================ FILE: README.md ================================================ # DAIN (Depth-Aware Video Frame Interpolation) [Project](https://sites.google.com/view/wenbobao/dain) **|** [Paper](http://arxiv.org/abs/1904.00830) [Wenbo Bao](https://sites.google.com/view/wenbobao/home), [Wei-Sheng Lai](http://graduatestudents.ucmerced.edu/wlai24/), [Chao Ma](https://sites.google.com/site/chaoma99/), Xiaoyun Zhang, Zhiyong Gao, and [Ming-Hsuan Yang](http://faculty.ucmerced.edu/mhyang/) IEEE Conference on Computer Vision and Pattern Recognition, Long Beach, CVPR 2019 This work is developed based on our TPAMI work [MEMC-Net](https://github.com/baowenbo/MEMC-Net), where we propose the adaptive warping layer. Please also consider referring to it. ### Table of Contents 1. [Introduction](#introduction) 1. [Citation](#citation) 1. [Requirements and Dependencies](#requirements-and-dependencies) 1. [Installation](#installation) 1. [Testing Pre-trained Models](#testing-pre-trained-models) 1. [Downloading Results](#downloading-results) 1. [Slow-motion Generation](#slow-motion-generation) 1. [Training New Models](#training-new-models) 1. [Google Colab Demo](#google-colab-demo) ### Introduction We propose the **D**epth-**A**ware video frame **IN**terpolation (**DAIN**) model to explicitly detect the occlusion by exploring the depth cue. We develop a depth-aware flow projection layer to synthesize intermediate flows that preferably sample closer objects than farther ones. Our method achieves state-of-the-art performance on the Middlebury dataset. We provide videos [here](https://www.youtube.com/watch?v=-f8f0igQi5I&t=5s).

### Citation If you find the code and datasets useful in your research, please cite: @inproceedings{DAIN, author = {Bao, Wenbo and Lai, Wei-Sheng and Ma, Chao and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan}, title = {Depth-Aware Video Frame Interpolation}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, year = {2019} } @article{MEMC-Net, title={MEMC-Net: Motion Estimation and Motion Compensation Driven Neural Network for Video Interpolation and Enhancement}, author={Bao, Wenbo and Lai, Wei-Sheng, and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, doi={10.1109/TPAMI.2019.2941941}, year={2018} } ### Requirements and Dependencies - Ubuntu (We test with Ubuntu = 16.04.5 LTS) - Python (We test with Python = 3.6.8 in Anaconda3 = 4.1.1) - Cuda & Cudnn (We test with Cuda = 9.0 and Cudnn = 7.0) - PyTorch (The customized depth-aware flow projection and other layers require ATen API in PyTorch = 1.0.0) - GCC (Compiling PyTorch 1.0.0 extension files (.c/.cu) requires gcc = 4.9.1 and nvcc = 9.0 compilers) - NVIDIA GPU (We use Titan X (Pascal) with compute = 6.1, but we support compute_50/52/60/61 devices, should you have devices with higher compute capability, please revise [this](https://github.com/baowenbo/DAIN/blob/master/my_package/DepthFlowProjection/setup.py)) ### Installation Download repository: $ git clone https://github.com/baowenbo/DAIN.git Before building Pytorch extensions, be sure you have `pytorch >= 1.0.0`: $ python -c "import torch; print(torch.__version__)" Generate our PyTorch extensions: $ cd DAIN $ cd my_package $ ./build.sh Generate the Correlation package required by [PWCNet](https://github.com/NVlabs/PWC-Net/tree/master/PyTorch/external_packages/correlation-pytorch-master): $ cd ../PWCNet/correlation_package_pytorch1_0 $ ./build.sh ### Testing Pre-trained Models Make model weights dir and Middlebury dataset dir: $ cd DAIN $ mkdir 
model_weights $ mkdir MiddleBurySet Download pretrained models, $ cd model_weights $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth and Middlebury dataset: $ cd ../MiddleBurySet $ wget http://vision.middlebury.edu/flow/data/comp/zip/other-color-allframes.zip $ unzip other-color-allframes.zip $ wget http://vision.middlebury.edu/flow/data/comp/zip/other-gt-interp.zip $ unzip other-gt-interp.zip $ cd .. Pre-installation: $ cd PWCNet/correlation_package_pytorch1_0 $ sh build.sh $ cd ../../my_package $ sh build.sh $ cd .. We are now ready to run: $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py The interpolated results are under `MiddleBurySet/other-result-author/[random number]/`, where the `random number` is used to distinguish different runs. ### Downloading Results Our DAIN model achieves state-of-the-art performance on the UCF101, Vimeo90K, and Middlebury ([*eval*](http://vision.middlebury.edu/flow/eval/results/results-n1.php) and *other*) datasets. Download our interpolated results with: $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/UCF101_DAIN.zip $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Vimeo90K_interp_DAIN.zip $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_eval_DAIN.zip $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_other_DAIN.zip ### Slow-motion Generation Our model is fully capable of generating slow-motion effects with a minor modification to the network architecture. Run the following code by specifying `time_step = 0.25` to generate a x4 slow-motion effect: $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.25 or set `time_step` to `0.125` or `0.1` as follows $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.125 $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.1 to generate x8 and x10 slow-motion respectively.
Or, if you would like x100 slow-motion for a little fun: $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.01 You may also want to create gif animations by: $ cd MiddleBurySet/other-result-author/[random number]/Beanbags $ convert -delay 1 *.png -loop 0 Beanbags.gif //1*10ms delay Have fun and enjoy yourself! ### Training New Models Download the Vimeo90K triplet dataset for the video frame interpolation task, also see [here](https://github.com/anchen1011/toflow/blob/master/download_dataset.sh) by [Xue et al., IJCV19](https://arxiv.org/abs/1711.09078). $ cd DAIN $ mkdir /path/to/your/dataset && cd /path/to/your/dataset $ wget http://data.csail.mit.edu/tofu/dataset/vimeo_triplet.zip $ unzip vimeo_triplet.zip $ rm vimeo_triplet.zip Download the pretrained MegaDepth and PWCNet models: $ cd MegaDepth/checkpoints/test_local $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best_generalization_net_G.pth $ cd ../../../PWCNet $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/pwc_net.pth.tar $ cd .. Run the training script: $ CUDA_VISIBLE_DEVICES=0 python train.py --datasetPath /path/to/your/dataset --batch_size 1 --save_which 1 --lr 0.0005 --rectify_lr 0.0005 --flow_lr_coe 0.01 --occ_lr_coe 0.0 --filter_lr_coe 1.0 --ctx_lr_coe 1.0 --alpha 0.0 1.0 --patience 4 --factor 0.2 The optimized models will be saved to the `model_weights/[random number]` directory, where [random number] is generated for different runs. Replace the pre-trained `model_weights/best.pth` model with the newly trained `model_weights/[random number]/best.pth` model. Then test the new model by executing: $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py ### Google Colab Demo This is a modification of DAIN that allows the usage of Google Colab and is able to do a full demo interpolation from a source video to a target video. Original Notebook File by btahir can be found [here](https://github.com/baowenbo/DAIN/issues/44).
def conv3x3(in_planes, out_planes, dilation = 1, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d``.

    The padding is derived from the dilation so that, at stride 1, the
    spatial size of the input is preserved.
    """
    kernel = 3
    # Half of the dilated kernel extent on each side.
    same_padding = int(dilation * (kernel - 1) / 2)
    layer = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel,
        stride=stride,
        padding=same_padding,
        dilation=dilation,
        bias=False,
    )
    return layer
class MultipleBasicBlock(nn.Module):
    """Stack of a 7x7 stem, up to three ``block`` stages and a 3x3 output conv.

    Arguments:
        input_feature: number of channels of the input tensor.
        block: callable ``block(in_planes, out_planes, dilation=...)`` that
            returns the module used for stages 2-4.
        num_blocks: stem counts as block 1; stages 2, 3 and 4 are created
            when ``num_blocks`` is at least 2, 3 and 4 respectively.
        intermediate_feature: channel width between the stem and the output
            convolution.
        dense: stored on the instance; not used by this class itself.

    The output always has 3 channels.
    """

    def __init__(self,input_feature, block, num_blocks, intermediate_feature = 64, dense = True):
        super(MultipleBasicBlock, self).__init__()
        self.dense = dense
        self.num_block = num_blocks
        self.intermediate_feature = intermediate_feature

        self.block1 = nn.Sequential(
            nn.Conv2d(input_feature, intermediate_feature,
                      kernel_size=7, stride=1, padding=3, bias=True),
            nn.ReLU(inplace=True),
        )

        # Optional stages; a stage that is not requested is stored as None.
        self.block2 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=2 else None
        self.block3 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=3 else None
        self.block4 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=4 else None
        self.block5 = nn.Sequential(nn.Conv2d(intermediate_feature, 3, (3, 3), 1, (1, 1)))

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # He-style normal init based on the fan-out of the layer.
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x):
        """Apply the stem, every stage that was constructed, then the output conv."""
        x = self.block1(x)
        # Run exactly the stages __init__ created.  The original tested
        # ``num_block == 4`` for the last stage, so with num_blocks > 4 the
        # constructed block4 was silently skipped.
        for stage in (self.block2, self.block3, self.block4):
            if stage is not None:
                x = stage(x)
        x = self.block5(x)
        return x
class S2DF(nn.Module):
    """Feature extractor that concatenates the raw input with stage outputs.

    Arguments:
        block: callable ``block(in_planes, out_planes, dilation=...)`` used to
            build stages 2-4.
        num_blocks: number of stages, between 1 and 4 (the 7x7 stem is stage 1).
        dense: when True every stage output is returned, otherwise only the
            output of the last stage (always together with the raw input).
        dilation: when True stages 2, 3 and 4 are built with dilation 4, 8
            and 16; otherwise all of them use dilation 1.
    """

    def __init__(self, block, num_blocks,dense = True,dilation=True):
        super(S2DF, self).__init__()
        self.inplanes = 64
        self.dense = dense
        self.num_block = num_blocks
        assert 1 <= num_blocks <= 4

        stem = [
            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),
            nn.ReLU(inplace=True),
        ]
        self.block1 = nn.Sequential(*stem)
        self.dilation = dilation

        rates = (4, 8, 16) if dilation else (1, 1, 1)
        self.block2 = None if num_blocks < 2 else block(self.inplanes, 64, dilation = rates[0])
        self.block3 = None if num_blocks < 3 else block(self.inplanes, 64, dilation = rates[1])
        self.block4 = None if num_blocks < 4 else block(self.inplanes, 64, dilation = rates[2])

        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def forward(self, x):
        """Return the channel-wise concatenation of the input and the selected stage outputs."""
        depth = self.num_block
        collected = [x]  # raw feature

        x = self.block1(x)
        if depth == 1 or (self.dense and depth > 1):
            collected.append(x)

        if depth >= 2:
            x = self.block2(x)
        if depth == 2 or (self.dense and depth > 2):
            collected.append(x)

        if depth >= 3:
            x = self.block3(x)
        if depth == 3 or (self.dense and depth > 3):
            collected.append(x)

        if depth == 4:
            x = self.block4(x)
            collected.append(x)

        return torch.cat(collected, dim=1)
class Stack:
    """Minimal LIFO container backed by the list stored in ``self.stack``.

    ``pop`` and ``peak`` return None instead of raising when the stack is
    empty.
    """

    def __init__(self):
        self.stack = []

    def pop(self):
        """Remove and return the top element, or None when empty."""
        if not self.is_empty():
            return self.stack.pop()
        return None

    def push(self,val):
        """Put ``val`` on top of the stack; returns None."""
        return self.stack.append(val)

    def peak(self):
        """Return the top element without removing it, or None when empty."""
        if not self.is_empty():
            return self.stack[-1]
        return None

    def size(self):
        """Number of stored elements."""
        return len(self.stack)

    def is_empty(self):
        """True when no element is stored."""
        return self.size() == 0
class RandomBalancedSampler(Sampler):
    """Endless random sampler whose reported length is independent of the dataset size.

    Indices are drawn from a random permutation of the whole dataset; a new
    permutation is generated every time the internal cursor wraps around to
    zero, so each pass visits every element once.  The iterator itself never
    raises StopIteration; ``len()`` only reports the epoch size.

    Arguments:
        data_source (Dataset): dataset to sample from
        epoch_size (int): number of samples reported per epoch; values <= 0
            mean "use the dataset size"
    """

    def __init__(self, data_source, epoch_size):
        self.index = 0
        self.epoch_size = epoch_size
        self.data_size = len(data_source)

    def __len__(self):
        if self.epoch_size > 0:
            return min(self.data_size, self.epoch_size)
        return self.data_size

    def __iter__(self):
        # The sampler is its own iterator.
        return self

    def __next__(self):
        at_cycle_start = self.index == 0
        if at_cycle_start:
            # re-shuffle the sampler
            self.indices = torch.randperm(self.data_size)
        # Advance first, then read: position 0 is handed out last in a cycle.
        self.index = (self.index + 1) % self.data_size
        return self.indices[self.index]

    def next(self):
        # Python 2 style iterator entry point.
        return self.__next__()
Arguments: data_source (Dataset): dataset to sample from """ def __init__(self, data_source, epoch_size): self.data_size = len(data_source) self.epoch_size = epoch_size self.index = 0 def __next__(self): self.index = (self.index+1)%self.data_size return self.index def next(self): return self.__next__() def __iter__(self): return self def __len__(self): return min(self.data_size,self.epoch_size) if self.epoch_size>0 else self.data_size ================================================ FILE: colab_interpolate.py ================================================ import time import os from torch.autograd import Variable import torch import numpy as np import numpy import networks from my_args import args from imageio import imread, imsave from AverageMeter import * import shutil import datetime torch.backends.cudnn.benchmark = True model = networks.__dict__[args.netName]( channel = args.channels, filter_size = args.filter_size, timestep = args.time_step, training = False) if args.use_cuda: model = model.cuda() model_path = './model_weights/best.pth' if not os.path.exists(model_path): print("*****************************************************************") print("**** We couldn't load any trained weights ***********************") print("*****************************************************************") exit(1) if args.use_cuda: pretrained_dict = torch.load(model_path) else: pretrained_dict = torch.load(model_path, map_location=lambda storage, loc: storage) model_dict = model.state_dict() # 1. filter out unnecessary keys pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} # 2. overwrite entries in the existing state dict model_dict.update(pretrained_dict) # 3. load the new state dict model.load_state_dict(model_dict) # 4. 
release the pretrained dict for saving memory pretrained_dict = [] model = model.eval() # deploy mode frames_dir = args.frame_input_dir output_dir = args.frame_output_dir timestep = args.time_step time_offsets = [kk * timestep for kk in range(1, int(1.0 / timestep))] input_frame = args.start_frame - 1 loop_timer = AverageMeter() final_frame = args.end_frame torch.set_grad_enabled(False) # we want to have input_frame between (start_frame-1) and (end_frame-2) # this is because at each step we read (frame) and (frame+1) # so the last iteration will actuall be (end_frame-1) and (end_frame) while input_frame < final_frame - 1: input_frame += 1 start_time = time.time() filename_frame_1 = os.path.join(frames_dir, f'{input_frame:0>5d}.png') filename_frame_2 = os.path.join(frames_dir, f'{input_frame+1:0>5d}.png') X0 = torch.from_numpy(np.transpose(imread(filename_frame_1), (2,0,1)).astype("float32") / 255.0).type(args.dtype) X1 = torch.from_numpy(np.transpose(imread(filename_frame_2), (2,0,1)).astype("float32") / 255.0).type(args.dtype) assert (X0.size(1) == X1.size(1)) assert (X0.size(2) == X1.size(2)) intWidth = X0.size(2) intHeight = X0.size(1) channels = X0.size(0) if not channels == 3: print(f"Skipping {filename_frame_1}-{filename_frame_2} -- expected 3 color channels but found {channels}.") continue if intWidth != ((intWidth >> 7) << 7): intWidth_pad = (((intWidth >> 7) + 1) << 7) # more than necessary intPaddingLeft = int((intWidth_pad - intWidth) / 2) intPaddingRight = intWidth_pad - intWidth - intPaddingLeft else: intPaddingLeft = 32 intPaddingRight= 32 if intHeight != ((intHeight >> 7) << 7): intHeight_pad = (((intHeight >> 7) + 1) << 7) # more than necessary intPaddingTop = int((intHeight_pad - intHeight) / 2) intPaddingBottom = intHeight_pad - intHeight - intPaddingTop else: intPaddingTop = 32 intPaddingBottom = 32 pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom]) X0 = Variable(torch.unsqueeze(X0,0)) X1 = 
Variable(torch.unsqueeze(X1,0)) X0 = pader(X0) X1 = pader(X1) if args.use_cuda: X0 = X0.cuda() X1 = X1.cuda() y_s, offset, filter = model(torch.stack((X0, X1),dim = 0)) y_ = y_s[args.save_which] if args.use_cuda: X0 = X0.data.cpu().numpy() if not isinstance(y_, list): y_ = y_.data.cpu().numpy() else: y_ = [item.data.cpu().numpy() for item in y_] offset = [offset_i.data.cpu().numpy() for offset_i in offset] filter = [filter_i.data.cpu().numpy() for filter_i in filter] if filter[0] is not None else None X1 = X1.data.cpu().numpy() else: X0 = X0.data.numpy() if not isinstance(y_, list): y_ = y_.data.numpy() else: y_ = [item.data.numpy() for item in y_] offset = [offset_i.data.numpy() for offset_i in offset] filter = [filter_i.data.numpy() for filter_i in filter] X1 = X1.data.numpy() X0 = np.transpose(255.0 * X0.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) y_ = [np.transpose(255.0 * item.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft:intPaddingLeft+intWidth], (1, 2, 0)) for item in y_] offset = [np.transpose(offset_i[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) for offset_i in offset] filter = [np.transpose( filter_i[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft: intPaddingLeft + intWidth], (1, 2, 0)) for filter_i in filter] if filter is not None else None X1 = np.transpose(255.0 * X1.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) interpolated_frame_number = 0 shutil.copy(filename_frame_1, os.path.join(output_dir, f"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png")) for item, time_offset in zip(y_, time_offsets): interpolated_frame_number += 1 output_frame_file_path = os.path.join(output_dir, f"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png") imsave(output_frame_file_path, np.round(item).astype(numpy.uint8)) end_time = time.time() 
def make_dataset(root, list_file):
    """Read a sample list and return its entries in random order.

    root and list_file are joined to locate a text file with one entry per
    line.  The final line of the file is always discarded, so a file needs at
    least two lines; otherwise the emptiness assertion fails.
    """
    # The context manager closes the file; the original left the handle open.
    with open(os.path.join(root, list_file)) as list_handle:
        raw_im_list = list_handle.read().splitlines()
    # the last line is invalid in test set.
    raw_im_list = raw_im_list[:-1]
    assert len(raw_im_list) > 0
    random.shuffle(raw_im_list)
    return raw_im_list
class ListDataset(data.Dataset):
    """Dataset over a list of sample paths, decoded lazily by a loader callable.

    Arguments:
        root: base directory handed to the loader.
        path_list: sequence of per-sample paths; its length is the dataset size.
        loader: callable ``loader(root, path)`` returning three items.
    """

    def __init__(self, root, path_list, loader=Vimeo_90K_loader):
        self.root, self.path_list, self.loader = root, path_list, loader

    def __len__(self):
        return len(self.path_list)

    def __getitem__(self, index):
        # Unpacking enforces that the loader produced exactly three items;
        # they are returned in the order the loader supplied them.
        first, second, third = self.loader(self.root, self.path_list[index])
        return first, second, third
import numpy as np
import numpy
import networks
from my_args import args
from scipy.misc import imread, imsave
from AverageMeter import *

# Demo script: interpolate the middle frame for every scene of the
# Middlebury "other" set, save it, and report the mean absolute
# interpolation error and the PSNR against the ground-truth frame.

torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune its kernels

DO_MiddleBurryOther = True
MB_Other_DATA = "./MiddleBurySet/other-data/"
MB_Other_RESULT = "./MiddleBurySet/other-result-author/"
MB_Other_GT = "./MiddleBurySet/other-gt-interp/"
# makedirs (instead of mkdir) also creates a missing parent directory.
os.makedirs(MB_Other_RESULT, exist_ok=True)

model = networks.__dict__[args.netName](channel=args.channels,
                                        filter_size=args.filter_size,
                                        timestep=args.time_step,
                                        training=False)

if args.use_cuda:
    model = model.cuda()

args.SAVED_MODEL = './model_weights/best.pth'
if os.path.exists(args.SAVED_MODEL):
    print("The testing model weight is: " + args.SAVED_MODEL)
    if not args.use_cuda:
        # Remap CUDA storages to the CPU when no GPU is used.
        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
    else:
        pretrained_dict = torch.load(args.SAVED_MODEL)

    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)
    # 4. release the pretrained dict for saving memory
    pretrained_dict = []
else:
    print("*****************************************************************")
    print("**** We don't load any trained weights **************************")
    print("*****************************************************************")

model = model.eval()  # deploy mode

use_cuda = args.use_cuda
save_which = args.save_which
dtype = args.dtype
unique_id = str(random.randint(0, 100000))
print("The unique id for current testing is: " + str(unique_id))

interp_error = AverageMeter()
if DO_MiddleBurryOther:
    subdir = os.listdir(MB_Other_DATA)
    gen_dir = os.path.join(MB_Other_RESULT, unique_id)
    os.mkdir(gen_dir)

    tot_timer = AverageMeter()
    proc_timer = AverageMeter()
    end = time.time()
    # "scene"/"filters" replace the original names "dir"/"filter", which
    # shadowed Python built-ins.
    for scene in subdir:
        print(scene)
        os.mkdir(os.path.join(gen_dir, scene))
        arguments_strFirst = os.path.join(MB_Other_DATA, scene, "frame10.png")
        arguments_strSecond = os.path.join(MB_Other_DATA, scene, "frame11.png")
        arguments_strOut = os.path.join(gen_dir, scene, "frame10i11.png")
        gt_path = os.path.join(MB_Other_GT, scene, "frame10i11.png")

        # HWC uint8 image -> CHW float tensor scaled to [0, 1]
        X0 = torch.from_numpy(np.transpose(imread(arguments_strFirst), (2, 0, 1)).astype("float32") / 255.0).type(dtype)
        X1 = torch.from_numpy(np.transpose(imread(arguments_strSecond), (2, 0, 1)).astype("float32") / 255.0).type(dtype)

        assert (X0.size(1) == X1.size(1))
        assert (X0.size(2) == X1.size(2))

        intWidth = X0.size(2)
        intHeight = X0.size(1)
        channel = X0.size(0)
        if not channel == 3:
            continue

        # Pad each spatial dimension up to the next multiple of 128
        # (or by a fixed 32 pixels per side when it already is one).
        if intWidth != ((intWidth >> 7) << 7):
            intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary
            intPaddingLeft = int((intWidth_pad - intWidth) / 2)
            intPaddingRight = intWidth_pad - intWidth - intPaddingLeft
        else:
            intWidth_pad = intWidth
            intPaddingLeft = 32
            intPaddingRight = 32

        if intHeight != ((intHeight >> 7) << 7):
            intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary
            intPaddingTop = int((intHeight_pad - intHeight) / 2)
            intPaddingBottom = intHeight_pad - intHeight - intPaddingTop
        else:
            intHeight_pad = intHeight
            intPaddingTop = 32
            intPaddingBottom = 32

        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

        torch.set_grad_enabled(False)
        X0 = Variable(torch.unsqueeze(X0, 0))
        X1 = Variable(torch.unsqueeze(X1, 0))
        X0 = pader(X0)
        X1 = pader(X1)

        if use_cuda:
            X0 = X0.cuda()
            X1 = X1.cuda()
        proc_end = time.time()
        y_s, offset, filters = model(torch.stack((X0, X1), dim=0))
        y_ = y_s[save_which]

        proc_timer.update(time.time() - proc_end)
        tot_timer.update(time.time() - end)
        end = time.time()
        print("*****************current image process time \t " + str(time.time() - proc_end) + "s ******************")

        # .cpu() is a no-op for CPU tensors, so one code path serves both
        # devices. The original CPU branch lacked the None guard on the
        # filters and crashed when the model returned no filters.
        X0 = X0.data.cpu().numpy()
        y_ = y_.data.cpu().numpy()
        offset = [offset_i.data.cpu().numpy() for offset_i in offset]
        filters = [filter_i.data.cpu().numpy() for filter_i in filters] if filters[0] is not None else None
        X1 = X1.data.cpu().numpy()

        # Crop the padding away again and go back to HWC layout.
        rows = slice(intPaddingTop, intPaddingTop + intHeight)
        cols = slice(intPaddingLeft, intPaddingLeft + intWidth)
        X0 = np.transpose(255.0 * X0.clip(0, 1.0)[0, :, rows, cols], (1, 2, 0))
        y_ = np.transpose(255.0 * y_.clip(0, 1.0)[0, :, rows, cols], (1, 2, 0))
        offset = [np.transpose(offset_i[0, :, rows, cols], (1, 2, 0)) for offset_i in offset]
        filters = [np.transpose(filter_i[0, :, rows, cols], (1, 2, 0)) for filter_i in filters] if filters is not None else None
        X1 = np.transpose(255.0 * X1.clip(0, 1.0)[0, :, rows, cols], (1, 2, 0))

        imsave(arguments_strOut, np.round(y_).astype(numpy.uint8))

        # Re-read the saved file so the metrics reflect the quantized result.
        rec_rgb = imread(arguments_strOut)
        gt_rgb = imread(gt_path)

        diff_rgb = 128.0 + rec_rgb - gt_rgb
        avg_interp_error_abs = np.mean(np.abs(diff_rgb - 128.0))

        interp_error.update(avg_interp_error_abs, 1)

        mse = numpy.mean((diff_rgb - 128.0) ** 2)

        PIXEL_MAX = 255.0
        # A perfect reconstruction has mse == 0; report an infinite PSNR
        # instead of dividing by zero.
        psnr = float('inf') if mse == 0 else 20 * math.log10(PIXEL_MAX / math.sqrt(mse))

        print("interpolation error / PSNR : " + str(round(avg_interp_error_abs, 4)) + " / " + str(round(psnr, 4)))
    metrics = "The average interpolation error / PSNR for all images are : " + str(round(interp_error.avg, 4))
    print(metrics)
import time
import os
from torch.autograd import Variable
import torch
import random
import numpy as np
import numpy
import networks
from my_args import args
from scipy.misc import imread, imsave
from AverageMeter import *
import shutil

# Demo script: generate a slow-motion sequence for every scene of the
# Middlebury "other" set. For each scene the two reference frames and the
# interpolated in-between frames are written as 0000.png, 0001.png, ...

torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune its kernels

DO_MiddleBurryOther = True
MB_Other_DATA = "./MiddleBurySet/other-data/"
MB_Other_RESULT = "./MiddleBurySet/other-result-author/"
MB_Other_GT = "./MiddleBurySet/other-gt-interp/"
# makedirs (instead of mkdir) also creates a missing parent directory.
os.makedirs(MB_Other_RESULT, exist_ok=True)

model = networks.__dict__[args.netName](channel=args.channels,
                                        filter_size=args.filter_size,
                                        timestep=args.time_step,
                                        training=False)

if args.use_cuda:
    model = model.cuda()

args.SAVED_MODEL = './model_weights/best.pth'
if os.path.exists(args.SAVED_MODEL):
    print("The testing model weight is: " + args.SAVED_MODEL)
    if not args.use_cuda:
        # Remap CUDA storages to the CPU when no GPU is used.
        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
    else:
        pretrained_dict = torch.load(args.SAVED_MODEL)

    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)
    # 4. release the pretrained dict for saving memory
    pretrained_dict = []
else:
    print("*****************************************************************")
    print("**** We don't load any trained weights **************************")
    print("*****************************************************************")

model = model.eval()  # deploy mode

use_cuda = args.use_cuda
save_which = args.save_which
dtype = args.dtype
unique_id = str(random.randint(0, 100000))
print("The unique id for current testing is: " + str(unique_id))

interp_error = AverageMeter()
if DO_MiddleBurryOther:
    subdir = os.listdir(MB_Other_DATA)
    gen_dir = os.path.join(MB_Other_RESULT, unique_id)
    os.mkdir(gen_dir)

    tot_timer = AverageMeter()
    proc_timer = AverageMeter()
    end = time.time()
    # "scene"/"filters" replace the original names "dir"/"filter", which
    # shadowed Python built-ins.
    for scene in subdir:
        print(scene)
        os.mkdir(os.path.join(gen_dir, scene))
        arguments_strFirst = os.path.join(MB_Other_DATA, scene, "frame10.png")
        arguments_strSecond = os.path.join(MB_Other_DATA, scene, "frame11.png")

        # HWC uint8 image -> CHW float tensor scaled to [0, 1]
        X0 = torch.from_numpy(np.transpose(imread(arguments_strFirst), (2, 0, 1)).astype("float32") / 255.0).type(dtype)
        X1 = torch.from_numpy(np.transpose(imread(arguments_strSecond), (2, 0, 1)).astype("float32") / 255.0).type(dtype)

        assert (X0.size(1) == X1.size(1))
        assert (X0.size(2) == X1.size(2))

        intWidth = X0.size(2)
        intHeight = X0.size(1)
        channel = X0.size(0)
        if not channel == 3:
            continue

        # Pad each spatial dimension up to the next multiple of 128
        # (or by a fixed 32 pixels per side when it already is one).
        if intWidth != ((intWidth >> 7) << 7):
            intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary
            intPaddingLeft = int((intWidth_pad - intWidth) / 2)
            intPaddingRight = intWidth_pad - intWidth - intPaddingLeft
        else:
            intWidth_pad = intWidth
            intPaddingLeft = 32
            intPaddingRight = 32

        if intHeight != ((intHeight >> 7) << 7):
            intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary
            intPaddingTop = int((intHeight_pad - intHeight) / 2)
            intPaddingBottom = intHeight_pad - intHeight - intPaddingTop
        else:
            intHeight_pad = intHeight
            intPaddingTop = 32
            intPaddingBottom = 32

        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

        torch.set_grad_enabled(False)
        X0 = Variable(torch.unsqueeze(X0, 0))
        X1 = Variable(torch.unsqueeze(X1, 0))
        X0 = pader(X0)
        X1 = pader(X1)

        if use_cuda:
            X0 = X0.cuda()
            X1 = X1.cuda()
        proc_end = time.time()
        y_s, offset, filters = model(torch.stack((X0, X1), dim=0))
        y_ = y_s[save_which]

        proc_timer.update(time.time() - proc_end)
        tot_timer.update(time.time() - end)
        end = time.time()
        print("*****************current image process time \t " + str(time.time() - proc_end) + "s ******************")

        # A model that returns a single frame is normalized to a one-element
        # list. The original converted it to one 4-D array and then iterated
        # over it, which made the 4-index crop below fail.
        if not isinstance(y_, list):
            y_ = [y_]

        # .cpu() is a no-op for CPU tensors, so one code path serves both
        # devices. The original CPU branch lacked the None guard on the
        # filters and crashed when the model returned no filters.
        X0 = X0.data.cpu().numpy()
        y_ = [item.data.cpu().numpy() for item in y_]
        offset = [offset_i.data.cpu().numpy() for offset_i in offset]
        filters = [filter_i.data.cpu().numpy() for filter_i in filters] if filters[0] is not None else None
        X1 = X1.data.cpu().numpy()

        # Crop the padding away again and go back to HWC layout.
        rows = slice(intPaddingTop, intPaddingTop + intHeight)
        cols = slice(intPaddingLeft, intPaddingLeft + intWidth)
        X0 = np.transpose(255.0 * X0.clip(0, 1.0)[0, :, rows, cols], (1, 2, 0))
        y_ = [np.transpose(255.0 * item.clip(0, 1.0)[0, :, rows, cols], (1, 2, 0)) for item in y_]
        offset = [np.transpose(offset_i[0, :, rows, cols], (1, 2, 0)) for offset_i in offset]
        filters = [np.transpose(filter_i[0, :, rows, cols], (1, 2, 0)) for filter_i in filters] if filters is not None else None
        X1 = np.transpose(255.0 * X1.clip(0, 1.0)[0, :, rows, cols], (1, 2, 0))

        # One interpolated frame is expected per intermediate time step.
        timestep = args.time_step
        numFrames = int(1.0 / timestep) - 1
        time_offsets = [kk * timestep for kk in range(1, 1 + numFrames, 1)]

        # Write the sequence: first reference frame, interpolated frames,
        # second reference frame, numbered consecutively.
        count = 0
        shutil.copy(arguments_strFirst, os.path.join(gen_dir, scene, "{:0>4d}.png".format(count)))
        count = count + 1
        for item, time_offset in zip(y_, time_offsets):
            arguments_strOut = os.path.join(gen_dir, scene, "{:0>4d}.png".format(count))
            count = count + 1
            imsave(arguments_strOut, np.round(item).astype(numpy.uint8))
        shutil.copy(arguments_strSecond, os.path.join(gen_dir, scene, "{:0>4d}.png".format(count)))
        count = count + 1
matplotlib=3.0.2=py36_1002 - matplotlib-base=3.0.2=py36h167e16e_1002 - protobuf=3.6.1=py36hf484d3e_1001 - pthread-stubs=0.4=h14c3975_1001 - pyparsing=2.3.1=py_0 - pyqt=5.6.0=py36h13b7fb3_1008 - python-dateutil=2.8.0=py_0 - sip=4.18.1=py36hf484d3e_1000 - tensorboardx=1.6=py_0 - tk=8.6.9=h84994c4_1000 - tornado=5.1.1=py36h14c3975_1000 - xorg-libxau=1.0.9=h14c3975_0 - xorg-libxdmcp=1.1.2=h14c3975_1007 - blas=1.0=mkl - cffi=1.11.5=py36he75722e_1 - cudatoolkit=9.0=h13b8566_0 - dbus=1.13.2=h714fa37_1 - freetype=2.9.1=h8a8886c_1 - gst-plugins-base=1.14.0=hbbd80ab_1 - gstreamer=1.14.0=hb453b48_1 - intel-openmp=2019.1=144 - isl=0.12.2=0 - jpeg=9b=h024ee3a_2 - libedit=3.1.20181209=hc058e9b_0 - libffi=3.2.1=hd88cf55_4 - libgcc-ng=8.2.0=hdf63c60_1 - libgfortran-ng=7.3.0=hdf63c60_0 - libpng=1.6.36=hbc83047_0 - libstdcxx-ng=8.2.0=hdf63c60_1 - libtiff=4.0.10=h2733197_2 - mkl=2019.1=144 - mkl_fft=1.0.10=py36ha843d7b_0 - mkl_random=1.0.2=py36hd81dba3_0 - mpc=1.0.3=hf803216_4 - mpfr=3.1.5=h12ff648_1 - ncurses=6.1=he6710b0_1 - ninja=1.8.2=py36h6bb024c_1 - numpy=1.15.4=py36h7e9f1db_0 - numpy-base=1.15.4=py36hde5b4d6_0 - olefile=0.46=py36_0 - pcre=8.42=h439df22_0 - pillow=5.4.1=py36h34e0f95_0 - pip=19.0.1=py36_0 - pycparser=2.19=py36_0 - python=3.6.8=h0371630_0 - qt=5.6.3=h8bf5577_3 - readline=7.0=h7b6447c_5 - setuptools=40.8.0=py36_0 - six=1.12.0=py36_0 - sqlite=3.26.0=h7b6447c_0 - wheel=0.32.3=py36_0 - xz=5.2.4=h14c3975_4 - zlib=1.2.11=h7b6447c_3 - zstd=1.3.7=h0b5b093_0 - pytorch=1.0.1=py3.6_cuda9.0.176_cudnn7.4.2_2 - torchvision=0.2.1=py_2 - cloog=0.18.1=1 - gcc_49=4.9.1=6 - gmp=5.1.3=0 - pip: - correlation-cuda==0.0.0 - dask==1.1.1 - depthflowprojection-cuda==0.0.0 - filterinterpolation-cuda==0.0.0 - flowprojection-cuda==0.0.0 - interpolation-cuda==0.0.0 - interpolationch-cuda==0.0.0 - mindepthflowprojection-cuda==0.0.0 - separableconv-cuda==0.0.0 - separableconvflow-cuda==0.0.0 - torch==1.0.1.post2 prefix: /home/wenbobao/anaconda3_new/envs/pytorch1.0.0 
def charbonier_loss(x, epsilon):
    """Return the mean Charbonnier penalty ``sqrt(x^2 + epsilon^2)`` of ``x``."""
    return torch.mean(torch.sqrt(x * x + epsilon * epsilon))


def negPSNR_loss(x, epsilon):
    """Return a log-scaled Charbonnier loss averaged over the batch.

    The Charbonnier penalty is averaged over dimensions 1, 2 and 3 of ``x``
    (so ``x`` must be 4-D), then ``-log(1 / mean) / 100`` is averaged over
    the remaining first dimension.
    """
    per_sample = torch.mean(
        torch.mean(
            torch.mean(torch.sqrt(x * x + epsilon * epsilon), dim=1),
            dim=1),
        dim=1)
    return torch.mean(-torch.log(1.0 / per_sample) / 100.0)


def tv_loss(x, epsilon):
    """Return the mean smoothed total-variation magnitude of a 4-D tensor.

    Forward differences along dimensions 2 and 3 are combined as
    ``sqrt(d2^2 + d3^2 + epsilon^2)``; the last row and column are dropped
    so both differences are defined at every remaining position.
    """
    core = x[:, :, :-1, :-1]
    diff_rows = core - x[:, :, 1:, :-1]
    diff_cols = core - x[:, :, :-1, 1:]
    return torch.mean(torch.sqrt(diff_rows ** 2 + diff_cols ** 2 + epsilon * epsilon))


def gra_adap_tv_loss(flow, image, epsilon):
    """Return the total variation of ``flow`` weighted by image gradients.

    The weight is ``exp(-sum_c(|d2 image| + |d3 image|))``, so positions
    with strong image gradients contribute less to the flow penalty.
    Both tensors are indexed as 4-D with matching last two dimensions.
    """
    image_core = image[:, :, :-1, :-1]
    w = torch.exp(-torch.sum(
        torch.abs(image_core - image[:, :, 1:, :-1]) +
        torch.abs(image_core - image[:, :, :-1, 1:]),
        dim=1))
    flow_core = flow[:, :, :-1, :-1]
    tv = torch.sum(
        torch.sqrt((flow_core - flow[:, :, 1:, :-1]) ** 2 +
                   (flow_core - flow[:, :, :-1, 1:]) ** 2 +
                   epsilon * epsilon),
        dim=1)
    return torch.mean(w * tv)


def smooth_loss(x, epsilon):
    """Return the smoothness penalty of ``x``; identical to :func:`tv_loss`."""
    return tv_loss(x, epsilon)


def motion_sym_loss(offset, epsilon, occlusion=None):
    """Return the Charbonnier penalty on ``offset[0] + offset[1]``.

    The two offsets are expected to cancel each other, so their sum is
    penalized. ``occlusion`` is accepted for interface compatibility but is
    currently ignored.
    """
    # TODO: how to design the occlusion aware offset symmetric loss?
    return torch.mean(torch.sqrt((offset[0] + offset[1]) ** 2 + epsilon ** 2))


def part_loss(diffs, offsets, occlusions, images, epsilon, use_negPSNR=False):
    """Compute the per-output pixel, offset-smoothness and symmetry losses.

    Args:
        diffs: Sequence of difference tensors, one per output.
        offsets: Sequence of ``(offset_0, offset_1)`` pairs. If
            ``offsets[0][0]`` is ``None`` the offset terms are zero.
        occlusions: Unused; kept for interface compatibility.
        images: The two input images; ``images[k]`` guides the smoothness
            term of ``offset_k``.
        epsilon: Charbonnier smoothing constant.
        use_negPSNR: Use :func:`negPSNR_loss` instead of
            :func:`charbonier_loss` for the pixel term.

    Returns:
        Tuple ``(pixel_loss, offset_loss, sym_loss)`` of lists of tensors.
    """
    pixel_fn = negPSNR_loss if use_negPSNR else charbonier_loss
    pixel_loss = [pixel_fn(diff, epsilon) for diff in diffs]

    if offsets[0][0] is not None:
        offset_loss = [gra_adap_tv_loss(offset[0], images[0], epsilon) +
                       gra_adap_tv_loss(offset[1], images[1], epsilon)
                       for offset in offsets]
        sym_loss = [motion_sym_loss(offset, epsilon=epsilon) for offset in offsets]
    else:
        # Allocate the zeros next to the data instead of hard-coding
        # .cuda(), which crashed on CPU-only runs. The symmetry term is
        # zero as well: the original summed the None offsets and raised.
        offset_loss = [diffs[0].new_zeros(1)]
        sym_loss = [diffs[0].new_zeros(1)]

    return pixel_loss, offset_loss, sym_loss
class _LRScheduler(object):
    """Base class for epoch-indexed learning-rate schedulers.

    Subclasses implement :meth:`get_lr`, which returns one learning rate
    per parameter group computed from ``self.last_epoch`` and
    ``self.base_lrs``.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        last_epoch (int): The index of last epoch. Default: -1.

    Raises:
        TypeError: If ``optimizer`` is not an ``Optimizer``.
        KeyError: If resuming (``last_epoch != -1``) and a parameter group
            lacks ``'initial_lr'``.
    """

    def __init__(self, optimizer, last_epoch=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        if last_epoch == -1:
            # Fresh start: remember the configured lr as the base value.
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an optimizer".format(i))
        self.base_lrs = [group['initial_lr'] for group in optimizer.param_groups]
        # Apply the schedule once, then rewind so that the first explicit
        # step() call lands on epoch ``last_epoch + 1``.
        self.step(last_epoch + 1)
        self.last_epoch = last_epoch

    def get_lr(self):
        """Return the learning rates for the current ``last_epoch``."""
        raise NotImplementedError

    def step(self, epoch=None):
        """Advance to ``epoch`` (default: next epoch) and update the lrs."""
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


class LambdaLR(_LRScheduler):
    """Sets the learning rate of each parameter group to the initial lr
    times a given function. When last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        lr_lambda (function or list): A function which computes a
            multiplicative factor given an integer parameter epoch, or a
            list of such functions, one for each group in
            optimizer.param_groups.
        last_epoch (int): The index of last epoch. Default: -1.

    Raises:
        ValueError: If a list of functions does not match the number of
            parameter groups.

    Example:
        >>> # Assuming optimizer has two groups.
        >>> lambda1 = lambda epoch: epoch // 30
        >>> lambda2 = lambda epoch: 0.95 ** epoch
        >>> scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, lr_lambda, last_epoch=-1):
        self.optimizer = optimizer
        if not isinstance(lr_lambda, (list, tuple)):
            # A single function is shared by every parameter group.
            self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups)
        else:
            if len(lr_lambda) != len(optimizer.param_groups):
                raise ValueError("Expected {} lr_lambdas, but got {}".format(
                    len(optimizer.param_groups), len(lr_lambda)))
            self.lr_lambdas = list(lr_lambda)
        self.last_epoch = last_epoch
        super(LambdaLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        return [base_lr * lmbda(self.last_epoch)
                for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)]


class StepLR(_LRScheduler):
    """Sets the learning rate of each parameter group to the initial lr
    decayed by gamma every step_size epochs. When last_epoch=-1, sets
    initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        step_size (int): Period of learning rate decay.
        gamma (float): Multiplicative factor of learning rate decay.
            Default: 0.1.
        last_epoch (int): The index of last epoch. Default: -1.

    Example:
        >>> # Assuming optimizer uses lr = 0.5 for all groups
        >>> # lr = 0.5      if epoch < 30
        >>> # lr = 0.05     if 30 <= epoch < 60
        >>> # lr = 0.005    if 60 <= epoch < 90
        >>> # ...
        >>> scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):
        self.step_size = step_size
        self.gamma = gamma
        super(StepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        return [base_lr * self.gamma ** (self.last_epoch // self.step_size)
                for base_lr in self.base_lrs]


class MultiStepLR(_LRScheduler):
    """Set the learning rate of each parameter group to the initial lr
    decayed by gamma once the number of epoch reaches one of the
    milestones. When last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        milestones (list): List of epoch indices. Must be increasing.
        gamma (float): Multiplicative factor of learning rate decay.
            Default: 0.1.
        last_epoch (int): The index of last epoch. Default: -1.

    Raises:
        ValueError: If ``milestones`` is not sorted in increasing order.

    Example:
        >>> # Assuming optimizer uses lr = 0.5 for all groups
        >>> # lr = 0.5      if epoch < 30
        >>> # lr = 0.05     if 30 <= epoch < 80
        >>> # lr = 0.005    if epoch >= 80
        >>> scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):
        if not list(milestones) == sorted(milestones):
            # The message is formatted here; the original passed the
            # template and the list as two separate exception arguments.
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.milestones = milestones
        self.gamma = gamma
        super(MultiStepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        # bisect_right counts the milestones already reached.
        return [base_lr * self.gamma ** bisect_right(self.milestones, self.last_epoch)
                for base_lr in self.base_lrs]


class ExponentialLR(_LRScheduler):
    """Set the learning rate of each parameter group to the initial lr
    decayed by gamma every epoch. When last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        gamma (float): Multiplicative factor of learning rate decay.
        last_epoch (int): The index of last epoch. Default: -1.
    """

    def __init__(self, optimizer, gamma, last_epoch=-1):
        self.gamma = gamma
        super(ExponentialLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        return [base_lr * self.gamma ** self.last_epoch
                for base_lr in self.base_lrs]


class ReduceLROnPlateau(object):
    """Reduce learning rate when a metric has stopped improving.

    Models often benefit from reducing the learning rate by a factor
    of 2-10 once learning stagnates. This scheduler reads a metrics
    quantity and if no improvement is seen for a 'patience' number
    of epochs, the learning rate is reduced.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        mode (str): One of `min`, `max`. In `min` mode, lr will
            be reduced when the quantity monitored has stopped
            decreasing; in `max` mode it will be reduced when the
            quantity monitored has stopped increasing. Default: 'min'.
        factor (float): Factor by which the learning rate will be
            reduced. new_lr = lr * factor. Default: 0.1.
        patience (int): Number of epochs with no improvement after
            which learning rate will be reduced. Default: 10.
        verbose (bool): If True, prints a message to stdout for
            each update. Default: False.
        threshold (float): Threshold for measuring the new optimum,
            to only focus on significant changes. Default: 1e-4.
        threshold_mode (str): One of `rel`, `abs`. In `rel` mode,
            dynamic_threshold = best * ( 1 + threshold ) in 'max'
            mode or best * ( 1 - threshold ) in `min` mode.
            In `abs` mode, dynamic_threshold = best + threshold in
            `max` mode or best - threshold in `min` mode. Default: 'rel'.
        cooldown (int): Number of epochs to wait before resuming
            normal operation after lr has been reduced. Default: 0.
        min_lr (float or list): A scalar or a list of scalars. A
            lower bound on the learning rate of all param groups
            or each group respectively. Default: 0.
        eps (float): Minimal decay applied to lr. If the difference
            between new and old lr is smaller than eps, the update is
            ignored. Default: 1e-8.

    Raises:
        ValueError: If ``factor >= 1.0``, if a ``min_lr`` list does not
            match the number of parameter groups, or if ``mode`` /
            ``threshold_mode`` is unknown.
        TypeError: If ``optimizer`` is not an ``Optimizer``.

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> scheduler = ReduceLROnPlateau(optimizer, 'min')
        >>> for epoch in range(10):
        >>>     train(...)
        >>>     val_loss = validate(...)
        >>>     # Note that step should be called after validate()
        >>>     scheduler.step(val_loss)
    """

    def __init__(self, optimizer, mode='min', factor=0.1, patience=10,
                 verbose=False, threshold=1e-4, threshold_mode='rel',
                 cooldown=0, min_lr=0, eps=1e-8):

        if factor >= 1.0:
            raise ValueError('Factor should be < 1.0.')
        self.factor = factor

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(min_lr, (list, tuple)):
            if len(min_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} min_lrs, got {}".format(
                    len(optimizer.param_groups), len(min_lr)))
            self.min_lrs = list(min_lr)
        else:
            self.min_lrs = [min_lr] * len(optimizer.param_groups)

        self.patience = patience
        self.verbose = verbose
        self.cooldown = cooldown
        self.cooldown_counter = 0
        self.mode = mode
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.best = None
        self.num_bad_epochs = None
        self.mode_worse = None  # the worse value for the chosen mode
        self.is_better = None
        self.eps = eps
        self.last_epoch = -1
        self._init_is_better(mode=mode, threshold=threshold,
                             threshold_mode=threshold_mode)
        self._reset()

    def _reset(self):
        """Resets num_bad_epochs counter and cooldown counter."""
        self.best = self.mode_worse
        self.cooldown_counter = 0
        self.num_bad_epochs = 0

    def step(self, metrics, epoch=None):
        """Record ``metrics`` for ``epoch`` and reduce the lr on a plateau."""
        current = metrics
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch

        if self.is_better(current, self.best):
            self.best = current
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1

        if self.in_cooldown:
            self.cooldown_counter -= 1
            self.num_bad_epochs = 0  # ignore any bad epochs in cooldown

        if self.num_bad_epochs > self.patience:
            self._reduce_lr(epoch)
            self.cooldown_counter = self.cooldown
            self.num_bad_epochs = 0

    def _reduce_lr(self, epoch):
        """Scale every group's lr by ``factor``, bounded below by its min_lr."""
        for i, param_group in enumerate(self.optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * self.factor, self.min_lrs[i])
            # Skip updates that are too small to matter.
            if old_lr - new_lr > self.eps:
                param_group['lr'] = new_lr
                if self.verbose:
                    print('Epoch {:5d}: reducing learning rate'
                          ' of group {} to {:.4e}.'.format(epoch, i, new_lr))

    @property
    def in_cooldown(self):
        """Whether the scheduler is still inside a cooldown period."""
        return self.cooldown_counter > 0

    def _init_is_better(self, mode, threshold, threshold_mode):
        """Set ``is_better`` and ``mode_worse`` for the mode combination."""
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if threshold_mode not in {'rel', 'abs'}:
            # Report the offending threshold_mode (the original printed mode).
            raise ValueError('threshold mode ' + threshold_mode + ' is unknown!')
        if mode == 'min' and threshold_mode == 'rel':
            rel_epsilon = 1. - threshold
            self.is_better = lambda a, best: a < best * rel_epsilon
            self.mode_worse = float('Inf')
        elif mode == 'min' and threshold_mode == 'abs':
            self.is_better = lambda a, best: a < best - threshold
            self.mode_worse = float('Inf')
        elif mode == 'max' and threshold_mode == 'rel':
            rel_epsilon = threshold + 1.
            self.is_better = lambda a, best: a > best * rel_epsilon
            self.mode_worse = -float('Inf')
        else:  # mode == 'max' and threshold_mode == 'abs':
            self.is_better = lambda a, best: a > best + threshold
            self.mode_worse = -float('Inf')
import os
import datetime
import argparse
import shutil
import numpy
import networks
import torch

# Command-line configuration shared by the training and demo scripts.
# Importing this module parses sys.argv, creates the output directory and
# writes the log/args files as a side effect.

modelnames = networks.__all__
# import datasets
# The trailing comma makes this a real tuple. As a bare string, argparse
# "choices" did substring matching and the help text joined single
# characters with " | ".
datasetNames = ('Vimeo_90K_interp',)  # datasets.__all__


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` treats every non-empty string (including "False") as
    True, so the textual forms are mapped explicitly here.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got ' + repr(value))


parser = argparse.ArgumentParser(description='DAIN')

parser.add_argument('--debug', action='store_true', help='Enable debug mode')
parser.add_argument('--netName', type=str, default='DAIN',
                    choices=modelnames,
                    help='model architecture: ' + ' | '.join(modelnames) + ' (default: DAIN)')

parser.add_argument('--datasetName', default='Vimeo_90K_interp',
                    choices=datasetNames, nargs='+',
                    help='dataset type : ' + ' | '.join(datasetNames) + ' (default: Vimeo_90K_interp)')
parser.add_argument('--datasetPath', default='', help='the path of selected datasets')
parser.add_argument('--dataset_split', type=int, default=97,
                    help='Split a dataset into trainining and validation by percentage (default: 97)')

parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')

# The help texts of --numEpoch, --workers and --depth_lr_coe previously
# advertised defaults that differed from the actual ones.
parser.add_argument('--numEpoch', '-e', type=int, default=100,
                    help='Number of epochs to train(default:100)')

parser.add_argument('--batch_size', '-b', type=int, default=1, help='batch size (default:1)')
parser.add_argument('--workers', '-w', type=int, default=8,
                    help='parallel workers for loading training samples (default : 8)')
parser.add_argument('--channels', '-c', type=int, default=3, choices=[1, 3],
                    help='channels of images (default:3)')
parser.add_argument('--filter_size', '-f', type=int, default=4,
                    help='the size of filters used (default: 4)',
                    choices=[2, 4, 6, 5, 51])

parser.add_argument('--lr', type=float, default=0.002,
                    help='the basic learning rate for three subnetworks (default: 0.002)')
parser.add_argument('--rectify_lr', type=float, default=0.001,
                    help='the learning rate for rectify/refine subnetworks (default: 0.001)')

parser.add_argument('--save_which', '-s', type=int, default=1, choices=[0, 1],
                    help='choose which result to save: 0 ==> interpolated, 1==> rectified')
parser.add_argument('--time_step', type=float, default=0.5, help='choose the time steps')
parser.add_argument('--flow_lr_coe', type=float, default=0.01,
                    help='relative learning rate w.r.t basic learning rate (default: 0.01)')
parser.add_argument('--occ_lr_coe', type=float, default=1.0,
                    help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--filter_lr_coe', type=float, default=1.0,
                    help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--ctx_lr_coe', type=float, default=1.0,
                    help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--depth_lr_coe', type=float, default=0.001,
                    help='relative learning rate w.r.t basic learning rate (default: 0.001)')
# parser.add_argument('--deblur_lr_coe', type = float, default=0.01, help = 'relative learning rate w.r.t basic learning rate (default: 0.01)')

parser.add_argument('--alpha', type=float, nargs='+', default=[0.0, 1.0],
                    help='the ration of loss for interpolated and rectified result (default: [0.0, 1.0])')

parser.add_argument('--epsilon', type=float, default=1e-6,
                    help='the epsilon for charbonier loss,etc (default: 1e-6)')
parser.add_argument('--weight_decay', type=float, default=0,
                    help='the weight decay for whole network ')
parser.add_argument('--patience', type=int, default=5, help='the patience of reduce on plateou')
parser.add_argument('--factor', type=float, default=0.2, help='the factor of reduce on plateou')

parser.add_argument('--pretrained', dest='SAVED_MODEL', default=None,
                    help='path to the pretrained model weights')
parser.add_argument('--no-date', action='store_true', help='don\'t append date timestamp to folder')
parser.add_argument('--use_cuda', default=True, type=_str2bool, help='use cuda or not')
parser.add_argument('--use_cudnn', default=1, type=int, help='use cudnn or not')
# NOTE: the choices are tensor classes, so this option cannot actually be
# set from the command line; the value is derived from --use_cuda below.
parser.add_argument('--dtype', default=torch.cuda.FloatTensor,
                    choices=[torch.cuda.FloatTensor, torch.FloatTensor],
                    help='tensor data type ')
# parser.add_argument('--resume', default='', type=str, help='path to latest checkpoint (default: none)')

parser.add_argument('--uid', type=str, default=None, help='unique id for the training')
parser.add_argument('--force', action='store_true', help='force to override the given uid')

# Colab version
parser.add_argument('--start_frame', type=int, default=1, help='first frame number to process')
parser.add_argument('--end_frame', type=int, default=100, help='last frame number to process')
parser.add_argument('--frame_input_dir', type=str, default='/content/DAIN/input_frames',
                    help='frame input directory')
parser.add_argument('--frame_output_dir', type=str, default='/content/DAIN/output_frames',
                    help='frame output directory')

args = parser.parse_args()

if args.uid is None:
    unique_id = str(numpy.random.randint(0, 100000))
    print("revise the unique id to a random numer " + str(unique_id))
    args.uid = unique_id
    timestamp = datetime.datetime.now().strftime("%a-%b-%d-%H-%M")
    save_path = './model_weights/' + args.uid + '-' + timestamp
else:
    save_path = './model_weights/' + str(args.uid)

# print("no pth here : " + save_path + "/best"+".pth")
if not os.path.exists(save_path + "/best" + ".pth"):
    # print("no pth here : " + save_path + "/best" + ".pth")
    os.makedirs(save_path, exist_ok=True)
else:
    if not args.force:
        # Raising a plain string is itself a TypeError; use a real exception.
        raise RuntimeError("please use another uid ")
    else:
        print("override this uid" + args.uid)
        # Back up the previous log/args into the first free ".bkN" slot.
        for m in range(1, 10):
            if not os.path.exists(save_path + "/log.txt.bk" + str(m)):
                shutil.copy(save_path + "/log.txt", save_path + "/log.txt.bk" + str(m))
                shutil.copy(save_path + "/args.txt", save_path + "/args.txt.bk" + str(m))
                break

parser.add_argument('--save_path', default=save_path, help='the output dir of weights')
parser.add_argument('--log', default=save_path + '/log.txt', help='the log file in training')
parser.add_argument('--arg', default=save_path + '/args.txt', help='the args used')

# The second parse builds a fresh namespace; carry the (possibly generated)
# uid over so that it is not reset to None.
parser.set_defaults(uid=args.uid)
args = parser.parse_args()

# A CUDA tensor type cannot be used when CUDA is disabled.
if not args.use_cuda:
    args.dtype = torch.FloatTensor

# Create/truncate the log file.
with open(args.log, 'w') as f:
    pass
with open(args.arg, 'w') as f:
    print(args)
    print(args, file=f)

if args.use_cudnn:
    print("cudnn is used")
    torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune its kernels
else:
    print("cudnn is not used")
    torch.backends.cudnn.benchmark = False
@staticmethod
def backward(ctx, gradoutput):
    """Compute gradients w.r.t. the flow (input1) and depth weight (input2).

    Args:
        ctx: autograd context holding ``input1, input2, count, output``
            saved by ``forward``.
        gradoutput: gradient of the loss w.r.t. the projected flow.

    Returns:
        ``(gradinput1, gradinput2, None)``; the trailing ``None`` is for the
        non-tensor ``requires_grad`` argument of ``forward``.
    """
    input1, input2, count, output = ctx.saved_tensors
    # fillhole = ctx.fillhole

    # The native backward kernel addresses gradoutput with input1's strides,
    # so a non-contiguous gradient from autograd would be read incorrectly.
    gradoutput = gradoutput.contiguous()

    # Allocate on the same device / dtype as the inputs.  The previous
    # torch.cuda.FloatTensor() allocation used the current CUDA device.
    gradinput1 = torch.zeros_like(input1)
    gradinput2 = torch.zeros_like(input2)

    if input1.is_cuda:
        err = my_lib.DepthFlowProjectionLayer_gpu_backward(input1,input2,
                                                           count, output,
                                                           gradoutput, gradinput1,gradinput2)
    else:
        # NOTE(review): the extension's pybind module only registers the
        # *_gpu_* entry points, so this branch presumably fails unless a
        # CPU build of the extension exists -- confirm.
        err = my_lib.DepthFlowProjectionLayer_cpu_backward(input1, input2,
                                                           count, output,
                                                           gradoutput, gradinput1,gradinput2)
    if err != 0:
        print(err)

    return gradinput1,gradinput2,None
class DepthFlowProjectionModule(Module):
    """``nn.Module`` wrapper around ``DepthFlowProjectionLayer``.

    The module has no parameters; it only remembers the ``requires_grad``
    flag and forwards it to the autograd function on every call.  The layer
    enables hole filling when the flag is ``False``.
    """

    def __init__(self, requires_grad = True):
        super().__init__()
        # Passed through unchanged to DepthFlowProjectionLayer.apply.
        self.requires_grad = requires_grad

    def forward(self, input1, input2):
        """Project ``input1`` using ``input2`` through the custom layer.

        Args:
            input1: first tensor handed to the layer (the flow field).
            input2: second tensor handed to the layer (the depth weight).

        Returns:
            The tensor returned by ``DepthFlowProjectionLayer.apply``.
        """
        flag = self.requires_grad
        projected = DepthFlowProjectionLayer.apply(input1, input2, flag)
        return projected
input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int count_b_stride = count.stride(0); int count_c_stride = count.stride(1); int count_h_stride = count.stride(2); int count_w_stride = count.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != output.stride(0)) return error; if(input1_c_stride != output.stride(1)) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); // printf("In gpu forward\n"); error = DepthFlowProjection_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch,fillhole, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1, input2, count, output); if (error) {AT_ERROR("CUDA call failed");} return error; } int DepthFlowProjectionLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ) { int error = 1 ; int channel = input1.size( 1); if(channel!=2) return error; int batch = input1.size(0); if(count.size( 0) != batch) return error; if(count.size(1) != 1) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(1) !=1 ) return error; if(count.size(2) != h) return error;// to add some checkpoint if(count.size(3) != w) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int count_b_stride = count.stride(0); int count_c_stride = count.stride(1); int count_h_stride 
= count.stride(2); int count_w_stride = count.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); // printf("GPU backward: %d,%d,%d,%d\n", count_b_stride,count_c_stride,count_h_stride,count_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = DepthFlowProjection_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1, input2, count, output, gradoutput, gradinput1, gradinput2 ); if (error) {AT_ERROR("CUDA call failed");} //printf("Am I good in backward function %d",error); return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("DepthFlowProjectionLayer_gpu_forward", &DepthFlowProjectionLayer_gpu_forward, "DepthFlowProjection forward (CUDA)"); m.def("DepthFlowProjectionLayer_gpu_backward", &DepthFlowProjectionLayer_gpu_backward, "DepthFlowProjection backward (CUDA)"); } ================================================ FILE: my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cu ================================================ #include #include "depthflowprojection_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void DepthFlowProjection_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int 
channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* count, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ]; float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ]; float x2 = (float) (w_i) + fx; float y2 = (float) (h_i) + fy; if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){ int ix2_L = (int) (x2); int iy2_T = (int) (y2); int ix2_R = min(ix2_L + 1, w - 1); int iy2_B = min(iy2_T + 1, h - 1); float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i]; atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ,- temp * fx); atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ],-temp * fx); atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ,-temp * fx); atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ],-temp * fx); atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * 
// Normalisation pass of the forward projection: every output pixel that
// received at least one contribution (count > 0) is divided by its
// accumulated weight.  Pixels with count <= 0 are left untouched.
//
// Launch layout: blockIdx.z selects the batch item; (blockIdx.x, threadIdx.x)
// and (blockIdx.y, threadIdx.y) select the column and row.  input1, input2,
// nElement and channel are unused here; they are kept so that all kernels of
// this layer share one argument list.
template <typename scalar_t>
__global__ void DepthFlowProjectionAveraging_kernelfunc(
		const int nElement,
		const int w, 		const int h, 		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__    input1,
		const scalar_t* __restrict__    input2,
		scalar_t*  count,
		scalar_t*  output
		)
{
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks, so edge threads may fall outside.
	if(w_i >= w || h_i >= h) return;

	const int batch_i = blockIdx.z;
	// output shares input1's strides; a unit stride along the width is assumed.
	const int off = batch_i * input1_b_stride;
	const int pixel = h_i * input1_h_stride + w_i;

	float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;
	if(temp > 0.0f){
		output[off + 0 * input1_c_stride + pixel ] /= temp;
		output[off + 1 * input1_c_stride + pixel ] /= temp;
	}
	return ;
}
0.0f; while(up_temp == 0.0f && up_offset - 1 >=0){ up_offset = up_offset - 1; up_temp = count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ; } int down_offset = h_i; float down_temp = 0.0f; while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){ down_offset = down_offset + 1; down_temp = count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ; } if(left_temp + right_temp + up_temp + down_temp <=0.0f){ //printf("Can't fill hole, find no neighbor vectors availabel\n"); return; } left_temp = (left_temp > 0.0f)?1:0; right_temp = (right_temp > 0.0f)?1:0; up_temp = (up_temp > 0.0f)?1:0; down_temp = (down_temp > 0.0f)?1:0; output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = ( left_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] + right_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+ up_temp * output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] + down_temp * output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i] )/( left_temp + right_temp + up_temp + down_temp ) ; output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =( left_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] + right_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+ up_temp * output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] + down_temp * output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i] )/( left_temp + right_temp + up_temp + down_temp ) ; } } return ; } template __global__ void DepthFlowProjection_gpu_backward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int 
count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* count, scalar_t* output, const scalar_t* __restrict__ gradoutput, scalar_t* gradinput1, scalar_t* gradinput2 ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); if(withinXbounds && withinYbounds){ float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ; float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ; float x2 = (float) ( w_i ) + fx; float y2 = (float) ( h_i ) + fy; if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){ int ix2_L = (int)(x2); int iy2_T = (int)(y2); int ix2_R = min(ix2_L + 1, w-1); int iy2_B = min(iy2_T + 1, h-1); float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i]; int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i; gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] * temp / count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L] ; gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] * temp / count[batch_i * count_b_stride +0 + iy2_T * count_h_stride + ix2_R] ; gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp / count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ; gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + 
ix2_R] * temp / count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ; int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] * temp / count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] * temp / count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp / count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp / count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] ; int weight_offset = batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i; gradinput2[weight_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] / count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] * (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ); gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] / count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] * (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] ); gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] / count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] * (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ); gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] / count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] * (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] ); gradinput2[weight_offset] += - 
// Checks the status of the most recent kernel launch.  Prints a diagnostic
// naming `kernel_name` and returns non-zero if the launch failed.
static int DepthFlowProjection_launch_failed(const char* kernel_name)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		printf("gpuerror in %s: %s\n", kernel_name, cudaGetErrorString(err));
		return 1;
	}
	return 0;
}

// Launches the forward pass on `stream`:
//   1. scatter the weighted, negated flow and the weights (output / count),
//   2. divide each hit pixel by its accumulated weight,
//   3. if `fillhole` is non-zero, fill pixels that received no contribution.
// `count` and `output` must be zero-initialised by the caller.
// Returns 0 on success and -1 if any launch reports a CUDA error.
int DepthFlowProjection_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int fillhole,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  count,
		at::Tensor&  output
		)
{
	int error = -1;

	dim3 grid;
	dim3 block;

	// One thread per pixel; blockIdx.z walks the batch.
	block  = dim3(BLOCKDIMX,BLOCKDIMY,1);
	grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// Step 1: accumulate projected flow and weights.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_forward", ([&] {
		DepthFlowProjection_gpu_forward_kernelfunc<<<grid, block, 0, stream>>>(
			nElement, //to let the nummous
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
			);
	}));
	if (DepthFlowProjection_launch_failed("DepthFlowProjection_gpu_forward_kernelfunc")) {
		return error;
	}

	// Step 2: normalise by the accumulated weight.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjectionAveraging", ([&] {
		DepthFlowProjectionAveraging_kernelfunc<<<grid, block, 0, stream>>>(
			nElement, //to let the nummous
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
			);
	}));
	if (DepthFlowProjection_launch_failed("DepthFlowProjectionAveraging_kernelfunc")) {
		return error;
	}

	// Step 3 (optional): fill pixels nothing projected onto.
	if(fillhole){
		AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowFillhole", ([&] {
			DepthFlowFillhole_kernelfunc<<<grid, block, 0, stream>>>(
				nElement, //to let the nummous
				w,h,channel,
				input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
				input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
				count_b_stride,count_c_stride,count_h_stride,count_w_stride,

				input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
				);
		}));
		if (DepthFlowProjection_launch_failed("DepthFlowFillhole_kernelfunc")) {
			return error;
		}
	}

	error = 0;
	return error;
}
#!/usr/bin/env python3
"""Build script for the ``depthflowprojection_cuda`` extension."""
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# C++ binding file plus the CUDA kernel implementation.
_sources = [
    'depthflowprojection_cuda.cc',
    'depthflowprojection_cuda_kernel.cu',
]

# Compiler flags come from the shared compiler_args module.
_compile_args = {'cxx': cxx_args, 'nvcc': nvcc_args}

_extension = CUDAExtension(
    'depthflowprojection_cuda',
    _sources,
    extra_compile_args=_compile_args,
)

setup(
    name='depthflowprojection_cuda',
    ext_modules=[_extension],
    cmdclass={'build_ext': BuildExtension},
)
class FilterInterpolationLayer(Function):
    """Autograd bridge to the compiled ``filterinterpolation_cuda`` kernels.

    ``forward(input1, input2, input3)`` passes three contiguous tensors to
    the extension (imported as ``my_lib``) and returns a tensor shaped like
    ``input1``.  The GPU binding treats ``input1`` as
    (batch, channel, h, w), requires ``input2`` to have 2 channels with the
    same batch and spatial size, and derives the filter size as the square
    root of ``input3.size(1)``.

    Use it through ``FilterInterpolationLayer.apply(input1, input2, input3)``.
    """

    def __init__(self):
        super(FilterInterpolationLayer, self).__init__()

    @staticmethod
    def _raise_on_error(err, stage):
        """Raise ``RuntimeError`` when the extension reported a failure.

        The GPU binding returns a non-zero integer status when one of its
        shape/stride checks fails and leaves the result buffers untouched;
        the CPU binding is assumed to follow the same convention.  Ignoring
        the status would silently propagate all-zero results.
        """
        if err:
            raise RuntimeError(
                "FilterInterpolationLayer {} failed with status {}".format(stage, err))

    @staticmethod
    def forward(ctx, input1, input2, input3):
        """Run the forward kernel.

        Args:
            ctx: autograd context used to stash tensors for ``backward``.
            input1: contiguous tensor the output is shaped after.
            input2: contiguous tensor (2 channels per the GPU binding).
            input3: contiguous tensor holding the per-pixel filter.

        Returns:
            A new tensor with the size, dtype and device of ``input1``.

        Raises:
            AssertionError: if any input is not contiguous.
            RuntimeError: if the extension reports a non-zero status.
        """
        assert input1.is_contiguous(), "input1 must be contiguous"
        assert input2.is_contiguous(), "input2 must be contiguous"
        assert input3.is_contiguous(), "input3 must be contiguous"

        # Zero-initialised on both paths and allocated next to input1, so the
        # buffer always matches the input's device and dtype.
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.FilterInterpolationLayer_gpu_forward(
                input1, input2, input3, output)
        else:
            err = my_lib.FilterInterpolationLayer_cpu_forward(
                input1, input2, input3, output)
        FilterInterpolationLayer._raise_on_error(err, "forward")

        ctx.save_for_backward(input1, input2, input3)
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Run the backward kernel.

        Args:
            ctx: autograd context populated by ``forward``.
            gradoutput: gradient with respect to the forward output.

        Returns:
            Gradients for ``input1``, ``input2`` and ``input3``, in that order.

        Raises:
            RuntimeError: if the extension reports a non-zero status.
        """
        input1, input2, input3 = ctx.saved_tensors

        # The kernel addresses gradoutput with input1's strides, so it must
        # have the same dense layout as the (contiguous) input.
        gradoutput = gradoutput.contiguous()

        # Allocate each gradient next to its input instead of unconditionally
        # on the current CUDA device, so the CPU branch gets CPU buffers.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())
        gradinput3 = input3.new_zeros(input3.size())

        if input1.is_cuda:
            err = my_lib.FilterInterpolationLayer_gpu_backward(
                input1, input2, input3, gradoutput,
                gradinput1, gradinput2, gradinput3)
        else:
            err = my_lib.FilterInterpolationLayer_cpu_backward(
                input1, input2, input3, gradoutput,
                gradinput1, gradinput2, gradinput3)
        FilterInterpolationLayer._raise_on_error(err, "backward")

        return gradinput1, gradinput2, gradinput3
class PixelValueLayer(Function):
    """Legacy-style autograd function wrapping ``my_lib.PixelValueLayer_*``.

    ``forward`` takes the ref1 image, the ref1 flow and the ref1 flow weights
    and returns a tensor with the size of the image.
    """

    def __init__(self, sigma_d = 3, tao_r = 0.05, Prowindow = 2 ):
        super(PixelValueLayer, self).__init__()
        self.sigma_d = sigma_d
        self.tao_r = tao_r  # maybe not usable
        self.Prowindow = Prowindow

    def forward(self, input1, input3, flow_weights):
        """Call the forward kernel and return its output buffer."""
        # Contiguous copies are cached on the instance for the backward pass.
        self.input1 = input1.contiguous()              # ref1 image
        self.input3 = input3.contiguous()              # ref1 flow
        self.flow_weights = flow_weights.contiguous()  # ref1 flow weights

        self.device = torch.cuda.current_device() if input1.is_cuda else -1

        output = torch.zeros(input1.size())
        if input1.is_cuda:
            output = output.cuda()
            kernel = my_lib.PixelValueLayer_gpu_forward
        else:
            kernel = my_lib.PixelValueLayer_cpu_forward

        # Note: the kernel receives the caller's tensors, not the cached copies.
        status = kernel(input1, input3, flow_weights, output,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return output

    def backward(self, gradoutput):
        """Call the backward kernel; returns grads for image, flow, weights."""
        saved = (self.input1, self.input3, self.flow_weights)
        grads = [torch.zeros(t.size()) for t in saved]

        if self.input1.is_cuda:
            # Move the buffers to the device recorded during forward.
            grads = [g.cuda(self.device) for g in grads]
            kernel = my_lib.PixelValueLayer_gpu_backward
        else:
            kernel = my_lib.PixelValueLayer_cpu_backward

        gradinput1, gradinput3, gradflow_weights = grads
        status = kernel(self.input1, self.input3, self.flow_weights, gradoutput,
                        gradinput1, gradinput3, gradflow_weights,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return gradinput1, gradinput3, gradflow_weights
def forward(self, input3, flow_weights):
    """Call the PixelWeightLayer forward kernel.

    Takes the ref1 flow and the ref1 flow weights and returns a
    single-channel tensor with the flow's batch and spatial size.  The
    output is also cached on the instance for the backward pass.
    """
    self.input3 = input3.contiguous()              # ref1 flow
    self.flow_weights = flow_weights.contiguous()  # ref1 flow weights

    self.device = torch.cuda.current_device() if input3.is_cuda else -1

    out_shape = [input3.size(0), 1, input3.size(2), input3.size(3)]
    output = torch.zeros(out_shape)
    if input3.is_cuda:
        output = output.cuda()
        kernel = my_lib.PixelWeightLayer_gpu_forward
    else:
        kernel = my_lib.PixelWeightLayer_cpu_forward

    status = kernel(input3, flow_weights, output,
                    self.sigma_d, self.tao_r, self.Prowindow)
    if status != 0:
        print(status)

    self.output = output
    return output
def backward(self, gradoutput):
    """Call the PixelWeightLayer backward kernel.

    Returns the gradients for the flow and for the flow weights, in that
    order, using the tensors cached on the instance by ``forward``.
    """
    grads = [torch.zeros(t.size()) for t in (self.input3, self.flow_weights)]

    if self.input3.is_cuda:
        # Move the buffers to the device recorded during forward.
        grads = [g.cuda(self.device) for g in grads]
        kernel = my_lib.PixelWeightLayer_gpu_backward
    else:
        kernel = my_lib.PixelWeightLayer_cpu_backward

    gradinput3, gradflow_weights = grads
    status = kernel(self.input3, self.flow_weights, self.output, gradoutput,
                    gradinput3, gradflow_weights,
                    self.threshhold,
                    self.sigma_d, self.tao_r, self.Prowindow)
    if status != 0:
        print(status)

    return gradinput3, gradflow_weights
class ReliableWeightLayer(Function):
    """Legacy-style autograd function wrapping ``my_lib.ReliableWeightLayer_*``.

    ``forward`` takes the ref1 flow and returns a single-channel tensor with
    the flow's batch and spatial size.
    """

    def __init__(self, threshhold, sigma_d =3, tao_r =0.05, Prowindow = 2 ):
        super(ReliableWeightLayer, self).__init__()
        self.threshhold = threshhold
        self.sigma_d = sigma_d
        self.tao_r = tao_r  # maybe not usable
        self.Prowindow = Prowindow

    def forward(self, input3):
        """Call the forward kernel and return its output buffer."""
        self.input3 = input3.contiguous()  # ref1 flow

        self.device = torch.cuda.current_device() if input3.is_cuda else -1

        out_shape = [input3.size(0), 1, input3.size(2), input3.size(3)]
        output = torch.zeros(out_shape)
        if input3.is_cuda:
            output = output.cuda()
            kernel = my_lib.ReliableWeightLayer_gpu_forward
        else:
            kernel = my_lib.ReliableWeightLayer_cpu_forward

        status = kernel(input3, output,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        # Kept for backward, where it is used for inhibiting some unreliable
        # gradients.
        self.output = output
        return output

    def backward(self, gradoutput):
        """Call the backward kernel and return the gradient for the flow."""
        gradinput3 = torch.zeros(self.input3.size())

        if self.input3.is_cuda:
            # Move the buffer to the device recorded during forward.
            gradinput3 = gradinput3.cuda(self.device)
            kernel = my_lib.ReliableWeightLayer_gpu_backward
        else:
            kernel = my_lib.ReliableWeightLayer_cpu_backward

        status = kernel(self.input3, self.output, gradoutput,
                        gradinput3,
                        self.threshhold,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return gradinput3
class AdaptiveWeightInterpolationModule(Module):
    """Blend two flow-warped reference frames with adaptive weights.

    Each reference frame is processed by its own set of layers
    (``WeightLayer`` -> ``PixelValueLayer`` / ``PixelWeightLayer`` ->
    ``ReliableWeightLayer``); the two normalised results are then averaged
    with their reliability weights.

    NOTE(review): the layers are legacy-style ``Function`` instances that are
    called directly; confirm the targeted PyTorch version still allows this.
    """

    def __init__(self, training = False, threshhold = 1e-6,
                 lambda_e = 30.0/255.0, lambda_v = 1.0, Nw = 3.0,
                 sigma_d =1.5, tao_r = 0.05, Prowindow = 2 ):
        """Create the per-frame layers.

        Args:
            training: initial value of ``self.training``; hole masking in
                ``forward`` is only applied when it is false.
            threshhold: small constant added to every denominator; the
                weight layers receive ``101 * threshhold``.
            lambda_e, lambda_v, Nw: forwarded to ``WeightLayer``.
            sigma_d, tao_r, Prowindow: forwarded to the pixel and
                reliability layers.
        """
        super(AdaptiveWeightInterpolationModule, self).__init__()

        # Layers for reference frame 1.
        self.calc_weight1 = WeightLayer(lambda_e, lambda_v, Nw )
        self.padder1 = torch.nn.ReplicationPad2d([0, 1 , 0, 1])
        self.interpolate1 = PixelValueLayer(sigma_d, tao_r , Prowindow)
        self.interpolate1_1 = PixelWeightLayer(101* threshhold, sigma_d,tao_r, Prowindow)
        self.interpolate_R1_1 = ReliableWeightLayer(101* threshhold, sigma_d,tao_r, Prowindow)

        # Layers for reference frame 2.
        self.calc_weight2 = WeightLayer(lambda_e, lambda_v,Nw)
        self.padder2 = torch.nn.ReplicationPad2d([0, 1 , 0, 1])
        self.interpolate2 = PixelValueLayer(sigma_d, tao_r , Prowindow )
        self.interpolate2_1 = PixelWeightLayer(101*threshhold,sigma_d,tao_r, Prowindow)
        self.interpolate_R2_1 = ReliableWeightLayer(101*threshhold, sigma_d,tao_r, Prowindow)

        # Overrides the default set by Module.__init__.
        self.training = training
        self.threshold = threshhold

    def _warp_reference(self, calc_weight, pixel_value, pixel_weight,
                        reliable_weight, image, other_image, flow):
        """Return the normalised warped image and its blending weight for one frame."""
        flow_weight = calc_weight(image, other_image, flow)

        # Weighted pixel sum divided by the accumulated pixel weight; the
        # single-channel weight broadcasts over the image channels.
        value = pixel_value(image, flow, flow_weight)
        weight = pixel_weight(flow, flow_weight)
        warped = value / (weight + self.threshold)

        # Blending weight: pixel weight normalised by the reliability weight.
        reliability = reliable_weight(flow)
        blend = weight / (reliability + self.threshold)
        return warped, blend

    def forward(self, input1, input2, input3, input4):
        """Blend the two warped reference frames.

        Args:
            input1: ref1 image.
            input2: ref2 image.
            input3: ref1 flow.
            input4: ref2 flow.

        Returns:
            The weighted average of both warped frames.  When
            ``self.training`` is false, pixels whose total weight is at most
            ``10 * threshold`` are set to zero.
        """
        i1, w1 = self._warp_reference(
            self.calc_weight1, self.interpolate1, self.interpolate1_1,
            self.interpolate_R1_1, input1, input2, input3)
        i2, w2 = self._warp_reference(
            self.calc_weight2, self.interpolate2, self.interpolate2_1,
            self.interpolate_R2_1, input2, input1, input4)

        w = w1 + w2
        i = (i1 * w1 + i2 * w2) / (w + self.threshold)

        if not self.training:
            # Compute the hole mask once; it broadcasts over the channels.
            holes = w <= 10 * self.threshold
            i = i.masked_fill(holes, 0)
        return i
checkpoint if(input2.size(3) != w) return error; int filter_size2 = input3.size( 1); int filter_size = (int) sqrt((float) filter_size2); // printf("filter size is: %d,or %f", filter_size, sqrt((float)filter_size2)); int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int input3_b_stride = input3.stride(0); int input3_c_stride = input3.stride(1); int input3_h_stride = input3.stride(2); int input3_w_stride = input3.stride(3); // printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); //TODO: do we need to assert the w_stride to be 1 if(input1_w_stride !=1) return error; if(input2_w_stride !=1) return error; if(input3_w_stride !=1) return error; if(input1_b_stride != output.stride(0)) return error; if(input1_c_stride != output.stride(1)) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); error = FilterInterpolationLayer_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch, filter_size, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, input1, input2, input3, output); if (error) {AT_ERROR("CUDA call failed");} return error; } int FilterInterpolationLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ) { int error = 1 ; int channel = input1.size( 1); //if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; 
if(input2.size(1) != 2) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h) return error;// to add some checkpoint if(input2.size(3) != w) return error; int filter_size2 = input3.size( 1); int filter_size = (int) sqrt((float) filter_size2); // printf("filter size is: %d,or %f", filter_size, sqrt((float)filter_size2)); int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int input3_b_stride = input3.stride(0); int input3_c_stride = input3.stride(1); int input3_h_stride = input3.stride(2); int input3_w_stride = input3.stride(3); // printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); //TODO: do we need to assert the w_stride to be 1 if(input1_w_stride !=1) return error; if(input2_w_stride !=1) return error; if(input3_w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input2_b_stride != gradinput2.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; if(input2_c_stride != gradinput2.stride(1)) return error; if(input3_c_stride != gradinput3.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = FilterInterpolationLayer_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, filter_size, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, 
input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, input1, input2, input3, gradoutput, gradinput1, gradinput2, gradinput3 ); if (error) {AT_ERROR("CUDA call failed");} return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("FilterInterpolationLayer_gpu_forward", &FilterInterpolationLayer_gpu_forward, "FilterInterpolation forward (CUDA)"); m.def("FilterInterpolationLayer_gpu_backward", &FilterInterpolationLayer_gpu_backward, "FilterInterpolation backward (CUDA)"); } ================================================ FILE: my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cu ================================================ #include #include "filterinterpolation_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void FilterInterpolationLayer_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool 
withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ]; float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i ]; float x2 = (float)(w_i) + fx; float y2 = (float)(h_i) + fy; if(x2 >= 0.0f && y2 >=0.0f && x2 <= (float)(w -1) && y2 <= (float)(h-1) && fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){ int ix2_L = int(x2) + 1 - (int)(filter_size / 2); int iy2_T = int(y2) + 1 - (int)(filter_size / 2); int ix2_R = ix2_L + filter_size; int iy2_B = iy2_T + filter_size; float alpha = x2 - (int)(x2); float beta = y2 - (int)(y2); //TODO: here is a bug that if the iy2_B or ix2_R gets out of the border, than there is no enough pixels to warp the target one. for (int c_i = 0 ; c_i < channel ; c_i++){ float TL = 0.0f; for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){ int _filter_j = min(max(0, filter_j), h - 1); for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){ int _filter_i = min(max(0, filter_i ), w - 1); TL += input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ; } } float TR = 0.0f; for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float BL = 0.0f; for 
(int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float BR = 0.0f; for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] = (1-alpha)*(1-beta)*TL + alpha*(1-beta)*TR + (1-alpha)*beta*BL + alpha*beta*BR; // for( int filter_i = ix2_L; filter_i < ix2_R ; filter_i ++ ){ // int _filter_i = min(max(0, filter_i),w - 1); // output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] += // input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] * // input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] * //// exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) / (float)(filter_size)); // the distance weight // exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) ); // the distance weight // //// if(w_i == 141 && h_i == 316 && c_i == 0 ){ ////printf("gpu: %f, %f,%f,%f\n",input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] , ////input3 [batch_i * 
input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i], ////exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) / (float)(filter_size)), ////output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] //// ); ////} // // } // } } } else{ //the warping data is out of range, we fill it with zeros for(int c_i = 0 ; c_i < channel; c_i ++){ output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = input1[off + c_i* input1_c_stride+ h_i * input1_h_stride + w_i]; } } } return ; } template __global__ void FilterInterpolationLayer_gpu_backward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3, scalar_t* gradoutput, scalar_t* gradinput1, scalar_t* gradinput2, scalar_t* gradinput3 ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); if(withinXbounds && withinYbounds){ float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i]; float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + 
w_i]; float x2 = float(w_i) + fx; float y2 = float(h_i) + fy; if(x2 >= 0.0f && y2 >= 0.0f && x2 <= (float)(w - 1) && y2 <= (float)(h -1) && fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){ int ix2_L = int(x2) + 1 - (int) (filter_size/2); int iy2_T = int(y2) + 1 - (int) (filter_size/2); int ix2_R = ix2_L + filter_size; int iy2_B = iy2_T + filter_size; float alpha = x2 - (int)(x2); float beta = y2 - (int)(y2); /*** Step 1: calculate the gradients for input1, i.e. the input image; ***/ /*** STEP 3: calculate the gradients for input3, i.e. the filter ***/ /*** Step 1 and Step 3 are simultaneously computed ***/ for (int c_i = 0 ; c_i < channel; c_i++){ float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i]; float TL_grad = gradoutput_value * (1-alpha ) * (1-beta); for(int filter_j = iy2_T; filter_j <= (int) (y2) ; filter_j ++ ){ int _filter_j = min(max(0, filter_j), h - 1); for (int filter_i = ix2_L ; filter_i <= (int)(x2) ; filter_i ++){ int _filter_i = min(max(0, filter_i), w - 1); atomicAdd( &gradinput1[off +c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ], TL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]); atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i], TL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]); } } float TR_grad= gradoutput_value * alpha * ( 1- beta); for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ], TR_grad * input3[batch_i * 
input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]); atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i], TR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]); } } float BL_grad = gradoutput_value * ( 1 - alpha ) * beta; for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ], BL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]); atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i], BL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]); } } float BR_grad = gradoutput_value * alpha * beta; for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ], BR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]); atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i], BR_grad * input1[off + c_i * input1_c_stride + _filter_j * 
input1_h_stride + _filter_i]); } } // for ( int filter_j = iy2_T; filter_j < iy2_B ; filter_j ++ ){ // int _filter_j = min(max(0, filter_j), h - 1); // for( int filter_i = ix2_L; filter_i< ix2_R ; filter_i++){ // int _filter_i = min(max(0,filter_i), w - 1); // atomicAdd( & gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i], // gradoutput_value * // input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L))* input3_c_stride + h_i * input3_h_stride + w_i] * //// exp( -(fabs((float)filter_j - y2) + fabs((float)filter_i - x2))/(float)filter_size) // exp( -(fabs((float)filter_j - y2) + fabs((float)filter_i - x2))) // // ); // } // } } /*** Step 2: calculate the gradients for input2, i.e., the optical flow, STEP 2.1: for the x/horizonotal direction. ***/ float gamma = 1.0f - beta; //iy2_B - y2; float bot_diff = 0.0f; for(int c_i =0 ; c_i< channel; c_i ++ ){ float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i]; float TL = 0.0f; for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){ int _filter_j = min(max(0, filter_j), h - 1); for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){ int _filter_i = min(max(0, filter_i ), w - 1); TL += input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ; } } float TR = 0.0f; for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * 
input3_h_stride + w_i]; } } float BL = 0.0f; for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float BR = 0.0f; for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float temp = 0.0f; temp += gamma * (TR - TL); temp += (1-gamma) * (BR - BL); bot_diff += gradoutput_value * temp; // for( int filter_j = iy2_T; filter_j< iy2_B; filter_j++){ // int _filter_j = min(max(0, filter_j) , h - 1); // for( int filter_i = ix2_L; filter_i< ix2_R; filter_i ++){ // int _filter_i = min(max(0,filter_i), w-1); // // bot_diff += // gradoutput_value * // input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * // input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L))* input3_c_stride + h_i * input3_h_stride + w_i ] * //// exp( - ( fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))/ (float)filter_size) * //// ((float) filter_i > x2 ? 1.0f : -1.0f) / (float)filter_size; // exp( - ( fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))) * // ((float) filter_i > x2 ? 
1.0f : -1.0f); // } // } } //the gradients of the x direction/ horizontal direction gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff; /*** STEP 2.2: for the x/horizonotal direction. ***/ gamma = 1.0f - alpha; //ix2_R -x2; bot_diff = 0.0f; for(int c_i = 0 ; c_i < channel; c_i ++ ){ float gradoutput_value = gradoutput [ off + c_i * input1_c_stride + h_i * input1_h_stride +w_i]; float TL = 0.0f; for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){ int _filter_j = min(max(0, filter_j), h - 1); for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){ int _filter_i = min(max(0, filter_i ), w - 1); TL += input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ; } } float TR = 0.0f; for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float BL = 0.0f; for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float BR = 0.0f; for (int filter_j = (int) (y2) 
+ 1; filter_j < iy2_B; filter_j ++ ){ int _filter_j = min(max(0, filter_j),h - 1); // only used for input1 for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){ int _filter_i = min(max(0, filter_i),w - 1);// only used for input1 BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i]; } } float temp = 0.0f; temp += gamma * (BL - TL); temp += (1.0f - gamma) * ( BR - TR); bot_diff += gradoutput_value * temp; // for( int filter_j = iy2_T; filter_j < iy2_B; filter_j ++ ){ // int _filter_j = min(max(0, filter_j), h - 1); // for( int filter_i = ix2_L; filter_i < ix2_R; filter_i ++){ // int _filter_i = min(max(0, filter_i), w - 1); // // bot_diff += // gradoutput_value * // input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] * // input3 [batch_i * input3_b_stride +((filter_j - iy2_T) * filter_size + ( filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i ] * //// exp( - (fabs((float) filter_j - y2) + fabs((float) filter_i - x2))/ (float)filter_size ) * //// ((float) filter_j > y2 ? 1.0f : - 1.0f ) / (float)filter_size; // exp( - (fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) ) * // ((float) filter_j > y2 ? 1.0f : - 1.0f ); // } // } } gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff; /*** STEP 3: calculate the gradients for input3, i.e. 
the filter ***/ // for(int c_i = 0 ; c_i >>( nElement, //to let the nummous w,h,channel,filter_size, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, input1.data(),input2.data(),input3.data(), output.data() ); })); // THCudaCheck(cudaGetLastError()); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; } int FilterInterpolationLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ) { int error = 1 ; dim3 grid; dim3 block; //blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); // cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float)); // cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float)); // cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float)); AT_DISPATCH_FLOATING_TYPES(input1.type(), 
"DepthFlowProjection_gpu_backward", ([&] { FilterInterpolationLayer_gpu_backward_kernelfunc <<>>( nElement, //to let the nummous w,h,channel,filter_size, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, input1.data(), input2.data(), input3.data(), gradoutput.data(), gradinput1.data(), gradinput2.data(), gradinput3.data() ); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; } ================================================ FILE: my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cuh ================================================ #pragma once #include #include #include int FilterInterpolationLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& output ); int FilterInterpolationLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, at::Tensor& 
input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ); ================================================ FILE: my_package/FilterInterpolation/setup.py ================================================ #!/usr/bin/env python3 import os import torch from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension, CUDAExtension from compiler_args import nvcc_args, cxx_args setup( name='filterinterpolation_cuda', ext_modules=[ CUDAExtension('filterinterpolation_cuda', [ 'filterinterpolation_cuda.cc', 'filterinterpolation_cuda_kernel.cu' ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) ], cmdclass={ 'build_ext': BuildExtension }) ================================================ FILE: my_package/FlowProjection/FlowProjectionLayer.py ================================================ # this is for wrapping the customized layer import torch from torch.autograd import Function import flowprojection_cuda as my_lib #Please check how the STN FUNCTION is written : #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py class FlowProjectionLayer(Function): def __init__(self,requires_grad): super(FlowProjectionLayer,self).__init__() self.requires_grad = requires_grad @staticmethod def forward(ctx, input1, requires_grad): assert(input1.is_contiguous()) # assert(input2.is_contiguous()) # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it fillhole = 1 if requires_grad == False else 0 # if input1.is_cuda: # self.device = torch.cuda.current_device() # else: # self.device = -1 # count = torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections # output = torch.zeros(input1.size()) if input1.is_cuda : # output = output.cuda() # count = count.cuda() count = 
torch.cuda.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_() output = torch.cuda.FloatTensor().resize_(input1.size()).zero_() err = my_lib.FlowProjectionLayer_gpu_forward(input1, count,output, fillhole) else: output = torch.cuda.FloatTensor(input1.data.size()) err = my_lib.FlowProjectionLayer_cpu_forward(input1, count, output, fillhole) if err != 0: print(err) # output = output/count # to divide the counter ctx.save_for_backward(input1, count) ctx.fillhole = fillhole # self.count = count #to keep this # print(self.input1[0, 0, :10, :10]) # print(self.count[0, 0, :10, :10]) # print(self.input1[0, 0, -10:, -10:]) # print(self.count[0, 0, -10:, -10:]) # the function returns the output to its caller return output @staticmethod def backward(ctx, gradoutput): # print("Backward of Filter Interpolation Layer") # gradinput1 = input1.new().zero_() # gradinput2 = input2.new().zero_() # gradinput1 = torch.zeros(self.input1.size()) input1, count, output = ctx.saved_tensors if input1.is_cuda: # print("CUDA backward") # gradinput1 = gradinput1.cuda(self.device) gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_() err = my_lib.FlowProjectionLayer_gpu_backward(input1, count, gradoutput, gradinput1) # print(err) if err != 0 : print(err) else: # print("CPU backward") # print(gradoutput) gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_() err = my_lib.FlowProjectionLayer_cpu_backward(input1, count, gradoutput, gradinput1) # print(err) if err != 0: print(err) # print(gradinput1) # print(gradinput2) # print(gradinput1) return gradinput1, None class FlowFillholelayer(Function): def __init__(self): super(FlowFillholelayer,self).__init__() def forward(self, input1): # assert(input1.is_contiguous()) # assert(input2.is_contiguous()) self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it if input1.is_cuda: self.device = torch.cuda.current_device() else: self.device = -1 # count = 
torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections output = torch.zeros(input1.size()) if input1.is_cuda : output = output.cuda() # count = count.cuda() err = my_lib.FlowFillholelayer_gpu_forward(input1, output) else: # output = torch.cuda.FloatTensor(input1.data.size()) err = my_lib.FlowFillholelayer_cpu_forward(input1, output) if err != 0: print(err) # output = output/count # to divide the counter # self.count = count #to keep this # print(self.input1[0, 0, :10, :10]) # print(self.count[0, 0, :10, :10]) # print(self.input1[0, 0, -10:, -10:]) # print(self.count[0, 0, -10:, -10:]) # the function returns the output to its caller return output #TODO: if there are multiple outputs of this function, then the order should be well considered? # def backward(self, gradoutput): # # print("Backward of Filter Interpolation Layer") # # gradinput1 = input1.new().zero_() # # gradinput2 = input2.new().zero_() # gradinput1 = torch.zeros(self.input1.size()) # if self.input1.is_cuda: # # print("CUDA backward") # gradinput1 = gradinput1.cuda(self.device) # err = my_lib.FlowProjectionLayer_gpu_backward(self.input1, self.count, gradoutput, gradinput1) # # print(err) # if err != 0 : # print(err) # # else: # # print("CPU backward") # # print(gradoutput) # err = my_lib.FlowProjectionLayer_cpu_backward(self.input1, self.count, gradoutput, gradinput1) # # print(err) # if err != 0: # print(err) # # print(gradinput1) # # print(gradinput2) # # # print(gradinput1) # # return gradinput1 ================================================ FILE: my_package/FlowProjection/FlowProjectionModule.py ================================================ # modules/FlowProjectionModule.py from torch.nn import Module from .FlowProjectionLayer import FlowProjectionLayer #, FlowFillholeLayer class FlowProjectionModule(Module): def __init__(self, requires_grad = True): super(FlowProjectionModule, self).__init__() self.f = FlowProjectionLayer(requires_grad) def 
forward(self, input1): return self.f(input1) # class FlowFillholeModule(Module): # def __init__(self,hole_value = -10000.0): # super(FlowFillholeModule, self).__init__() # self.f = FlowFillholeLayer() # # def forward(self, input1): # return self.f(input1) #we actually dont need to write the backward code for a module, since we have ================================================ FILE: my_package/FlowProjection/__init__.py ================================================ from .FlowProjectionModule import * ================================================ FILE: my_package/FlowProjection/flowprojection_cuda.cc ================================================ #include #include #include #include #include //works for 1.0.0 #include "flowprojection_cuda_kernel.cuh" int FlowProjectionLayer_gpu_forward( at::Tensor& input1, at::Tensor& count, at::Tensor& output, int fillhole ) { int error = 1 ; int channel = input1.size( 1); if(channel!= 2) return error; int batch = input1.size(0); int h = input1.size(2); int w = input1.size(3); int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int count_b_stride = count.stride(0); int count_c_stride = count.stride(1); int count_h_stride = count.stride(2); int count_w_stride = count.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != output.stride(0)) return error; if(input1_c_stride != output.stride(1)) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); // printf("In gpu forward\n"); error = FlowProjection_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch,fillhole, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1, count, output); if (error) 
{AT_ERROR("CUDA call failed");} return error; } int FlowProjectionLayer_gpu_backward( at::Tensor& input1, at::Tensor& count, at::Tensor& gradoutput, at::Tensor& gradinput1 ) { int error = 1 ; int channel = input1.size( 1); if(channel!=2) return error; int batch = input1.size(0); if(count.size(0) != batch) return error; if(count.size(1) != 1) return error; int h = input1.size(2); int w = input1.size(3); if(count.size(2) != h) return error;// to add some checkpoint if(count.size(3) != w) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int count_b_stride = count.stride(0); int count_c_stride = count.stride(1); int count_h_stride = count.stride(2); int count_w_stride = count.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); // printf("GPU backward: %d,%d,%d,%d\n", count_b_stride,count_c_stride,count_h_stride,count_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = FlowProjection_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1, count, gradoutput, gradinput1 ); if (error) {AT_ERROR("CUDA call failed");} return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("FlowProjectionLayer_gpu_forward", &FlowProjectionLayer_gpu_forward, "FlowProjection forward (CUDA)"); m.def("FlowProjectionLayer_gpu_backward", &FlowProjectionLayer_gpu_backward, "FlowProjection backward (CUDA)"); } 
================================================ FILE: my_package/FlowProjection/flowprojection_cuda_kernel.cu ================================================ #include #include "flowprojection_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void FlowProjection_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, scalar_t* count, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ]; float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ]; float x2 = (float) (w_i) + fx; float y2 = (float) (h_i) + fy; if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){ int ix2_L = (int) (x2); int iy2_T = (int) (y2); int ix2_R = min(ix2_L + 1, w - 1); int iy2_B = min(iy2_T + 1, h - 1); atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ,-fx); atomicAdd(&output[off + 0 
* input1_c_stride + iy2_T * input1_h_stride + ix2_R ],-fx); atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ,-fx); atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ],-fx); atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] , -fy); atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] , -fy); atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] , -fy); atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] , -fy); atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L], 1); atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] , 1); atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] , 1); atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] , 1); } } return ; } template __global__ void FlowProjectionAveraging_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, scalar_t* count, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float temp 
=count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ; if(temp > 0.0f){ output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp; output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp; } } return ; } template __global__ void FlowFillhole_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, scalar_t* count, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ; if(temp <= 0.0f){ //search along the four directions,0/90/180/270, until finding at least one int left_offset = w_i; float left_temp = 0.0f; while(left_temp == 0.0f && left_offset - 1 >= 0){ left_offset = left_offset - 1; left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ; } int right_offset = w_i ; float right_temp = 0.0f; while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){ right_offset = right_offset + 1 ; right_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ; } int up_offset = h_i ; float up_temp = 0.0f; while(up_temp == 0.0f && up_offset - 1 >=0){ up_offset = 
up_offset - 1; up_temp = count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ; } int down_offset = h_i; float down_temp = 0.0f; while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){ down_offset = down_offset + 1; down_temp = count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ; } if(left_temp + right_temp + up_temp + down_temp <=0.0f){ //printf("Can't fill hole, find no neighbor vectors availabel\n"); return; } left_temp = (left_temp > 0.0f)?1:0; right_temp = (right_temp > 0.0f)?1:0; up_temp = (up_temp > 0.0f)?1:0; down_temp = (down_temp > 0.0f)?1:0; output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = ( left_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] + right_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+ up_temp * output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] + down_temp * output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i] )/( left_temp + right_temp + up_temp + down_temp ) ; output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =( left_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] + right_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+ up_temp * output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] + down_temp * output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i] )/( left_temp + right_temp + up_temp + down_temp ) ; } } return ; } template __global__ void FlowProjection_gpu_backward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ count, const scalar_t* __restrict__ gradoutput, 
scalar_t* gradinput1 ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); if(withinXbounds && withinYbounds){ float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ; float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ; float x2 = (float) ( w_i ) + fx; float y2 = (float) ( h_i ) + fy; if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){ int ix2_L = (int)(x2); int iy2_T = (int)(y2); int ix2_R = min(ix2_L + 1, w-1); int iy2_B = min(iy2_T + 1, h-1); int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i; gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/ count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L] ; gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]/ count[batch_i * count_b_stride +0 + iy2_T * count_h_stride + ix2_R] ; gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/ count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ; gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/ count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ; int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/ count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] ; gradinput1[iv_offset] += 
- gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]/ count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/ count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/ count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] ; } } return ; }
// Host-side launcher for the flow-projection forward pass.
//
// Launches, in order:
//   1. FlowProjection_gpu_forward_kernelfunc
//   2. FlowProjectionAveraging_kernelfunc
//   3. FlowFillhole_kernelfunc            (only when `fillhole` is non-zero)
// and checks cudaGetLastError() after each launch.
//
// Every launch uses one thread per pixel: blocks of BLOCKDIMX x BLOCKDIMY
// threads, a grid rounded up to cover w x h, and one grid layer per batch item.
//
// Returns 0 when all launches were issued without a reported error, 1 otherwise
// (the CUDA error string is printed before returning).
//
// NOTE(review): the kernel launches below read `<<>>` and `.data()` with no
// launch configuration or template argument, and `grid`, `block` and `stream`
// are otherwise unused -- presumably `<scalar_t><<<grid, block, 0, stream>>>`
// and `.data<scalar_t>()` were stripped when this text was extracted; confirm
// against the original source.
int FlowProjection_gpu_forward_kernel(
        cudaStream_t stream,
        const int nElement,
        const int w,
        const int h,
        const int channel,
        const int batch,
        const int fillhole,
        const int input1_b_stride,
        const int input1_c_stride,
        const int input1_h_stride,
        const int input1_w_stride,
        const int count_b_stride,
        const int count_c_stride,
        const int count_h_stride,
        const int count_w_stride,
        at::Tensor& input1,
        at::Tensor& count,
        at::Tensor& output
        )
{
    // Pessimistic default: only set to 0 once every launch has succeeded.
    int error = 1 ;

    dim3 grid;
    dim3 block;

    // threadIdx.x is scheduled first, then threadIdx.y, then threadIdx.z;
    // all channels of a pixel are processed by the same thread.
    block = dim3(BLOCKDIMX,BLOCKDIMY,1);
    grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
    // Report when the block shape was overridden at compile time (or DEBUG is on).
    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
        printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

    // Step 1: dispatch on the floating-point type of input1 and launch the
    // projection kernel on the raw tensor data.
    AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjection_gpu_forward_kernelfunc", ([&] {
    FlowProjection_gpu_forward_kernelfunc<<>>(
            nElement, //to let the nummous
            w,h,channel,
            input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,
            input1.data(),count.data(),output.data()
            );
    }));

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        // NOTE(review): the "BilinearSampler.updateOutput" label does not name
        // this function; it looks inherited from another code base.
        printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
        //THError("aborting");
        return error;
    }

    // Step 2: averaging kernel over the same grid (it divides `output` by
    // `count` where count > 0).
    AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjectionAveraging_kernelfunc", ([&] {
    FlowProjectionAveraging_kernelfunc<<>>(
            nElement, //to let the nummous
            w,h,channel,
            input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,
            input1.data(),count.data(),output.data()
            );
    }));
    // THCudaCheck(cudaGetLastError());
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
        //THError("aborting");
        return error;
    }

    // Step 3 (optional): fill pixels whose count is still <= 0 from their
    // row/column neighbours.
    if(fillhole){
        AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowFillhole_kernelfunc", ([&] {
        FlowFillhole_kernelfunc<<>>(
                nElement, //to let the nummous
                w,h,channel,
                input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
                count_b_stride,count_c_stride,count_h_stride,count_w_stride,
                input1.data(),count.data(),output.data()
                );
        }));

        err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
            return error;
        }
    }

    error = 0;
    return error;
}
int FlowProjection_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, 
const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& count, at::Tensor& gradoutput, at::Tensor& gradinput1 ) { int error = 1 ; dim3 grid; dim3 block; //blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjection_gpu_backward_kernelfunc", ([&] { FlowProjection_gpu_backward_kernelfunc <<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1.data(), count.data(), gradoutput.data(), gradinput1.data() ); })); // printf("gpu I am there\n"); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } // printf("gpu I am here\n"); error = 0; return error; } ================================================ FILE: my_package/FlowProjection/flowprojection_cuda_kernel.cuh ================================================ #pragma once #include #include #include int FlowProjection_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int fillhole, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& count, at::Tensor& output ); int FlowProjection_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, 
const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& count, at::Tensor& gradoutput, at::Tensor& gradinput1 ); ================================================ FILE: my_package/FlowProjection/setup.py ================================================ #!/usr/bin/env python3 import os import torch from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension, CUDAExtension from compiler_args import nvcc_args, cxx_args setup( name='flowprojection_cuda', ext_modules=[ CUDAExtension('flowprojection_cuda', [ 'flowprojection_cuda.cc', 'flowprojection_cuda_kernel.cu' ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) ], cmdclass={ 'build_ext': BuildExtension }) ================================================ FILE: my_package/Interpolation/InterpolationLayer.py ================================================ # this is for wrapping the customized layer import torch from torch.autograd import Function import interpolation_cuda as my_lib #Please check how the STN FUNCTION is written : #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py class InterpolationLayer(Function): def __init__(self): super(InterpolationLayer,self).__init__() @staticmethod def forward(ctx, input1,input2): assert(input1.is_contiguous()) assert(input2.is_contiguous()) # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it # self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy? 
# if input1.is_cuda: # self.device = torch.cuda.current_device() # else: # self.device = -1 # output = torch.zeros(input1.size()) if input1.is_cuda : # output = output.cuda() output = torch.cuda.FloatTensor().resize_(input1.size()).zero_() my_lib.InterpolationLayer_gpu_forward(input1, input2, output) else: output = torch.cuda.FloatTensor(input1.data.size()) my_lib.InterpolationLayer_cpu_forward(input1, input2, output) ctx.save_for_backward(input1, input2) # the function returns the output to its caller return output @staticmethod def backward(ctx, gradoutput): # print("Backward of Interpolation Layer") # gradinput1 = input1.new().zero_() # gradinput2 = input2.new().zero_() # gradinput1 = torch.zeros(self.input1.size()) # gradinput2 = torch.zeros(self.input2.size()) input1, input2 = ctx.saved_tensors if input1.is_cuda: # print("CUDA backward") # gradinput1 = gradinput1.cuda(self.device) # gradinput2 = gradinput2.cuda(self.device) gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_() gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_() # the input1 image should not require any gradients # print("Does input1 requires gradients? 
" + str(self.input1.requires_grad)) err = my_lib.InterpolationLayer_gpu_backward(input1,input2,gradoutput,gradinput1,gradinput2) if err != 0 : print(err) else: # print("CPU backward") # print(gradoutput) gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_() gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_() err = my_lib.InterpolationLayer_cpu_backward(input1, input2, gradoutput, gradinput1, gradinput2) # print(err) if err != 0 : print(err) # print(gradinput1) # print(gradinput2) # print(gradinput1) return gradinput1, gradinput2 ================================================ FILE: my_package/Interpolation/InterpolationModule.py ================================================ # modules/InterpolationLayer.py from torch.nn import Module from .InterpolationLayer import InterpolationLayer class InterpolationModule(Module): def __init__(self): super(InterpolationModule, self).__init__() # self.f = InterpolationLayer() def forward(self, input1, input2): return InterpolationLayer.apply(input1, input2) #we actually dont need to write the backward code for a module, since we have ================================================ FILE: my_package/Interpolation/__init__.py ================================================ from .InterpolationModule import * ================================================ FILE: my_package/Interpolation/interpolation_cuda.cc ================================================ #include #include #include #include #include //works for 1.0.0 #include "interpolation_cuda_kernel.cuh" int InterpolationLayer_gpu_forward( at::Tensor& input1, at::Tensor& input2, at::Tensor& output ) { int error = 1 ; int channel = input1.size( 1); if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; if(input2.size(1) != 2) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h) return error;// to add some checkpoint if(input2.size(3) != w) return error; int input1_b_stride = 
input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != output.stride(0)) return error; if(input1_c_stride != output.stride(1)) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); error =InterpolationLayer_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), nElement,w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1, input2, output); if (error) {AT_ERROR("CUDA call failed");} return error; } int InterpolationLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ) { int error = 1 ; int channel = input1.size( 1); if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; if(input2.size(1) != 2) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h) return error;// to add some checkpoint if(input2.size(3) != w) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input2_b_stride != gradinput2.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return 
error; if(input2_c_stride != gradinput2.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = InterpolationLayer_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1, input2, gradoutput, gradinput1, gradinput2 ); if (error) {AT_ERROR("CUDA call failed");} return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("InterpolationLayer_gpu_forward", &InterpolationLayer_gpu_forward, "Interpolation forward (CUDA)"); m.def("InterpolationLayer_gpu_backward", &InterpolationLayer_gpu_backward, "Interpolation backward (CUDA)"); } ================================================ FILE: my_package/Interpolation/interpolation_cuda_kernel.cu ================================================ #include #include "interpolation_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void InterpolationLayer_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 
0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ]; float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i ]; float x2 = (float)(w_i) + fx; float y2 = (float)(h_i) + fy; if(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){ int ix2_L = int(x2); int iy2_T = int(y2); int ix2_R = min(ix2_L + 1, w - 1); int iy2_B = min(iy2_T + 1, h - 1); float alpha = x2 - ix2_L; float beta = y2 - iy2_T; for(int c_i = 0 ; c_i < channel ; c_i ++){ float TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]; float TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R]; float BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]; float BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R]; output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = (1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR; } } else{ //the warping data is out of range, we fill it with zeros for(int c_i = 0 ; c_i < channel; c_i ++){ output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue; } } } return ; }
// InterpolationLayer_gpu_backward_kernelfunc: backward pass of the bilinear
// warping layer.
//
// One thread handles one output pixel (w_i, h_i) of one batch item. It reads
// the flow (fx, fy) from input2 and computes the sampling position
// (x2, y2) = (w_i + fx, h_i + fy). If that position lies inside the image it
//   * scatters gradoutput into the four source pixels of gradinput1, weighted
//     by the bilinear coefficients, and
//   * writes the gradient w.r.t. the flow into gradinput2
//     (channel 0: d/dx, channel 1: d/dy), summed over all image channels.
// Positions outside the image leave gradinput1 and gradinput2 untouched.
//
// gradoutput and gradinput1 are indexed with input1's strides; gradinput2 is
// indexed with input2's strides. The width index is added without a stride,
// i.e. a width stride of 1 is assumed. nElement and the two *_w_stride
// parameters are not read.
//
// NOTE(review): `scalar_t` is used below but no template parameter list follows
// `template` in this text -- presumably `<typename scalar_t>` was lost when the
// file was extracted; confirm against the original source.
template __global__ void InterpolationLayer_gpu_backward_kernelfunc(
        const int nElement,
        const int w,
        const int h,
        const int channel,
        const int input1_b_stride,
        const int input1_c_stride,
        const int input1_h_stride,
        const int input1_w_stride,
        const int input2_b_stride,
        const int input2_c_stride,
        const int input2_h_stride,
        const int input2_w_stride,
        const scalar_t* __restrict__ input1,
        const scalar_t* __restrict__ input2,
        const scalar_t* __restrict__ gradoutput,
        scalar_t* gradinput1,
        scalar_t* gradinput2
        )
{
    // Thread/pixel mapping (sizes quoted for the default 32x16 block):
    //   blockIdx.z  : batch index, 0 .. B-1
    //   blockIdx.y  : tile index along the height, ceil(h/16) tiles
    //   blockIdx.x  : tile index along the width,  ceil(w/32) tiles
    //   threadIdx.x : column inside the tile, 0 .. 31
    //   threadIdx.y : row inside the tile,    0 .. 15
    //   threadIdx.z : not used
    const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
    const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
    // The grid is rounded up to whole tiles, so edge threads may fall outside the image.
    const bool withinXbounds = w_i < w;
    const bool withinYbounds = h_i < h;

    const int batch_i = blockIdx.z;
    // Offset of this batch item inside input1 / gradoutput / gradinput1.
    const int off = batch_i * input1_b_stride;

    if(withinXbounds && withinYbounds){
        // Flow vector of this pixel and the position it samples from.
        float fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
        float fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];

        float x2 = float(w_i) + fx;
        float y2 = float(h_i) + fy;

        if(x2 >= 0.0f && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){
            // Integer corners of the sampling cell; the right/bottom corner is
            // clamped so it never leaves the image.
            int ix2_L = int(x2);
            int iy2_T = int(y2);
            int ix2_R = min(ix2_L+ 1, w - 1);
            int iy2_B = min(iy2_T + 1, h - 1);

            // Fractional position inside the cell = bilinear weights.
            float alpha = x2 - ix2_L;
            float beta = y2 - iy2_T;

            // Gradient w.r.t. the sampled image: distribute gradoutput to the
            // four corners. atomicAdd is used because the target addresses
            // depend on the flow, so different threads can hit the same pixel.
            for (int c_i = 0 ; c_i < channel; c_i++){
                float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
                atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));
                atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));
                atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);
                atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);
            }

            // Gradient w.r.t. the horizontal flow: the derivative of the
            // bilinear blend in x is the (right - left) difference on the top
            // row weighted by gamma and on the bottom row weighted by
            // (1 - gamma), with gamma = iy2_B - y2. When iy2_B was clamped onto
            // iy2_T both rows coincide and the two weights still sum to 1.
            float gamma = iy2_B - y2;

            float bot_diff = 0.0f;
            for(int c_i =0 ; c_i< channel; c_i ++ ){
                float temp = 0;
                temp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -
                        input1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);
                temp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -
                        input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);

                float warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];
                bot_diff += warped_diff_value * temp ;
            }
            // Plain store: each thread owns its own flow pixel.
            gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;

            // Gradient w.r.t. the vertical flow: same construction with the
            // roles of rows and columns swapped, gamma = ix2_R - x2.
            gamma = ix2_R- x2;
            bot_diff = 0.0f;
            for(int c_i = 0 ; c_i < channel;c_i ++ ){
                float temp = 0.0f;
                temp += gamma * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -
                        input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);
                temp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -
                        input1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);

                float warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
                bot_diff += warped_diff_value * temp;
            }
            gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;
        }
    }
    return ;
}
int InterpolationLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& output ) { int error = -1; dim3 grid; dim3 block; // blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || 
BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); //extract the data of CudaTensor and use kernel to calculate. AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_forward", ([&] { InterpolationLayer_gpu_forward_kernelfunc<<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1.data(),input2.data(),output.data() ); })); // THCudaCheck(cudaGetLastError()); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; }
// Host-side launcher for InterpolationLayer_gpu_backward_kernelfunc.
//
// Launches one thread per pixel: blocks of BLOCKDIMX x BLOCKDIMY threads, a
// grid rounded up to cover w x h, and one grid layer per batch item.
//
// Returns 0 when the launch was issued without a reported CUDA error and -1
// otherwise (the CUDA error string is printed before returning).
//
// NOTE(review): the launch below reads `<<>>` and `.data()` with no launch
// configuration or template argument, and `grid`, `block` and `stream` are
// otherwise unused -- presumably `<scalar_t><<<grid, block, 0, stream>>>` and
// `.data<scalar_t>()` were stripped when this text was extracted; confirm
// against the original source.
int InterpolationLayer_gpu_backward_kernel(
        cudaStream_t stream,
        const int nElement,
        const int w,
        const int h,
        const int channel,
        const int batch,
        const int input1_b_stride,
        const int input1_c_stride,
        const int input1_h_stride,
        const int input1_w_stride,
        const int input2_b_stride,
        const int input2_c_stride,
        const int input2_h_stride,
        const int input2_w_stride,
        at::Tensor& input1,
        at::Tensor& input2,
        at::Tensor& gradoutput,
        at::Tensor& gradinput1,
        at::Tensor& gradinput2
        )
{
    // Pessimistic default: only set to 0 once the launch has succeeded.
    int error = -1;

    dim3 grid;
    dim3 block;

    // threadIdx.x is scheduled first, then threadIdx.y, then threadIdx.z;
    // all channels of a pixel are processed by the same thread.
    block = dim3(BLOCKDIMX,BLOCKDIMY,1);
    grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
    // Report when the block shape was overridden at compile time (or DEBUG is on).
    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
        printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

    // Dispatch on the floating-point type of input1.
    // NOTE(review): the dispatch label "DepthFlowProjection_gpu_forward" does
    // not match this function; it only appears in diagnostics but looks like a
    // copy-paste leftover.
    AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_forward", ([&] {
    InterpolationLayer_gpu_backward_kernelfunc <<>>(
            nElement, // not read by the kernel
            w,h,channel,
            input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            input1.data(),
            input2.data(),
            gradoutput.data(),
            gradinput1.data(),
            gradinput2.data()
            );
    }));

    // Kernel launches do not return a status; launch errors are picked up here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
        //THError("aborting");
        return error;
    }

    error = 0;
    return error;
}
================================================ FILE: my_package/Interpolation/interpolation_cuda_kernel.cuh ================================================ #pragma once #include #include #include int InterpolationLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& output ); int InterpolationLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ); ================================================ FILE: my_package/Interpolation/setup.py ================================================ #!/usr/bin/env python3 import os import torch from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension, CUDAExtension from compiler_args import nvcc_args, cxx_args setup( name='interpolation_cuda', ext_modules=[ CUDAExtension('interpolation_cuda', [ 'interpolation_cuda.cc', 'interpolation_cuda_kernel.cu' ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) ], cmdclass={ 'build_ext': BuildExtension }) 
================================================ FILE: my_package/InterpolationCh/InterpolationChLayer.py ================================================ # this is for wrapping the customized layer import torch from torch.autograd import Function import interpolationch_cuda as my_lib #Please check how the STN FUNCTION is written : #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py class InterpolationChLayer(Function): def __init__(self,ch): super(InterpolationChLayer,self).__init__() self.ch = ch @staticmethod def forward(ctx, input1,input2): assert(input1.is_contiguous()) assert(input2.is_contiguous()) # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it # self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy? # if input1.is_cuda: # self.device = torch.cuda.current_device() # else: # self.device = -1 # output = torch.zeros(input1.size()) if input1.is_cuda : # output = output.cuda() output = torch.cuda.FloatTensor().resize_(input1.size()).zero_() my_lib.InterpolationChLayer_gpu_forward(input1, input2, output) else: # output = torch.cuda.FloatTensor(input1.data.size()) output = torch.FloatTensor().resize_(input1.size()).zero_() my_lib.InterpolationChLayer_cpu_forward(input1, input2, output) ctx.save_for_backward(input1, input2) # the function returns the output to its caller return output @staticmethod def backward(ctx, gradoutput): # print("Backward of Interpolation Layer") # gradinput1 = input1.new().zero_() # gradinput2 = input2.new().zero_() # gradinput1 = torch.zeros(self.input1.size()) # gradinput2 = torch.zeros(self.input2.size()) input1, input2 = ctx.saved_tensors if input1.is_cuda: # print("CUDA backward") # gradinput1 = gradinput1.cuda(self.device) # gradinput2 = gradinput2.cuda(self.device) gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_() gradinput2 = 
torch.cuda.FloatTensor().resize_(input2.size()).zero_() # the input1 image should not require any gradients # print("Does input1 requires gradients? " + str(self.input1.requires_grad)) err = my_lib.InterpolationChLayer_gpu_backward(input1,input2,gradoutput,gradinput1,gradinput2) if err != 0 : print(err) else: # print("CPU backward") # print(gradoutput) gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_() gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_() err = my_lib.InterpolationChLayer_cpu_backward(input1, input2, gradoutput, gradinput1, gradinput2) # print(err) if err != 0 : print(err) # print(gradinput1) # print(gradinput2) # print(gradinput1) return gradinput1, gradinput2 ================================================ FILE: my_package/InterpolationCh/InterpolationChModule.py ================================================ # modules/InterpolationLayer.py from torch.nn import Module from .InterpolationChLayer import InterpolationChLayer class InterpolationChModule(Module): def __init__(self,ch): super(InterpolationChModule, self).__init__() self.ch = ch # self.f = InterpolationChLayer(ch) def forward(self, input1, input2): return InterpolationChLayer.apply(input1, input2) #we actually dont need to write the backward code for a module, since we have ================================================ FILE: my_package/InterpolationCh/__init__.py ================================================ from .InterpolationChModule import * ================================================ FILE: my_package/InterpolationCh/interpolationch_cuda.cc ================================================ #include #include #include #include #include //works for 1.0.0 #include "interpolationch_cuda_kernel.cuh" int InterpolationChLayer_gpu_forward( at::Tensor& input1, at::Tensor& input2, at::Tensor& output ) { int error = 1 ; int channel = input1.size( 1); // if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; 
if(input2.size(1) != 2) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h) return error;// to add some checkpoint if(input2.size(3) != w) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != output.stride(0)) return error; if(input1_c_stride != output.stride(1)) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); error =InterpolationChLayer_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1, input2, output); if (error) {AT_ERROR("CUDA call failed");} return error; } int InterpolationChLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ) { int error = 1 ; int channel = input1.size( 1); // if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; if(input2.size(1) != 2) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h) return error;// to add some checkpoint if(input2.size(3) != w) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); //TODO: do we need 
to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input2_b_stride != gradinput2.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; if(input2_c_stride != gradinput2.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = InterpolationChLayer_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1, input2, gradoutput, gradinput1, gradinput2 ); if (error) {AT_ERROR("CUDA call failed");} return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("InterpolationChLayer_gpu_forward", &InterpolationChLayer_gpu_forward, "InterpolationCh forward (CUDA)"); m.def("InterpolationChLayer_gpu_backward", &InterpolationChLayer_gpu_backward, "InterpolationCh backward (CUDA)"); } ================================================ FILE: my_package/InterpolationCh/interpolationch_cuda_kernel.cu ================================================ #include #include "interpolationch_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void InterpolationChLayer_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const 
scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ]; float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i ]; float x2 = (float)(w_i) + fx; float y2 = (float)(h_i) + fy; if(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){ int ix2_L = int(x2); int iy2_T = int(y2); int ix2_R = min(ix2_L + 1, w - 1); int iy2_B = min(iy2_T + 1, h - 1); float alpha = x2 - ix2_L; float beta = y2 - iy2_T; for(int c_i = 0 ; c_i < channel ; c_i ++){ float TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]; float TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R]; float BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]; float BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R]; output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = (1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR; } } else{ //the warping data is out of range, we fill it with zeros for(int c_i = 0 ; c_i < channel; c_i ++){ output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue; } } } return ; } template __global__ void InterpolationChLayer_gpu_backward_kernelfunc( 
const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ gradoutput, scalar_t* gradinput1, scalar_t* gradinput2 ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); if(withinXbounds && withinYbounds){ float fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ]; float fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i]; float x2 = float(w_i) + fx; float y2 = float(h_i) + fy; if(x2 >= 0.0f && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){ int ix2_L = int(x2); int iy2_T = int(y2); int ix2_R = min(ix2_L+ 1, w - 1); int iy2_B = min(iy2_T + 1, h - 1); float alpha = x2 - ix2_L; float beta = y2 - iy2_T; for (int c_i = 0 ; c_i < channel; c_i++){ float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i]; atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta)); atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta)); atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta); atomicAdd( & 
gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta); } float gamma = iy2_B - y2; float bot_diff = 0.0f; for(int c_i =0 ; c_i< channel; c_i ++ ){ float temp = 0; temp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] - input1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]); temp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] - input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]); float warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i]; bot_diff += warped_diff_value * temp ; } //the gradients of the x direction/ horizontal direction gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff; gamma = ix2_R- x2; bot_diff = 0.0f; for(int c_i = 0 ; c_i < channel;c_i ++ ){ float temp = 0.0f; temp += gamma * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] - input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]); temp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] - input1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]); float warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i]; bot_diff += warped_diff_value * temp; } gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff; } } return ; } int InterpolationChLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& output ) { int error = 1 ; dim3 grid; dim3 block; // blockthread = 128; //the 
threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); //extract the data of CudaTensor and use kernel to calculate. AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationChLayer_gpu_forward_kernelfunc", ([&] { InterpolationChLayer_gpu_forward_kernelfunc<<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1.data(),input2.data(),output.data() ); })); // THCudaCheck(cudaGetLastError()); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; } int InterpolationChLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ) { int error = 1 ; dim3 grid; dim3 block; //blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); AT_DISPATCH_FLOATING_TYPES(input1.type(), 
"InterpolationChLayer_gpu_backward_kernelfunc", ([&] { InterpolationChLayer_gpu_backward_kernelfunc <<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input1.data(), input2.data(), gradoutput.data(), gradinput1.data(), gradinput2.data() ); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; } ================================================ FILE: my_package/InterpolationCh/interpolationch_cuda_kernel.cuh ================================================ #pragma once #include #include #include int InterpolationChLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& output ); int InterpolationChLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ); ================================================ FILE: my_package/InterpolationCh/setup.py ================================================ #!/usr/bin/env python3 import os import torch from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension, CUDAExtension 
from compiler_args import nvcc_args, cxx_args setup( name='interpolationch_cuda', ext_modules=[ CUDAExtension('interpolationch_cuda', [ 'interpolationch_cuda.cc', 'interpolationch_cuda_kernel.cu' ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) ], cmdclass={ 'build_ext': BuildExtension }) ================================================ FILE: my_package/MinDepthFlowProjection/__init__.py ================================================ from .minDepthFlowProjectionModule import * ================================================ FILE: my_package/MinDepthFlowProjection/minDepthFlowProjectionLayer.py ================================================ # this is for wrapping the customized layer import torch from torch.autograd import Function #import _ext.my_lib as my_lib import mindepthflowprojection_cuda as my_lib class minDepthFlowProjectionLayer(Function): def __init__(self,requires_grad): super(minDepthFlowProjectionLayer,self).__init__() # self.requires_grad = requires_grad @staticmethod def forward(ctx, input1, input2, requires_grad): # print("Depth Aware Flow Projection") assert(input1.is_contiguous()) assert(input2.is_contiguous()) # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it # self.input2 = input2.contiguous() fillhole = 1 if requires_grad == False else 0 # if input1.is_cuda: # self.device = torch.cuda.current_device() # else: # self.device = -1 # count = torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections # output = torch.zeros(input1.size()) if input1.is_cuda: # output = output.cuda() # count = count.cuda() # print("correct") count = torch.cuda.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_() output = torch.cuda.FloatTensor().resize_(input1.size()).zero_() err = my_lib.minDepthFlowProjectionLayer_gpu_forward(input1,input2, count,output, fillhole) else: # output = torch.cuda.FloatTensor(input1.data.size()) count = 
torch.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_() output = torch.FloatTensor().resize_(input1.size()).zero_() err = my_lib.minDepthFlowProjectionLayer_cpu_forward(input1,input2, count, output,fillhole) if err != 0: print(err) # output = output/count # to divide the counter # self.count = count #to keep this # self.output = output ctx.save_for_backward(input1, input2,count,output) ctx.fillhole = fillhole # print(self.input1[0, 0, :10, :10]) # print(self.count[0, 0, :10, :10]) # print(self.input1[0, 0, -10:, -10:]) # print(self.count[0, 0, -10:, -10:]) # the function returns the output to its caller return output @staticmethod def backward(ctx, gradoutput): # print("Backward of Filter Interpolation Layer") # gradinput1 = input1.new().zero_() # gradinput2 = input2.new().zero_() # gradinput1 = torch.zeros(self.input1.size()) input1, input2, count, output = ctx.saved_tensors # fillhole = ctx.fillhole if input1.is_cuda: # print("CUDA backward") # gradinput1 = gradinput1.cuda(self.device) gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_() gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_() err = my_lib.minDepthFlowProjectionLayer_gpu_backward(input1,input2, count, output, gradoutput, gradinput1,gradinput2) # print(err) if err != 0 : print(err) else: # print("CPU backward") # print(gradoutput) gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_() gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_() err = my_lib.minDepthFlowProjectionLayer_cpu_backward(input1, input2, count, output, gradoutput, gradinput1,gradinput2) # print(err) if err != 0: print(err) # print(gradinput1) # print(gradinput2) # print(gradinput1) return gradinput1,gradinput2,None ================================================ FILE: my_package/MinDepthFlowProjection/minDepthFlowProjectionModule.py ================================================ # modules/FlowProjectionModule.py from torch.nn.modules.module import 
Module from .minDepthFlowProjectionLayer import minDepthFlowProjectionLayer #, FlowFillholeLayer __all__ =['minDepthFlowProjectionModule'] class minDepthFlowProjectionModule(Module): def __init__(self, requires_grad = True): super(minDepthFlowProjectionModule, self).__init__() self.requires_grad = requires_grad # self.f = minDepthFlowProjectionLayer(requires_grad) def forward(self, input1, input2): return minDepthFlowProjectionLayer.apply(input1, input2,self.requires_grad) # class FlowFillholeModule(Module): # def __init__(self,hole_value = -10000.0): # super(FlowFillholeModule, self).__init__() # self.f = FlowFillholeLayer() # # def forward(self, input1): # return self.f(input1) #we actually dont need to write the backward code for a module, since we have ================================================ FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda.cc ================================================ #include #include #include #include #include //works for 1.0.0 #include "mindepthflowprojection_cuda_kernel.cuh" int minDepthFlowProjectionLayer_gpu_forward( at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output, int fillhole ) { int error = 1 ; int channel = input1.size( 1); if(channel!= 2) return error; int batch = input1.size(0); int h = input1.size(2); int w = input1.size(3); if(input2.size(1) !=1 ) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int count_b_stride = count.stride(0); int count_c_stride = count.stride(1); int count_h_stride = count.stride(2); int count_w_stride = count.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != output.stride(0)) return error; 
if(input1_c_stride != output.stride(1)) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); // printf("In gpu forward\n"); error = minDepthFlowProjection_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch,fillhole, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1, input2, count, output); if (error) {AT_ERROR("CUDA call failed");} return error; } int minDepthFlowProjectionLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ) { int error = 1 ; int channel = input1.size( 1); if(channel!=2) return error; int batch = input1.size(0); if(count.size( 0) != batch) return error; if(count.size(1) != 1) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(1) !=1 ) return error; if(count.size(2) != h) return error;// to add some checkpoint if(count.size(3) != w) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int count_b_stride = count.stride(0); int count_c_stride = count.stride(1); int count_h_stride = count.stride(2); int count_w_stride = count.stride(3); //TODO: do we need to assert the w_stride to be 1 //if(w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); // printf("GPU 
backward: %d,%d,%d,%d\n", count_b_stride,count_c_stride,count_h_stride,count_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = minDepthFlowProjection_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1, input2, count, output, gradoutput, gradinput1, gradinput2 ); if (error) {AT_ERROR("CUDA call failed");} //printf("Am I good in backward function %d",error); return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("minDepthFlowProjectionLayer_gpu_forward", &minDepthFlowProjectionLayer_gpu_forward, "minDepthFlowProjection forward (CUDA)"); m.def("minDepthFlowProjectionLayer_gpu_backward", &minDepthFlowProjectionLayer_gpu_backward, "minDepthFlowProjection backward (CUDA)"); } ================================================ FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cu ================================================ #include #include "mindepthflowprojection_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void minDepthFlowProjection_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* 
__restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* count, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ]; float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ]; float x2 = (float) (w_i) + fx; float y2 = (float) (h_i) + fy; if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){ int ix2_L = (int) (x2); int iy2_T = (int) (y2); int ix2_R = min(ix2_L + 1, w - 1); int iy2_B = min(iy2_T + 1, h - 1); float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i]; float old_exist = 0; //while(1){ old_exist = count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L]; if(temp > old_exist){ output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] = - fx; //update the new vector output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] = - fy; count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] = temp; // update to the best weight //if ( count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] == temp){ //break; //} } //} // old_exist = count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ; // if(temp > old_exist){ // output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]= - fx; // output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] = 
- fy; // count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]= temp ; // } // old_exist = count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]; // if(temp > old_exist){ // output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] = - fx; // output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] = - fy; // count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] = temp; // } // old_exist = count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R]; // if(temp> old_exist){ // output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] = - fx; // output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] = - fy; // count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] = temp; // } } } return ; } template __global__ void minDepthFlowFillhole_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* count, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float temp 
= count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ; if(temp <= 0.0f){ //search along the four directions,0/90/180/270, until finding at least one int left_offset = w_i; float left_temp = 0.0f; while(left_temp == 0.0f && left_offset - 1 >= 0){ left_offset = left_offset - 1; left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ; } int right_offset = w_i ; float right_temp = 0.0f; while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){ right_offset = right_offset + 1 ; right_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ; } int up_offset = h_i ; float up_temp = 0.0f; while(up_temp == 0.0f && up_offset - 1 >=0){ up_offset = up_offset - 1; up_temp = count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ; } int down_offset = h_i; float down_temp = 0.0f; while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){ down_offset = down_offset + 1; down_temp = count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ; } if(left_temp + right_temp + up_temp + down_temp <=0.0f){ //printf("Can't fill hole, find no neighbor vectors availabel\n"); return; } left_temp = (left_temp > 0.0f)?1:0; right_temp = (right_temp > 0.0f)?1:0; up_temp = (up_temp > 0.0f)?1:0; down_temp = (down_temp > 0.0f)?1:0; output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = ( left_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] + right_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+ up_temp * output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] + down_temp * output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i] )/( left_temp + right_temp + up_temp + down_temp ) ; output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =( left_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] + right_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + 
right_offset]+ up_temp * output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] + down_temp * output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i] )/( left_temp + right_temp + up_temp + down_temp ) ; } } return ; } template __global__ void minDepthFlowProjection_gpu_backward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, scalar_t* count, scalar_t* output, const scalar_t* __restrict__ gradoutput, scalar_t* gradinput1, scalar_t* gradinput2 ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w; const bool withinYbounds = h_i < h; const int batch_i = blockIdx.z; const int off = batch_i * input1_b_stride; // __syncthreads(); if(withinXbounds && withinYbounds){ float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ; float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ; float x2 = (float) ( w_i ) + fx; float y2 = (float) ( h_i ) + fy; if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){ int ix2_L = (int)(x2); int iy2_T = (int)(y2); int ix2_R = min(ix2_L + 1, w-1); int iy2_B = min(iy2_T + 1, h-1); float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i]; int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride 
+ w_i; int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i; if(temp == count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L] ){ gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] ; } if(temp == count[batch_i * count_b_stride +0 + iy2_T * count_h_stride + ix2_R] ){ gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] ; } if(temp==count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ){ gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] ; } if(temp == count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ){ gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] ; gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]; } //int weight_offset = batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i; //gradinput2[weight_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] / // count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] * // (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ); //gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] / // count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] * // (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] ); //gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] / // count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride 
+ ix2_L] * // (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ); //gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] / // count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] * // (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] ); //gradinput2[weight_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] / // count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] * // (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ); //gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] / // count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] * // (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] ); //gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] / // count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] * // (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ); //gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] / // count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] * // (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] ); } } return ; } int minDepthFlowProjection_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int fillhole, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output ) { int error = 
-1; dim3 grid; dim3 block; // blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); // printf("I am here\n"); //extract the data of CudaTensor and use kernel to calculate. AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowProjection_gpu_forward", ([&] { minDepthFlowProjection_gpu_forward_kernelfunc<<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1.data(),input2.data(),count.data(),output.data() ); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } // printf("I am there\n"); // THCudaCheck(cudaGetLastError()); err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } // printf("I am dd\n"); if(fillhole){ // printf("use flow fill hole\n"); AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowFillhole", ([&] { minDepthFlowFillhole_kernelfunc<<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1.data(),input2.data(),count.data(),output.data() ); })); err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); return error; } } 
error = 0; return error; } int minDepthFlowProjection_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ) { int error = -1; dim3 grid; dim3 block; //blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowProjection_gpu_backward", ([&] { minDepthFlowProjection_gpu_backward_kernelfunc <<>>( nElement, //to let the nummous w,h,channel, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, count_b_stride,count_c_stride,count_h_stride,count_w_stride, input1.data(),input2.data(),count.data(),output.data(), gradoutput.data(), gradinput1.data(), gradinput2.data() ); })); // printf("gpu I am there\n"); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } // printf("gpu I am here\n"); error = 0; return error; } ================================================ FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cuh 
================================================ #pragma once #include #include #include int minDepthFlowProjection_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int fillhole, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output ); int minDepthFlowProjection_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& count, at::Tensor& output, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2 ); ================================================ FILE: my_package/MinDepthFlowProjection/setup.py ================================================ #!/usr/bin/env python3 import os import torch from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension, CUDAExtension from compiler_args import nvcc_args, cxx_args setup( name='mindepthflowprojection_cuda', ext_modules=[ CUDAExtension('mindepthflowprojection_cuda', [ 'mindepthflowprojection_cuda.cc', 'mindepthflowprojection_cuda_kernel.cu' ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) ], cmdclass={ 'build_ext': BuildExtension }) ================================================ FILE: 
my_package/SeparableConv/SeparableConvLayer.py ================================================ # this is for wrapping the customized layer import torch from torch.autograd import Function import _ext.my_lib as my_lib #Please check how the STN FUNCTION is written : #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py class SeparableConvLayer(Function): def __init__(self,filtersize): self.filtersize = filtersize super(SeparableConvLayer,self).__init__() def forward(self, input1,input2,input3): intBatches = input1.size(0) intInputDepth = input1.size(1) intInputHeight = input1.size(2) intInputWidth = input1.size(3) intFilterSize = min(input2.size(1), input3.size(1)) intOutputHeight = min(input2.size(2), input3.size(2)) intOutputWidth = min(input2.size(3), input3.size(3)) assert(intInputHeight - self.filtersize == intOutputHeight - 1) assert(intInputWidth - self.filtersize == intOutputWidth - 1) assert(intFilterSize == self.filtersize) assert(input1.is_contiguous() == True) assert(input2.is_contiguous() == True) assert(input3.is_contiguous() == True) output = input1.new().resize_(intBatches, intInputDepth, intOutputHeight, intOutputWidth).zero_() # assert(input1.is_contiguous()) # assert(input2.is_contiguous()) self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy? 
self.input3 = input3.contiguous() if input1.is_cuda: self.device = torch.cuda.current_device() else: self.device = -1 if input1.is_cuda : output = output.cuda() err = my_lib.SeparableConvLayer_gpu_forward(input1, input2,input3, output) else: # output = torch.cuda.FloatTensor(input1.data.size()) err = my_lib.SeparableConvLayer_cpu_forward(input1, input2,input3, output) if err != 0: print(err) # the function returns the output to its caller return output #TODO: if there are multiple outputs of this function, then the order should be well considered? def backward(self, gradoutput): # print("Backward of Interpolation Layer") # gradinput1 = input1.new().zero_() # gradinput2 = input2.new().zero_() gradinput1 = torch.zeros(self.input1.size()) gradinput2 = torch.zeros(self.input2.size()) gradinput3 = torch.zeros(self.input3.size()) if self.input1.is_cuda: # print("CUDA backward") gradinput1 = gradinput1.cuda(self.device) gradinput2 = gradinput2.cuda(self.device) gradinput3 = gradinput3.cuda(self.device) # the input1 image should not require any gradients # print("Does input1 requires gradients? 
" + str(self.input1.requires_grad)) err = my_lib.SeparableConvLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3) if err != 0 : print(err) else: # print("CPU backward") # print(gradoutput) err = my_lib.SeparableConvLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3) # print(err) if err != 0 : print(err) # print(gradinput1) # print(gradinput2) # print(gradinput1) return gradinput1, gradinput2,gradinput3 ================================================ FILE: my_package/SeparableConv/SeparableConvModule.py ================================================ # modules/InterpolationLayer.py from torch.nn import Module from functions.SeparableConvLayer import SeparableConvLayer class SeparableConvModule(Module): def __init__(self,filtersize): super(SeparableConvModule, self).__init__() self.f = SeparableConvLayer(filtersize) def forward(self, input1, input2, input3): return self.f(input1, input2, input3) #we actually dont need to write the backward code for a module, since we have ================================================ FILE: my_package/SeparableConv/__init__.py ================================================ from .SeparableConvModule import * ================================================ FILE: my_package/SeparableConv/separableconv_cuda.cc ================================================ #include #include #include #include #include //works for 1.0.0 #include "separableconv_cuda_kernel.cuh" int SeparableConvLayer_gpu_forward( at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& output ) { int error = 1 ; int channel = input1.size( 1); if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; if(input2.size(1) != input3.size(1)) return error; //change by zhenghe, am I right? 
int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint if(input2.size(3) != w - input2.size(1) + 1) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int input3_b_stride = input3.stride(0); int input3_c_stride = input3.stride(1); int input3_h_stride = input3.stride(2); int input3_w_stride = input3.stride(3); int output_b_stride = output.stride(0); int output_c_stride = output.stride(1); int output_h_stride = output.stride(2); int output_w_stride = output.stride(3); // printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); //TODO: do we need to assert the w_stride to be 1 if(input1_w_stride !=1) return error; if(input2_w_stride !=1) return error; if(input3_w_stride !=1) return error; if(output_w_stride !=1) return error; if(input2_b_stride != input3_b_stride) return error; if(input2_c_stride != input3_c_stride) return error; int nElement = 0;//UNUSED THCudaTensor_nElement(state, output); error = SeparableConvLayer_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch, input2.size(1), input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, output_b_stride,output_c_stride,output_h_stride,output_w_stride, input1, input2, input3, output); if (error) {AT_ERROR("CUDA call failed");} return error; } int SeparableConvLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradoutput, 
at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ) { int error = 1 ; int channel = input1.size( 1); if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; if(input2.size(1) != input2.size(1)) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint if(input2.size(3) != w - input2.size(1) + 1) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int input3_b_stride = input3.stride(0); int input3_c_stride = input3.stride(1); int input3_h_stride = input3.stride(2); int input3_w_stride = input3.stride(3); int output_b_stride = gradoutput.stride(0); int output_c_stride = gradoutput.stride(1); int output_h_stride = gradoutput.stride(2); int output_w_stride = gradoutput.stride(3); // printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); //TODO: do we need to assert the w_stride to be 1 if(input1_w_stride !=1) return error; if(input2_w_stride !=1) return error; if(input3_w_stride !=1) return error; if(output_w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input2_b_stride != gradinput2.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; if(input2_c_stride != gradinput2.stride(1)) return error; if(input3_c_stride != gradinput3.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); int nElement = 0;//UNUSED THCudaTensor_nElement(state, gradoutput); error = SeparableConvLayer_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), 
//works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input2.size(1), input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, output_b_stride,output_c_stride,output_h_stride,output_w_stride, input1, input2, input3, gradoutput, gradinput1, gradinput2, gradinput3 ); if (error) {AT_ERROR("CUDA call failed");} return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("SeparableConvLayer_gpu_forward", &SeparableConvLayer_gpu_forward, "SeparableConv forward (CUDA)"); m.def("SeparableConvLayer_gpu_backward", &SeparableConvLayer_gpu_backward, "SeparableConv backward (CUDA)"); } ================================================ FILE: my_package/SeparableConv/separableconv_cuda_kernel.cu ================================================ #include #include "separableconv_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void SeparableConvLayer_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3, scalar_t* output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : 
height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w - filter_size + 1; const bool withinYbounds = h_i < h - filter_size + 1; const int batch_i = blockIdx.z; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { for ( int c_i = 0 ; c_i < channel ; c_i ++){ float out = 0.0f; for (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) { for (int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) { float temp1 = input1[batch_i * input1_b_stride + c_i * input1_c_stride + (h_i + intFilterY )* input1_h_stride + (w_i + intFilterX)]; float temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ]; float temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ]; out += temp1* temp2 * temp3; } } output[batch_i * output_b_stride + c_i* output_c_stride + h_i * output_h_stride + w_i ] = out; } } return ; } template __global__ void SeparableConvLayer_gpu_backward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3, const scalar_t* __restrict__ 
gradoutput, scalar_t* gradinput1, scalar_t* gradinput2, scalar_t* gradinput3 ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w - filter_size + 1; const bool withinYbounds = h_i < h - filter_size + 1; const int batch_i = blockIdx.z; if(withinXbounds && withinYbounds){ for (int c_i = 0 ; c_i < channel ; c_i ++){ for (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) { for ( int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) { float temp1 = input1[batch_i * input1_b_stride + c_i * input1_c_stride + (h_i + intFilterY )* input1_h_stride + (w_i + intFilterX)]; float temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ]; float temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ]; float gradout = gradoutput[batch_i * output_b_stride + c_i* output_c_stride + h_i * output_h_stride + w_i ]; atomicAdd(&gradinput1[batch_i * input1_b_stride + c_i * input1_c_stride + (h_i + intFilterY )* input1_h_stride + (w_i + intFilterX)], gradout * temp2 * temp3); atomicAdd(&gradinput2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ], gradout * temp1 * temp3); atomicAdd(&gradinput3 [batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ] , gradout * temp1 * temp2); } } } } return ; } int SeparableConvLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch,const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int 
input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& output ) { int error = 1 ; dim3 grid; dim3 block; // blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h - filter_size + 1 + BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); //extract the data of CudaTensor and use kernel to calculate. AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_backward", ([&] { SeparableConvLayer_gpu_forward_kernelfunc<<>>( nElement, //to let the nummous w,h,channel, filter_size, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, output_b_stride,output_c_stride,output_h_stride,output_w_stride, input1.data(),input2.data(),input3.data(), output.data() ); })); // THCudaCheck(cudaGetLastError()); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; } int SeparableConvLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const 
int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ) { int error = 1 ; dim3 grid; dim3 block; //blockthread = 128; //the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z //the three channels are processsed in one kernel block = dim3(BLOCKDIMX,BLOCKDIMY,1); grid = dim3( (w - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h - filter_size + 1+ BLOCKDIMY - 1) / BLOCKDIMY, batch); if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG) printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY); // cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float)); // cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float)); // cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float)); AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_backward", ([&] { SeparableConvLayer_gpu_backward_kernelfunc <<>>( nElement, //to let the nummous w,h,channel, filter_size, input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, output_b_stride,output_c_stride,output_h_stride,output_w_stride, input1.data(), input2.data(), input3.data(), gradoutput.data(), gradinput1.data(), gradinput2.data(), gradinput3.data() ); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("gpuerror in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err)); //THError("aborting"); return error; } error = 0; return error; } ================================================ FILE: 
my_package/SeparableConv/separableconv_cuda_kernel.cuh ================================================ #pragma once #include #include #include int SeparableConvLayer_gpu_forward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& output ); int SeparableConvLayer_gpu_backward_kernel( cudaStream_t stream, const int nElement, const int w, const int h, const int channel, const int batch, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ); ================================================ FILE: my_package/SeparableConv/setup.py ================================================ #!/usr/bin/env python3 import os import torch from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension, CUDAExtension from compiler_args import nvcc_args, cxx_args setup( name='separableconv_cuda', ext_modules=[ 
CUDAExtension('separableconv_cuda', [ 'separableconv_cuda.cc', 'separableconv_cuda_kernel.cu' ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args}) ], cmdclass={ 'build_ext': BuildExtension }) ================================================ FILE: my_package/SeparableConvFlow/SeparableConvFlowLayer.py ================================================ # this is for wrapping the customized layer import torch from torch.autograd import Function import separableconvflow_cuda as my_lib import warnings #Please check how the STN FUNCTION is written : #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py #https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py class SeparableConvFlowLayer(Function): def __init__(self,filtersize): self.filtersize = filtersize warnings.warn("\nSeparable Conv Flow Layer is not precise enough for optical flow due to a divison operation") super(SeparableConvFlowLayer,self).__init__() def forward(self, input1,input2,input3): intBatches = input1.size(0) intInputDepth = input1.size(1) intInputHeight = input1.size(2) intInputWidth = input1.size(3) intFilterSize = min(input2.size(1), input3.size(1)) intOutputHeight = min(input2.size(2), input3.size(2)) intOutputWidth = min(input2.size(3), input3.size(3)) assert(intInputHeight - self.filtersize == intOutputHeight - 1) assert(intInputWidth - self.filtersize == intOutputWidth - 1) assert(intFilterSize == self.filtersize) assert(input1.is_contiguous() == True) assert(input2.is_contiguous() == True) assert(input3.is_contiguous() == True) # output = input1.new().resize_(intBatches, intInputDepth, intOutputHeight, intOutputWidth).zero_() flow_ouput = torch.zeros(intBatches, 2,intOutputHeight, intOutputWidth) # as a byproduct of SepConv, but no # assert(input1.is_contiguous()) # assert(input2.is_contiguous()) self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it self.input2 = input2.contiguous() # TODO: Note that this 
is simply a shallow copy? self.input3 = input3.contiguous() if input1.is_cuda: self.device = torch.cuda.current_device() else: self.device = -1 if input1.is_cuda : # output = output.cuda() flow_ouput = flow_ouput.cuda() err = my_lib.SeparableConvFlowLayer_gpu_forward(input1, input2,input3,flow_ouput) else: # output = torch.cuda.FloatTensor(input1.data.size()) err = my_lib.SeparableConvFlowLayer_cpu_forward(input1, input2,input3,flow_ouput) if err != 0: print(err) # the function returns the output to its caller return flow_ouput #TODO: if there are multiple outputs of this function, then the order should be well considered? def backward(self, gradoutput): # print("Backward of Interpolation Layer") # gradinput1 = input1.new().zero_() # gradinput2 = input2.new().zero_() gradinput1 = torch.zeros(self.input1.size()) # the input1 has zero gradient because flow backprop. nothing to gradinput1 gradinput2 = torch.zeros(self.input2.size()) gradinput3 = torch.zeros(self.input3.size()) if self.input1.is_cuda: # print("CUDA backward") gradinput1 = gradinput1.cuda(self.device) gradinput2 = gradinput2.cuda(self.device) gradinput3 = gradinput3.cuda(self.device) # the input1 image should not require any gradients # print("Does input1 requires gradients? 
" + str(self.input1.requires_grad)) # err = my_lib.SeparableConvFlowLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3) err = my_lib.SeparableConvFlowLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3) if err != 0 : print(err) else: # print("CPU backward") # print(gradoutput) # print(err) # err = my_lib.SeparableConvFlowLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3) err = my_lib.SeparableConvFlowLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3) if err != 0 : print(err) # print(gradinput1) # print(gradinput2) # print(gradinput1) return gradinput1, gradinput2,gradinput3 ================================================ FILE: my_package/SeparableConvFlow/SeparableConvFlowModule.py ================================================ # modules/InterpolationLayer.py from torch.nn import Module from .SeparableConvFlowLayer import SeparableConvFlowLayer import torch class SeparableConvFlowModule(Module): def __init__(self,filtersize): super(SeparableConvFlowModule, self).__init__() self.f = SeparableConvFlowLayer(filtersize) def forward(self, input1, input2, input3): # temp2 = torch.div(input2, torch.sum(input2,dim=1,keepdim=True)) return self.f(input1, input2, input3) #we actually dont need to write the backward code for a module, since we have ================================================ FILE: my_package/SeparableConvFlow/__init__.py ================================================ from .SeparableConvFlowModule import * ================================================ FILE: my_package/SeparableConvFlow/separableconvflow_cuda.cc ================================================ #include #include #include #include #include //works for 1.0.0 #include "separableconvflow_cuda_kernel.cuh" int SeparableConvFlowLayer_gpu_forward( at::Tensor& input1, at::Tensor& input2, 
at::Tensor& input3, //at::Tensor& output, at::Tensor& flow_output ) { int error = 1 ; //int point =0 ;printf("debug point %d\n", point++ ); int channel = input1.size( 1); if(channel!=3) return error; int batch = input1.size(0); if(input2.size(0) != batch) return error; if(input2.size(1) != input2.size(1)) return error; //printf("debug point %d\n", point++ ); int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint if(input2.size(3) != w - input2.size(1) + 1) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int input3_b_stride = input3.stride(0); int input3_c_stride = input3.stride(1); int input3_h_stride = input3.stride(2); int input3_w_stride = input3.stride(3); //int output_b_stride = output.stride(0); //int output_c_stride = output.stride(1); //int output_h_stride = output.stride(2); //int output_w_stride = output.stride(3); int flow_output_b_stride = flow_output.stride(0); int flow_output_c_stride = flow_output.stride(1); int flow_output_h_stride = flow_output.stride(2); int flow_output_w_stride = flow_output.stride(3); //printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); //TODO: do we need to assert the w_stride to be 1 if(input1_w_stride !=1) return error; if(input2_w_stride !=1) return error; if(input3_w_stride !=1) return error; // if(output_w_stride !=1) return error; if(flow_output_w_stride !=1) return error; if(input2_b_stride != input3_b_stride) return error; if(input2_c_stride != input3_c_stride) return error; //printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); int nElement = 
0;//UNUSED 0;//UNUSED THCudaTensor_nElement(state, flow_output); error = SeparableConvFlowLayer_gpu_forward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement,w,h,channel,batch, input2.size(1), input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, // output_b_stride,output_c_stride,output_h_stride,output_w_stride, flow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride, input1, input2, input3, //output , flow_output ); if (error) {AT_ERROR("CUDA call failed");} //printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); return error; } int SeparableConvFlowLayer_gpu_backward( at::Tensor& input1, at::Tensor& input2, at::Tensor& input3, at::Tensor& gradflow_output, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3 ) { int error = 1 ; int channel = input1.size( 1); if(channel!=3) return error; int batch = input1.size(0); if(input2.size( 0) != batch) return error; if(input2.size(1) != input2.size(1)) return error; int h = input1.size(2); int w = input1.size(3); if(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint if(input2.size(3) != w - input2.size(1) + 1) return error; int input1_b_stride = input1.stride(0); int input1_c_stride = input1.stride(1); int input1_h_stride = input1.stride(2); int input1_w_stride = input1.stride(3); int input2_b_stride = input2.stride(0); int input2_c_stride = input2.stride(1); int input2_h_stride = input2.stride(2); int input2_w_stride = input2.stride(3); int input3_b_stride = input3.stride(0); int input3_c_stride = input3.stride(1); int input3_h_stride = input3.stride(2); int input3_w_stride = input3.stride(3); //int output_b_stride = gradoutput.stride(0); //int output_c_stride = 
gradoutput.stride(1); //int output_h_stride = gradoutput.stride(2); //int output_w_stride = gradoutput.stride(3); int flow_output_b_stride = gradflow_output.stride(0); int flow_output_c_stride = gradflow_output.stride(1); int flow_output_h_stride = gradflow_output.stride(2); int flow_output_w_stride = gradflow_output.stride(3); // printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride); //TODO: do we need to assert the w_stride to be 1 if(input1_w_stride !=1) return error; if(input2_w_stride !=1) return error; if(input3_w_stride !=1) return error; // if(output_w_stride !=1) return error; if(flow_output_w_stride !=1) return error; if(input1_b_stride != gradinput1.stride(0)) return error; if(input2_b_stride != gradinput2.stride(0)) return error; if(input1_c_stride != gradinput1.stride(1)) return error; if(input2_c_stride != gradinput2.stride(1)) return error; if(input3_c_stride != gradinput3.stride(1)) return error; // printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride); int nElement = 0;//UNUSED 0;//UNUSED THCudaTensor_nElement(state, gradflow_output); error = SeparableConvFlowLayer_gpu_backward_kernel( // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1 at::cuda::getCurrentCUDAStream(), //works for 1.0.0 nElement, //to let the nummous w,h,channel,batch, input2.size(1), input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride, input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride, input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride, // output_b_stride,output_c_stride,output_h_stride,output_w_stride, flow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride, input1, input2, input3, gradflow_output, gradinput1, gradinput2, gradinput3 ); if (error) {AT_ERROR("CUDA call failed");} return error; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("SeparableConvFlowLayer_gpu_forward", 
&SeparableConvFlowLayer_gpu_forward, "SeparableConvFlow forward (CUDA)"); m.def("SeparableConvFlowLayer_gpu_backward", &SeparableConvFlowLayer_gpu_backward, "SeparableConvFlow backward (CUDA)"); } ================================================ FILE: my_package/SeparableConvFlow/separableconvflow_cuda_kernel.cu ================================================ #include #include "separableconvflow_cuda_kernel.cuh" #include #include #include #include #define min(a,b) ((ab)?(a):(b)) #define DEBUG (0) #ifndef BLOCKDIMX #define BLOCKDIMX (32) #endif #ifndef BLOCKDIMY #define BLOCKDIMY (16) #endif using at::Half; //forward path of our layer template __global__ void SeparableConvFlowLayer_gpu_forward_kernelfunc( const int nElement, const int w, const int h, const int channel, const int filter_size, const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride, const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride, const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride, //const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride, const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride, const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3, scalar_t* flow_output ) { //blockIdx.z : batch index from 0~B-1 //blockIdx.y : height patch index from ceil(h/16) //blockIdx.x : width patch index from ceil(w/32) //threadidx.x: width index 0~31 //threadIdx.y: height index 0~15 //threadIdx.z: Not used //only use one dimensioon of the grid and block const int w_i = blockIdx.x * blockDim.x + threadIdx.x; const int h_i = blockIdx.y * blockDim.y + threadIdx.y; const bool withinXbounds = w_i < w - filter_size + 1; const bool withinYbounds = h_i < h - filter_size + 1; const 
int batch_i = blockIdx.z; // __syncthreads(); // const float fillvalue =0.0f; if( withinXbounds && withinYbounds) { float flow_y = 0.0f; float sum_weights = 0.0f; for ( int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) { float temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ]; flow_y += (float)(intFilterY) * temp2 ; sum_weights += temp2; } //sum_weights = fabs(sum_weights); flow_y = flow_y / sum_weights - ((float)(filter_size)-1.0)/2.0; flow_output[batch_i * flow_output_b_stride + 1 * flow_output_c_stride+ h_i* flow_output_h_stride + w_i] = fabs(sum_weights) > 0.0f ? flow_y : -2000; float flow_x = 0.0f; float sum_weights_x = 0.0f; for ( int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) { float temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ]; flow_x += (float)(intFilterX) * temp3; sum_weights_x += temp3; } //sum_weights_x = fabs(sum_weights_x); flow_x = flow_x / sum_weights_x - ((float)(filter_size)-1.0)/2.0; // what if the sum_weight is less than zeros. flow_output[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + h_i* flow_output_h_stride + w_i] = fabs(sum_weights_x) >0.0f ? 
// Backward kernel of SeparableConvFlowLayer.
//
// One thread handles one output pixel (w_i, h_i) of one batch item
// (blockIdx.z). The forward kernel computes, per pixel,
//     flow = sum_k(k * f[k]) / sum_k(f[k]) - (filter_size - 1) / 2
// from the vertical filter (input2 -> flow channel 1) and the horizontal
// filter (input3 -> flow channel 0). This kernel applies the matching
// derivative
//     d flow / d f[k] = k / S - sum_k(k * f[k]) / S^2,   S = sum_k(f[k])
// scaled by the incoming gradient.
//
// Pixels whose filter sum is exactly zero are skipped, so their gradient
// entries keep whatever value the buffer already holds.
//
// NOTE(review): gradinput2 is assigned with `=` while gradinput3 is
// accumulated with `+=`. The two only agree when gradinput3 arrives
// zero-filled -- confirm against the Python layer before unifying them.
//
// nElement, channel, input1, gradinput1 and the input1 strides are accepted
// for signature compatibility but are not read or written here.
template <typename scalar_t>
__global__ void SeparableConvFlowLayer_gpu_backward_kernelfunc(
        const int nElement,
        const int w, const int h, const int channel, const int filter_size,
        const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
        const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

        const scalar_t* __restrict__ input1,
        const scalar_t* __restrict__ input2,
        const scalar_t* __restrict__ input3,
        const scalar_t* __restrict__ gradflow_output,
        scalar_t* gradinput1,
        scalar_t* gradinput2,
        scalar_t* gradinput3
        )
{
    // blockIdx.z : batch index
    // blockIdx.y / threadIdx.y : row of the output pixel
    // blockIdx.x / threadIdx.x : column of the output pixel
    const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
    const int h_i = blockIdx.y * blockDim.y + threadIdx.y;

    // The output is (h - filter_size + 1) x (w - filter_size + 1); threads of
    // the last partial block fall outside it and do nothing.
    const bool withinXbounds = w_i < w - filter_size + 1;
    const bool withinYbounds = h_i < h - filter_size + 1;

    const int batch_i = blockIdx.z;

    if (withinXbounds && withinYbounds) {

        // ---- vertical filter (input2) -> flow channel 1 -------------------
        float flow_y = 0.0f;      // sum_k k * f[k]
        float sum_weights = 0.0f; // sum_k f[k]

        for (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
            float temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i];
            flow_y += (float)(intFilterY) * temp2;
            sum_weights += temp2;
        }

        if (fabs(sum_weights) > 0.0f) {
            float gradflow_y = gradflow_output[batch_i * flow_output_b_stride + 1 * flow_output_c_stride +
                                               h_i * flow_output_h_stride + w_i];
            float offset = flow_y / (sum_weights * sum_weights);
            for (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
                gradinput2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i] =
                        gradflow_y * ((float)(intFilterY) / sum_weights - offset);
            }
        }

        // ---- horizontal filter (input3) -> flow channel 0 -----------------
        float flow_x = 0.0f;        // sum_k k * f[k]
        float sum_weights_x = 0.0f; // sum_k f[k]

        for (int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
            float temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i];
            flow_x += (float)(intFilterX) * temp3;
            sum_weights_x += temp3;
        }

        if (fabs(sum_weights_x) > 0.0f) {
            float gradflow_x = gradflow_output[batch_i * flow_output_b_stride + 0 * flow_output_c_stride +
                                               h_i * flow_output_h_stride + w_i];
            float offset = flow_x / (sum_weights_x * sum_weights_x);
            for (int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
                gradinput3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i] +=
                        gradflow_x * ((float)(intFilterX) / sum_weights_x - offset);
            }
        }
    }
    return;
}
// Launches SeparableConvFlowLayer_gpu_backward_kernelfunc on `stream`.
//
// The grid covers the (w - filter_size + 1) x (h - filter_size + 1) output
// with BLOCKDIMX x BLOCKDIMY thread blocks, one grid layer (z) per batch item.
// The gradient buffers are not cleared here (the cudaMemset calls were
// disabled in the original), so the caller must provide them initialised.
//
// Returns 0 when the launch succeeded, 1 when cudaGetLastError() reports a
// failure (the CUDA error string is printed).
int SeparableConvFlowLayer_gpu_backward_kernel(
        cudaStream_t stream,
        const int nElement,
        const int w, const int h, const int channel, const int batch, const int filter_size,

        const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
        const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

        at::Tensor& input1,
        at::Tensor& input2,
        at::Tensor& input3,
        at::Tensor& gradflow_output,
        at::Tensor& gradinput1,
        at::Tensor& gradinput2,
        at::Tensor& gradinput3
        )
{
    int error = 1;

    dim3 grid;
    dim3 block;

    // threadIdx.x is scheduled first, then threadIdx.y, then threadIdx.z.
    block = dim3(BLOCKDIMX, BLOCKDIMY, 1);
    // Ceil-divide the output extent by the block size in each dimension.
    grid = dim3((w - filter_size + 1 + BLOCKDIMX - 1) / BLOCKDIMX,
                (h - filter_size + 1 + BLOCKDIMY - 1) / BLOCKDIMY,
                batch);
    if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG)
        printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX, BLOCKDIMY);

    // The dispatch name appears in ATen's "not implemented for <dtype>" error,
    // so it must name this layer rather than the one it was copied from.
    AT_DISPATCH_FLOATING_TYPES(input1.type(), "SeparableConvFlowLayer_gpu_backward", ([&] {
        SeparableConvFlowLayer_gpu_backward_kernelfunc<scalar_t><<<grid, block, 0, stream>>>(
                nElement, // unused by the kernel
                w, h, channel, filter_size,
                input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
                input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
                input3_b_stride, input3_c_stride, input3_h_stride, input3_w_stride,
                flow_output_b_stride, flow_output_c_stride, flow_output_h_stride, flow_output_w_stride,

                input1.data<scalar_t>(),
                input2.data<scalar_t>(),
                input3.data<scalar_t>(),
                gradflow_output.data<scalar_t>(),
                gradinput1.data<scalar_t>(),
                gradinput2.data<scalar_t>(),
                gradinput3.data<scalar_t>()
                );
    }));

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("gpuerror in SeparableConvFlowLayer_gpu_backward_kernel: %s\n", cudaGetErrorString(err));
        return error;
    }

    error = 0;
    return error;
}
#!/usr/bin/env python3
"""Build script for the ``separableconvflow_cuda`` CUDA extension."""
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Host-side binding code first, then the CUDA kernels it calls into.
extension_sources = [
    'separableconvflow_cuda.cc',
    'separableconvflow_cuda_kernel.cu',
]

# Compiler flags shared by all extensions come from compiler_args.
compile_flags = {'cxx': cxx_args, 'nvcc': nvcc_args}

separableconvflow_extension = CUDAExtension(
    'separableconvflow_cuda',
    extension_sources,
    extra_compile_args=compile_flags,
)

setup(
    name='separableconvflow_cuda',
    ext_modules=[separableconvflow_extension],
    cmdclass={'build_ext': BuildExtension},
)
#!/usr/bin/env bash
# Rebuild and install every CUDA extension package in this directory.
#
# For each package the previous build artefacts (build/, *.egg-info, dist/)
# are removed and `python setup.py install` is run. A package that is missing
# or fails to build is reported on stderr and the remaining packages are
# still attempted; the script's final status is non-zero if any failed.

echo "Need pytorch>=1.0.0"
source activate pytorch1.0.0

# Every package's setup.py imports compiler_args, which lives in this directory.
export PYTHONPATH="$PYTHONPATH:$(pwd)"

packages=(
    MinDepthFlowProjection
    FlowProjection
    SeparableConv
    InterpolationCh
    DepthFlowProjection
    Interpolation
    SeparableConvFlow
    FilterInterpolation
)

status=0
for pkg in "${packages[@]}"; do
    # Run in a subshell: if `cd` fails, `rm -rf` never runs in the wrong
    # directory, and the working directory is restored without `cd ..`.
    (
        cd "$pkg" || exit 1
        rm -rf build *.egg-info dist
        python setup.py install
    ) || { echo "build.sh: failed to build $pkg" >&2; status=1; }
done

# Report failure through the status of the last command instead of `exit`,
# so sourcing this script does not close the caller's shell.
[ "$status" -eq 0 ]
================================================ FILE: my_package/compiler_args.py ================================================ # References: https://developer.nvidia.com/cuda-gpus nvcc_args = [ # Tesla: K80, K80 # Quadro: (None) # NVIDIA NVS: (None) # Jetson: (None) '-gencode', 'arch=compute_37,code=sm_37', # Tesla: (None) # Quadro: K1200, K620, M1200, M520, M5000M, M4000M, M3000M, M2000M, M1000M, K620M, M600M, M500M # NVIDIA NVS: 810 # GeForce / Titan: GTX 750 Ti, GTX 750, GTX 960M, GTX 950M, 940M, 930M, GTX 860M, GTX 850M, 840M, 830M # Jetson: (None) '-gencode', 'arch=compute_50,code=sm_50', # Tesla: M60, M40 # Quadro: M6000 24GB, M6000, M5000, M4000, M2000, M5500M, M2200, M620 # NVIDIA NVS: (None) # GeForce / Titan: GTX TITAN X, GTX 980 Ti, GTX 980, GTX 970, GTX 960, GTX 950, GTX 980, GTX 980M, GTX 970M, GTX 965M, 910M # Jetson: (None) '-gencode', 'arch=compute_52,code=sm_52', # Tesla: P100 # Quadro: GP100 # NVIDIA: NVS: (None) # GeForce / Titan: (None) # Jetson: (None) '-gencode', 'arch=compute_60,code=sm_60', # Tesla: P40, P4 # Quadro: P6000, P5000, P4000, P2200, P2000, P1000, P620, P600, P400, P620, P520, P5200, P4200, P3200, P5000, P4000, P3000, P2000, P1000, P600, P500 # NVIDIA NVS: (None) # GeForce / Titan: TITAN Xp, TITAN X, GTX 1080 Ti, GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1080, GTX 1070, GTX 1060 # Jetson: (None) '-gencode', 'arch=compute_61,code=sm_61', # Tesla: T4 # Quadro: RTX 8000, RTX 6000, RTX 5000, RTX 4000, RTX 5000, RTX 4000, RTX 3000, T2000, T1000 # NVIDIA NVS: (None) # GeForce / Titan: TITAN RTX, RTX 2080 Ti, RTX 2080, RTX 2070, RTX 2060, RTX 2080, RTX 2070, RTX 2060 # Jetson: (None) '-gencode', 'arch=compute_75,code=sm_75', # '-gencode', 'arch=compute_70,code=sm_70', # '-gencode', 'arch=compute_70,code=compute_70' '-w' # Ignore compiler warnings. 
def test_SeparableConvFlowModule(input1, input2, input3,filtersize):
    """Compare the CPU and GPU paths of SeparableConvFlowModule.

    Runs forward and backward on the given tensors, repeats both on CUDA
    copies of the same data, and prints whether the outputs and the gradients
    of ``input2`` and ``input3`` agree within a relative error of 1e-6.

    Args:
        input1, input2, input3: CPU inputs of the module. ``input2`` and
            ``input3`` must require grad, because their ``.grad`` is read
            after the CPU backward pass.
        filtersize: forwarded to the ``SeparableConvFlowModule`` constructor.

    Returns:
        ``(forward_seconds, backward_seconds)`` of the GPU run.
    """
    # The module-level import of this class is commented out; the package's
    # __init__ re-exports it, so import it here to keep the function runnable.
    from my_package.SeparableConvFlow import SeparableConvFlowModule

    def _relative_gap(gpu, cpu):
        """Return the largest element-wise relative difference.

        Elements where both values are zero count as equal. The plain
        quotient would give NaN there, and a NaN maximum compares as "not
        greater than the tolerance", hiding real mismatches.
        """
        gpu = gpu.detach()
        cpu = cpu.detach().cuda()
        scale = torch.abs(gpu) + torch.abs(cpu)
        ratio = torch.where(scale > 0, (gpu - cpu) * 2 / scale, torch.zeros_like(scale))
        return torch.max(torch.abs(ratio))

    def _report(gpu, cpu, ok_label, end):
        """Print the gap and mean difference on mismatch, else ``ok_label``."""
        gap = _relative_gap(gpu, cpu)
        # torch.max returns a 0-dim tensor; .item() reads it as a Python
        # float, whereas `.numpy()[0]` raises IndexError on 0-dim arrays.
        if gap.item() > 1e-6:
            print(gap)
            print(torch.mean(gpu - cpu.cuda()))
        else:
            print(ok_label, end=end)

    FilterInterpolate = SeparableConvFlowModule(filtersize)

    t1 = time.time()
    output = FilterInterpolate(input1, input2, input3)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True)

    # CUDA kernels run asynchronously; synchronize around each timed section
    # so the clock measures the kernels and not just their launch.
    torch.cuda.synchronize()
    t1 = time.time()
    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward
    output_cuda = FilterInterpolate(input1_cuda, input2_cuda, input3_cuda)
    torch.cuda.synchronize()
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    torch.cuda.synchronize()
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    print("Check the forward path between CPU and GPU...", end='\t')
    _report(output_cuda, output, "output pass", '\n')

    # The gradient of input1 is not compared for this module.
    _report(input2_cuda.grad, input2.grad, "pass", '\t')
    _report(input3_cuda.grad, input3.grad, "pass", '\n')

    return t2 - t1, t3 - t2
def test_FilterInterpolation(input1,input2,input3):
    """Compare the CPU and GPU paths of FilterInterpolationModule.

    Runs forward and backward on the given tensors, repeats both on CUDA
    copies of the same data, and prints whether the outputs and the gradients
    of all three inputs agree within an absolute error of 1e-6.

    Args:
        input1, input2, input3: CPU inputs of the module. All three must
            require grad, because their ``.grad`` is read after the CPU
            backward pass.

    Returns:
        ``(forward_seconds, backward_seconds)`` of the GPU run.
    """
    # The module-level import of this class is commented out.
    # NOTE(review): assumes my_package.FilterInterpolation re-exports the
    # class from its __init__, like the other packages do -- confirm.
    from my_package.FilterInterpolation import FilterInterpolationModule

    def _max_gap(gpu, cpu):
        """Return the largest absolute element-wise difference (0-dim tensor)."""
        return torch.max(torch.abs(gpu - cpu.cuda()))

    def _report_grad(gpu_grad, cpu_grad, end):
        """Print the gap and mean difference on mismatch, else "pass"."""
        gap = _max_gap(gpu_grad, cpu_grad)
        # .item() reads the 0-dim result; `.numpy()[0]` raises IndexError.
        if gap.item() > 1e-6:
            print(gap)
            print(torch.mean(gpu_grad - cpu_grad.cuda()))
        else:
            print("pass", end=end)

    FilterInterpolate = FilterInterpolationModule()

    t1 = time.time()
    output = FilterInterpolate(input1, input2, input3)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad = True)

    # CUDA kernels run asynchronously; synchronize around each timed section
    # so the clock measures the kernels and not just their launch.
    torch.cuda.synchronize()
    t1 = time.time()
    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward
    output_cuda = FilterInterpolate(input1_cuda, input2_cuda ,input3_cuda)
    torch.cuda.synchronize()
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    torch.cuda.synchronize()
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    print("Check the forward path between CPU and GPU...", end='\t')
    forward_gap = _max_gap(output_cuda, output)
    if forward_gap.item() > 1e-6:
        print(forward_gap)
    else:
        print("pass", end='\n')

    print("Check the backward path between CPU and GPU...", end='\t')
    _report_grad(input1_cuda.grad, input1.grad, '\t')
    _report_grad(input2_cuda.grad, input2.grad, '\t')
    _report_grad(input3_cuda.grad, input3.grad, '\n')

    return t2-t1,t3-t2
requires_grad=True) # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True) # input2 = Variable(torch.zeros(12,2,64,64)) # input2 = Variable(torch.ones(12,2,64,64) * (-2.1)) # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1)) # input1.data.uniform_() # input2.data.uniform_(-5,5) Interpolate = InterpolationModule() t1 = time.time() output = Interpolate(input1,input2) t2 = time.time() output.backward(output.data) t3 = time.time() print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # # print(output) # print(input1.grad.size()) # print(input1.grad) # print(output[3,0,...]) temp = input1.grad # input1 = input1.cuda() # input2 = input2.cuda() # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True) # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True) input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True) input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True) t1 = time.time() output_cuda = Interpolate(input1_cuda,input2_cuda) t2 = time.time() output_cuda.backward(output_cuda.data) t3 = time.time() print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # print(output_cuda) # print(input1_cuda.grad.size()) # print(input1_cuda.grad) # print(output_cuda[3,0,...]) # print(output[3,0,...]- output_cuda[3,0,...].cpu()) # print(output_cuda - output.cuda()) # print(input1_cuda.grad - input1.grad.cuda()) print("Check the forward path between CPU and GPU...",end='\t') x = output_cuda - output.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') print("Check the backward path between CPU and GPU...",end='\t') x = input1_cuda.grad - input1.grad.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: 
print("pass",end='\t') x = input2_cuda.grad - input2.grad.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') # print(x[0,0,...]) # print(x[0,1,...]) # print(x[0,2,...]) # # print(torch.max(x)) # print(x[11,2,...]) return t2-t1,t3-t2 def test_InterpolationChModule(input1,input2): # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor)) # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor)) # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True) # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True) # input2 = Variable(torch.zeros(12,2,64,64)) # input2 = Variable(torch.ones(12,2,64,64) * (-2.1)) # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1)) # input1.data.uniform_() # input2.data.uniform_(-5,5) Interpolate = InterpolationChModule(input1.size(1)) t1 = time.time() output = Interpolate(input1,input2) t2 = time.time() output.backward(output.data) t3 = time.time() print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # # print(output) # print(input1.grad.size()) # print(input1.grad) # print(output[3,0,...]) temp = input1.grad # input1 = input1.cuda() # input2 = input2.cuda() # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True) # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True) input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True) input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True) t1 = time.time() output_cuda = Interpolate(input1_cuda,input2_cuda) t2 = time.time() output_cuda.backward(output_cuda.data) t3 = time.time() print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # print(output_cuda) # print(input1_cuda.grad.size()) # 
print(input1_cuda.grad) # print(output_cuda[3,0,...]) # print(output[3,0,...]- output_cuda[3,0,...].cpu()) # print(output_cuda - output.cuda()) # print(input1_cuda.grad - input1.grad.cuda()) print("Check the forward path between CPU and GPU...",end='\t') x = output_cuda - output.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') print("Check the backward path between CPU and GPU...",end='\t') x = input1_cuda.grad - input1.grad.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\t') x = input2_cuda.grad - input2.grad.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') # print(x[0,0,...]) # print(x[0,1,...]) # print(x[0,2,...]) # # print(torch.max(x)) # print(x[11,2,...]) return t2-t1,t3-t2 def test_FlowProjectionModule(input1): # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor)) # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor)) # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True) # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True) # input2 = Variable(torch.zeros(12,2,64,64)) # input2 = Variable(torch.ones(12,2,64,64) * (-2.1)) # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1)) # input1.data.uniform_() # input2.data.uniform_(-5,5) Project = FlowProjectionModule() t1 = time.time() output = Project(input1) t2 = time.time() output.backward(output.data) t3 = time.time() print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # # print(output) # print(input1.grad.size()) # print(input1.grad) # print(output[3,0,...]) temp = input1.grad # input1 = input1.cuda() # input2 = input2.cuda() # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True) # input2_cuda = 
Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True) input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True) # input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True) Project = FlowProjectionModule() # regnenerate t1 = time.time() output_cuda = Project(input1_cuda) t2 = time.time() output_cuda.backward(output_cuda.data) t3 = time.time() print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # print(output_cuda) # print(input1_cuda.grad.size()) # print(input1_cuda.grad) # print(output_cuda[3,0,...]) # print(output[3,0,...]- output_cuda[3,0,...].cpu()) # print(output_cuda - output.cuda()) # print(input1_cuda.grad - input1.grad.cuda()) print("Check the forward path between CPU and GPU...",end='\t') x = output_cuda - output.cuda() # print(output_cuda[0, 0, :10, :10]) # print(output[0, 0, :10, :10]) # print(x[0, 0, :10, :10]) x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') print("Check the backward path between CPU and GPU...",end='\t') x = input1_cuda.grad - input1.grad.cuda() # print(input1_cuda[0,0,:10,:10]) # print(input1[0,0,:10,:10]) # print(x[0,0,:10,:10]) x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(torch.abs(input1_cuda.grad - input1.grad.cuda()))) print(torch.mean((input1_cuda.grad - input1.grad.cuda()))) else: print("pass",end='\t') # x = input2_cuda.grad - input2.grad.cuda() # x = torch.max(torch.abs(x)) # if(x.cpu().data.numpy()[0] > 1e-6): # print(x) # else: # print("pass",end='\n') # print(x[0,0,...]) # print(x[0,1,...]) # print(x[0,2,...]) # # print(torch.max(x)) # print(x[11,2,...]) print("\n\n") return t2-t1,t3-t2 def test_DepthFlowProjectionModule(input1,input2): # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor)) # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor)) # input1 = 
Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True) # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True) # input2 = Variable(torch.zeros(12,2,64,64)) # input2 = Variable(torch.ones(12,2,64,64) * (-2.1)) # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1)) # input1.data.uniform_() # input2.data.uniform_(-5,5) # Project = DepthFlowProjectionModule() # t1 = time.time() # output = Project(input1,input2) # t2 = time.time() # output.backward(output.data) # t3 = time.time() # print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # # print(output) # print(input1.grad.size()) # print(input1.grad) # print(output[3,0,...]) # temp = input1.grad # input1 = input1.cuda() # input2 = input2.cuda() # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True) # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True) input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True) input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True) Project = DepthFlowProjectionModule(input1_cuda.requires_grad) # regnenerate t1 = time.time() output_cuda = Project(input1_cuda,input2_cuda) t2 = time.time() output_cuda.backward(output_cuda.data) t3 = time.time() print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # print(output_cuda) # print(input1_cuda.grad.size()) # print(input1_cuda.grad) # print(output_cuda[3,0,...]) # print(output[3,0,...]- output_cuda[3,0,...].cpu()) # print(output_cuda - output.cuda()) # print(input1_cuda.grad - input1.grad.cuda()) print("Check the forward path between CPU and GPU...",end='\t') x = output_cuda - output.cuda() # print(output_cuda[0, 0, :10, :10]) # print(output[0, 0, :10, :10]) # print(x[0, 0, :10, :10]) x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] 
> 1e-6): print(x) else: print("pass",end='\n') print("Check the backward path between CPU and GPU...",end='\t') x = input1_cuda.grad - input1.grad.cuda() # print(input1_cuda[0,0,:10,:10]) # print(input1[0,0,:10,:10]) # print(x[0,0,:10,:10]) x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(torch.abs(input1_cuda.grad - input1.grad.cuda()))) print(torch.mean((input1_cuda.grad - input1.grad.cuda()))) else: print("pass",end='\t') x = input2_cuda.grad - input2.grad.cuda() x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') # print(x[0,0,...]) # print(x[0,1,...]) # print(x[0,2,...]) # # print(torch.max(x)) # print(x[11,2,...]) print("\n\n") return t2-t1,t3-t2 def test_WeightedFlowProjectionModule(input1 , input2, input3): # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor)) # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor)) # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True) # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True) # input2 = Variable(torch.zeros(12,2,64,64)) # input2 = Variable(torch.ones(12,2,64,64) * (-2.1)) # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1)) # input1.data.uniform_() # input2.data.uniform_(-5,5) # Project = FlowProjectionModule() Project = WeightedFlowProjectionModule(threshold=20.0/255.0,requires_grad=True) t1 = time.time() output = Project(input1,input2,input3) t2 = time.time() output.backward(output.data) t3 = time.time() print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # # print(output) # print(input1.grad.size()) # print(input1.grad) # print(output[3,0,...]) temp = input1.grad # input1 = input1.cuda() # input2 = input2.cuda() # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True) # input2_cuda = 
Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True) input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True) input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True) input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad = True) Project = WeightedFlowProjectionModule(threshold=20.0/255.0, requires_grad=True) # regnenerate t1 = time.time() output_cuda = Project(input1_cuda,input2_cuda,input3_cuda) t2 = time.time() output_cuda.backward(output_cuda.data) t3 = time.time() print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t") # print(output_cuda) # print(input1_cuda.grad.size()) # print(input1_cuda.grad) # print(output_cuda[3,0,...]) # print(output[3,0,...]- output_cuda[3,0,...].cpu()) # print(output_cuda - output.cuda()) # print(input1_cuda.grad - input1.grad.cuda()) print("Check the forward path between CPU and GPU...",end='\t') x = output_cuda - output.cuda() # print(output_cuda[0, 0, :10, :10]) # print(output[0, 0, :10, :10]) # print(x[0, 0, :10, :10]) x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass",end='\n') print("Check the backward path between CPU and GPU...",end='\t') x = input1_cuda.grad - input1.grad.cuda() # print(input1_cuda[0,0,:10,:10]) # print(input1[0,0,:10,:10]) # print(x[0,0,:10,:10]) x = torch.max(torch.abs(x)) if(x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(torch.abs(input1_cuda.grad - input1.grad.cuda()))) print(torch.mean((input1_cuda.grad - input1.grad.cuda()))) else: print("pass",end='\t') # x = input2_cuda.grad - input2.grad.cuda() # x = torch.max(torch.abs(x)) # if(x.cpu().data.numpy()[0] > 1e-6): # print(x) # else: # print("pass",end='\n') # print(x[0,0,...]) # print(x[0,1,...]) # print(x[0,2,...]) # # print(torch.max(x)) # print(x[11,2,...]) print("\n\n") return t2-t1,t3-t2 def test_AdaptiveWeightInterpolationModule(input1, input2, 
input3, input4): training = True Interpolate = AdaptiveWeightInterpolationModule(training=training) #gradcheck(Interpolate,) t1 = time.time() output = Interpolate(input1, input2, input3, input4) t2 = time.time() if training: #output.backward(output.data) grad = output.data # grad = grad.zero_() output.backward(grad) print( input3.grad) t3 = time.time() print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t") # # print(output) # print(input1.grad.size()) # print(input1.grad) # print(output[3,0,...]) temp = input1.grad # input1 = input1.cuda() # input2 = input2.cuda() # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True) # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True) input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True) input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True) input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True) input4_cuda = Variable(input4.data.type(torch.cuda.FloatTensor), requires_grad=True ) t1 = time.time() Interpolate.zero_grad() # to clean up the gradient in the last backward output_cuda = Interpolate(input1_cuda, input2_cuda, input3_cuda,input4_cuda) t2 = time.time() if training : # output_cuda.backward(output_cuda.data) grad = output_cuda.data # grad = grad.zero_() output_cuda.backward(grad) t3 = time.time() print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t") # return # print(output_cuda) # print(input1_cuda.grad.size()) # print(input1_cuda.grad) # print(output_cuda[3,0,...]) # print(output[3,0,...]- output_cuda[3,0,...].cpu()) # print(output_cuda - output.cuda()) # print(input1_cuda.grad - input1.grad.cuda()) print("Check the forward path between CPU and GPU...", end='\n') x = output_cuda - output.cuda() #print(x) #print(x>1e-6) print("==>total number of 
difference") print(torch.sum(torch.abs(x) > 1e-6)) x = torch.max(torch.abs(x)) print("==>max difference value is ") print(x) print(torch.sum(output_cuda > 1) ) print(torch.sum(output.cuda() > 1)) if (x.cpu().data.numpy()[0] > 1e-6): print(x) else: print("pass", end='\n') if not training: return t2 - t1, t3 - t2 print("Check the backward path between CPU and GPU...", end='\t') y = input1_cuda.grad - input1.grad.cuda() x = y.cpu().data.numpy() #print(x>1e-6) x = torch.max(torch.abs(y)) print(x) if (x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(input1_cuda.grad - input1.grad.cuda())) else: print("pass", end='\t') x = input2_cuda.grad - input2.grad.cuda() y = x.cpu().data.numpy() x = torch.max(torch.abs(x)) if (x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(input2_cuda.grad - input2.grad.cuda())) else: print("pass", end='\t') x = input3_cuda.grad - input3.grad.cuda() y = x.cpu().data.numpy() x = torch.max(torch.abs(x)) if (x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(input3_cuda.grad - input3.grad.cuda())) else: print("pass", end='\n') x = input4_cuda.grad - input4.grad.cuda() y = x.cpu().data.numpy() x = torch.max(torch.abs(x)) if (x.cpu().data.numpy()[0] > 1e-6): print(x) print(torch.mean(input4_cuda.grad - input4.grad.cuda())) else: print("pass", end='\n') return t2 - t1, t3 - t2 # # # # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor)) # # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor)) # # B,H,W = 1,16,16 # # B,C,H,W = 2,64,32,32 # # filtersize = 4 # # input1 = Variable(torch.arange(0.0, B * C * H * W).view(B, C ,H,W), requires_grad=True) # # input2 = Variable(torch.rand(B, 2, H, W), requires_grad=True) # # input3 = Variable(torch.rand(B, filtersize**2, H, W), requires_grad=True) # #input2 = Variable(torch.arange(1, 1+ B * 3 * H * W).view(B , 3, H, W), requires_grad=True) # # input3 = Variable(torch.rand(B, 2, H, W), requires_grad=True) # # input4 = Variable(torch.rand(B, 2, H,W), 
requires_grad =True) # B,C,H,W = 1,3,128,128 # filtersize = 51 # input1 = Variable(torch.arange(0.0, B * C * H * W).view(B, C ,H,W), requires_grad=True) # input2 = Variable(torch.zeros(B,filtersize,H-filtersize+1,W-filtersize+1),requires_grad = True) # input3 = Variable(torch.ones(B,filtersize,H-filtersize+1,W-filtersize+1),requires_grad = True) # # # input1 = Variable(torch.arange(0.0, B * 3 * H * W).view(B, 3,H,W), requires_grad=True) # # input2 = Variable(torch.arange(1, 1+ B * 3 * H * W).view(B , 3, H, W), requires_grad=True) # # input3 = Variable(torch.rand(B, 2, H, W), requires_grad=True) # # input4 = Variable(torch.rand(B, 2, H,W), requires_grad =True) # # input2 = Variable(torch.zeros(12,2,64,64),requires_grad = True) # # input3 = Variable(torch.ones(12,16,64,64),requires_grad = True) # # input2 = Variable(torch.ones(12,///2,64,64) * (-2.1)) # # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1)) # input1.data.uniform_(0, 1) # input2.data.uniform_(0, 1) # input3.data.uniform_(0, 1) # not have to be normalized to 1.0 # # input4.data.uniform_(-1,1) # # # # # # ftimes = [] # # btimes = [] # # for i in range(10): # # input1.data.uniform_(0, 1) # # input2.data.uniform_(-1, 1) # # input3.data.uniform_(0,1) # # input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule # # input2 = Variable(input2.clone().data, requires_grad = True) # # input3 = Variable(input3.clone().data, requires_grad = True) # # ftime, btime = test_FilterInterpolation(input1,input2,input3) # # ftimes.append(ftime) # # btimes.append(btime) # # # # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n") # # # nn.LogSoftmax # # exit(0) # # ftimes = [] # # btimes = [] # # for i in range(10): # # input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule # # input2 = 
Variable(input2.clone().data, requires_grad = True) # # ftime, btime = test_InterpolationModule(input1,input2) # # ftimes.append(ftime) # # btimes.append(btime) # # # # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n") # # # # ftimes = [] # # btimes = [] # # for i in range(10): # # input1.data.uniform_(0, 1) # # input2.data.uniform_(-16, 17) # # input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule # # input2 = Variable(input2.clone().data, requires_grad = True) # # ftime, btime = test_InterpolationChModule(input1,input2) # # ftimes.append(ftime) # # btimes.append(btime) # # # # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n") # # # nn.LogSoftmax # # exit(0) # # # ftimes = [] # btimes = [] # for i in range(3): # input1.data.uniform_(0.0, 1) # input2.data.uniform_(1.0/filtersize, 1.1/filtersize) # input3.data.uniform_(1.0/filtersize, 1.1/filtersize) # not have to be normalized to 1.0 # # input1 = Variable(input1.clone().data, requires_grad=True) # to delete the graph in InterpolationModule # input2 = Variable(input2.clone().data, requires_grad=True) # input3 = Variable(input3.clone().data, requires_grad=True) # # ftime, btime = test_SeparableConvModule(input1, input2, input3,filtersize) # ftime, btime = test_SeparableConvFlowModule(input1, input2, input3,filtersize) # ftimes.append(ftime) # btimes.append(btime) # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) + "s\t" + str( # numpy.array(btimes).mean()) + "s\t") # exit(0) # # # # # for i in range(10): # # input1.data.uniform_(0.14, 0.405) # # input2.data.uniform_(0.14, 0.405) # # input3.data.uniform_(0.2, 0.501) # not have to be normalized to 1.0 # # input4.data.uniform_(0.2, 0.501) # # # # input1 = Variable(input1.clone().data, requires_grad = True) # to delete the 
graph in InterpolationModule # # input2 = Variable(input2.clone().data, requires_grad = True) # # input3 = Variable(input3.clone().data, requires_grad = True) # # input4 = Variable(input4.clone().data, requires_grad = True) # # ftime,btime = test_AdaptiveWeightInterpolationModule(input1,input2,input3,input4) # # ftimes.append(ftime) # # btimes.append(btime) # # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t") # # # input1 = Variable(torch.arange(0.0, 12 * 2 * 64 * 64).view(12, 2, 64, 64), requires_grad=True) # input1.data.uniform_(-1.0,1.0) # # input1 = Variable( - 0.5 * torch.ones(12,2,64,64).type(torch.FloatTensor), requires_grad = True) # # # B,C,H,W = 1,2,512,704 input1 = Variable(torch.arange(0.0, B*C * H * W).view(B, C, H, W), requires_grad=True) input3 = Variable(torch.arange(0.0, B* 3 * H * W).view(B,3, H,W), requires_grad = True) # input2 = Variable(torch.arange(0.0, B * 3 * H * W).view(B, 3 ,H,W), requires_grad=True) input2 = Variable(torch.arange(0.0, B * 1 * H * W).view(B, 1 ,H,W), requires_grad=True) ftimes = [] btimes = [] for i in range(10): input1.data.uniform_(-1.0, 1.0) input2.data.uniform_(0.1, 1.0) # must be larger than zero # input3.data.uniform_(0.0, 1.0) input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule input2 = Variable(input2.clone().data, requires_grad = True) # ftime, btime = test_FlowProjectionModule(input1) ftime,btime =test_DepthFlowProjectionModule(input1,input2) ftimes.append(ftime) btimes.append(btime) print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n") exit(0) ftimes = [] btimes = [] for i in range(10): input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule input2 = Variable(input2.clone().data, requires_grad = True) input3 = Variable(input3.clone().data, requires_grad 
= True) ftime, btime = test_WeightedFlowProjectionModule(input1,input2,input3) ftimes.append(ftime) btimes.append(btime) print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n") ================================================ FILE: networks/DAIN.py ================================================ # -*- coding: utf-8 -*- import torch import torch.nn as nn from my_package.FilterInterpolation import FilterInterpolationModule from my_package.FlowProjection import FlowProjectionModule #,FlowFillholeModule from my_package.DepthFlowProjection import DepthFlowProjectionModule from Stack import Stack import PWCNet import S2D_models import Resblock import MegaDepth import time class DAIN(torch.nn.Module): def __init__(self, channel = 3, filter_size = 4, timestep=0.5, training=True): # base class initialization super(DAIN, self).__init__() self.filter_size = filter_size self.training = training self.timestep = timestep assert (timestep == 0.5) # TODO: or else the WeigtedFlowProjection should also be revised... Really Tedious work. 
self.numFrames =int(1.0/timestep) - 1 i=0 self.initScaleNets_filter,self.initScaleNets_filter1,self.initScaleNets_filter2 = \ self.get_MonoNet5(channel if i == 0 else channel + filter_size * filter_size, filter_size * filter_size, "filter") self.ctxNet = S2D_models.__dict__['S2DF_3dense']() self.ctx_ch = 3 * 64 + 3 self.rectifyNet = Resblock.__dict__['MultipleBasicBlock_4'](3 + 3 + 3 +2*1+ 2*2 +16*2+ 2 * self.ctx_ch,128) self._initialize_weights() if self.training: self.flownets = PWCNet.__dict__['pwc_dc_net']("PWCNet/pwc_net.pth.tar") else: self.flownets = PWCNet.__dict__['pwc_dc_net']() self.div_flow = 20.0 #extract depth information if self.training: self.depthNet=MegaDepth.__dict__['HourGlass']("MegaDepth/checkpoints/test_local/best_generalization_net_G.pth") else: self.depthNet=MegaDepth.__dict__['HourGlass']() return def _initialize_weights(self): count = 0 for m in self.modules(): if isinstance(m, nn.Conv2d): # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels # m.weight.data.normal_(0, math.sqrt(2. 
/ n)) # print(m) count+=1 # print(count) # weight_init.xavier_uniform(m.weight.data) nn.init.xavier_uniform_(m.weight.data) # weight_init.kaiming_uniform(m.weight.data, a = 0, mode='fan_in') if m.bias is not None: m.bias.data.zero_() elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() elif isinstance(m, nn.Linear): m.weight.data.normal_(0, 0.01) m.bias.data.zero_() # else: # print(m) def forward(self, input): """ Parameters ---------- input: shape (3, batch, 3, width, height) ----------- """ losses = [] offsets= [] filters = [] occlusions = [] device = torch.cuda.current_device() # s1 = torch.cuda.Stream(device=device, priority=5) # s2 = torch.cuda.Stream(device=device, priority=10) #PWC-Net is slow, need to have higher priority s1 = torch.cuda.current_stream() s2 = torch.cuda.current_stream() ''' STEP 1: sequeeze the input ''' if self.training == True: assert input.size(0) == 3 input_0,input_1,input_2 = torch.squeeze(input,dim=0) else: assert input.size(0) ==2 input_0,input_2 = torch.squeeze(input,dim=0) #prepare the input data of current scale cur_input_0 = input_0 if self.training == True: cur_input_1 = input_1 cur_input_2 = input_2 ''' STEP 3.2: concatenating the inputs. 
''' cur_offset_input = torch.cat((cur_input_0, cur_input_2), dim=1) cur_filter_input = cur_offset_input # torch.cat((cur_input_0, cur_input_2), dim=1) ''' STEP 3.3: perform the estimation by the Three subpath Network ''' time_offsets = [ kk * self.timestep for kk in range(1, 1+self.numFrames,1)] with torch.cuda.stream(s1): temp = self.depthNet(torch.cat((cur_filter_input[:, :3, ...], cur_filter_input[:, 3:, ...]),dim=0)) log_depth = [temp[:cur_filter_input.size(0)], temp[cur_filter_input.size(0):]] cur_ctx_output = [ torch.cat((self.ctxNet(cur_filter_input[:, :3, ...]), log_depth[0].detach()), dim=1), torch.cat((self.ctxNet(cur_filter_input[:, 3:, ...]), log_depth[1].detach()), dim=1) ] temp = self.forward_singlePath(self.initScaleNets_filter, cur_filter_input, 'filter') cur_filter_output = [self.forward_singlePath(self.initScaleNets_filter1, temp, name=None), self.forward_singlePath(self.initScaleNets_filter2, temp, name=None)] depth_inv = [1e-6 + 1 / torch.exp(d) for d in log_depth] with torch.cuda.stream(s2): for _ in range(1): cur_offset_outputs = [ self.forward_flownets(self.flownets, cur_offset_input, time_offsets=time_offsets), self.forward_flownets(self.flownets, torch.cat((cur_offset_input[:, 3:, ...], cur_offset_input[:, 0:3, ...]), dim=1), time_offsets=time_offsets[::-1]) ] torch.cuda.synchronize() #synchronize s1 and s2 cur_offset_outputs = [ self.FlowProject(cur_offset_outputs[0],depth_inv[0]), self.FlowProject(cur_offset_outputs[1],depth_inv[1]) ] ''' STEP 3.4: perform the frame interpolation process ''' cur_offset_output = [cur_offset_outputs[0][0], cur_offset_outputs[1][0]] ctx0,ctx2 = self.FilterInterpolate_ctx(cur_ctx_output[0],cur_ctx_output[1], cur_offset_output,cur_filter_output) cur_output,ref0,ref2 = self.FilterInterpolate(cur_input_0, cur_input_2,cur_offset_output,cur_filter_output,self.filter_size**2) rectify_input = torch.cat((cur_output,ref0,ref2, cur_offset_output[0],cur_offset_output[1], cur_filter_output[0],cur_filter_output[1], 
ctx0,ctx2 ),dim =1) cur_output_rectified = self.rectifyNet(rectify_input) + cur_output ''' STEP 3.5: for training phase, we collect the variables to be penalized. ''' if self.training == True: losses +=[cur_output - cur_input_1] losses += [cur_output_rectified - cur_input_1] offsets +=[cur_offset_output] filters += [cur_filter_output] ''' STEP 4: return the results ''' if self.training == True: # if in the training phase, we output the losses to be minimized. # return losses, loss_occlusion return losses, offsets,filters,occlusions else: cur_outputs = [cur_output,cur_output_rectified] return cur_outputs,cur_offset_output,cur_filter_output def forward_flownets(self, model, input, time_offsets = None): if time_offsets == None : time_offsets = [0.5] elif type(time_offsets) == float: time_offsets = [time_offsets] elif type(time_offsets) == list: pass temp = model(input) # this is a single direction motion results, but not a bidirectional one temps = [self.div_flow * temp * time_offset for time_offset in time_offsets]# single direction to bidirection should haven it. temps = [nn.Upsample(scale_factor=4, mode='bilinear')(temp) for temp in temps]# nearest interpolation won't be better i think return temps '''keep this function''' def forward_singlePath(self, modulelist, input, name): stack = Stack() k = 0 temp = [] for layers in modulelist: # self.initScaleNets_offset: # print(type(layers).__name__) # print(k) # if k == 27: # print(k) # pass # use the pop-pull logic, looks like a stack. 
if k == 0: temp = layers(input) else: # met a pooling layer, take its input if isinstance(layers, nn.AvgPool2d) or isinstance(layers,nn.MaxPool2d): stack.push(temp) temp = layers(temp) # met a unpooling layer, take its output if isinstance(layers, nn.Upsample): if name == 'offset': temp = torch.cat((temp,stack.pop()),dim=1) # short cut here, but optical flow should concat instead of add else: temp += stack.pop() # short cut here, but optical flow should concat instead of add k += 1 return temp '''keep this funtion''' def get_MonoNet5(self, channel_in, channel_out, name): ''' Generally, the MonoNet is aimed to provide a basic module for generating either offset, or filter, or occlusion. :param channel_in: number of channels that composed of multiple useful information like reference frame, previous coarser-scale result :param channel_out: number of output the offset or filter or occlusion :param name: to distinguish between offset, filter and occlusion, since they should use different activations in the last network layer :return: output the network model ''' model = [] # block1 model += self.conv_relu(channel_in * 2, 16, (3, 3), (1, 1)) model += self.conv_relu_maxpool(16, 32, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.5 # block2 model += self.conv_relu_maxpool(32, 64, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.4 # block3 model += self.conv_relu_maxpool(64, 128, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.3 # block4 model += self.conv_relu_maxpool(128, 256, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.2 # block5 model += self.conv_relu_maxpool(256, 512, (3, 3), (1, 1), (2, 2)) # intermediate block5_5 model += self.conv_relu(512, 512, (3, 3), (1, 1)) # block 6 model += self.conv_relu_unpool(512, 256, (3, 3), (1, 1), 2) # THE OUTPUT No.1 UP # block 7 model += self.conv_relu_unpool(256, 128, (3, 3), (1, 1), 2) # THE OUTPUT No.2 UP # block 8 model += self.conv_relu_unpool(128, 64, (3, 3), (1, 1), 2) # THE OUTPUT No.3 UP # block 9 model += self.conv_relu_unpool(64, 32, (3, 3), (1, 1), 
2) # THE OUTPUT No.4 UP # block 10 model += self.conv_relu_unpool(32, 16, (3, 3), (1, 1), 2) # THE OUTPUT No.5 UP # output our final purpose branch1 = [] branch2 = [] branch1 += self.conv_relu_conv(16, channel_out, (3, 3), (1, 1)) branch2 += self.conv_relu_conv(16, channel_out, (3, 3), (1, 1)) return (nn.ModuleList(model), nn.ModuleList(branch1), nn.ModuleList(branch2)) '''keep this function''' @staticmethod def FlowProject(inputs, depth = None): if depth is not None: outputs = [DepthFlowProjectionModule(input.requires_grad)(input,depth) for input in inputs] else: outputs = [ FlowProjectionModule(input.requires_grad)(input) for input in inputs] return outputs '''keep this function''' @staticmethod def FilterInterpolate_ctx(ctx0,ctx2,offset,filter): ##TODO: which way should I choose ctx0_offset = FilterInterpolationModule()(ctx0,offset[0].detach(),filter[0].detach()) ctx2_offset = FilterInterpolationModule()(ctx2,offset[1].detach(),filter[1].detach()) return ctx0_offset, ctx2_offset # ctx0_offset = FilterInterpolationModule()(ctx0.detach(), offset[0], filter[0]) # ctx2_offset = FilterInterpolationModule()(ctx2.detach(), offset[1], filter[1]) # # return ctx0_offset, ctx2_offset '''Keep this function''' @staticmethod def FilterInterpolate(ref0, ref2, offset, filter,filter_size2): ref0_offset = FilterInterpolationModule()(ref0, offset[0],filter[0]) ref2_offset = FilterInterpolationModule()(ref2, offset[1],filter[1]) return ref0_offset/2.0 + ref2_offset/2.0, ref0_offset,ref2_offset '''keep this function''' @staticmethod def conv_relu_conv(input_filter, output_filter, kernel_size, padding): # we actually don't need to use so much layer in the last stages. 
layers = nn.Sequential( nn.Conv2d(input_filter, input_filter, kernel_size, 1, padding), nn.ReLU(inplace=False), nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding), # nn.ReLU(inplace=False), # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding), # nn.ReLU(inplace=False), # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding), ) return layers '''keep this fucntion''' @staticmethod def conv_relu(input_filter, output_filter, kernel_size, padding): layers = nn.Sequential(*[ nn.Conv2d(input_filter,output_filter,kernel_size,1, padding), nn.ReLU(inplace=False) ]) return layers '''keep this function''' @staticmethod def conv_relu_maxpool(input_filter, output_filter, kernel_size, padding,kernel_size_pooling): layers = nn.Sequential(*[ nn.Conv2d(input_filter,output_filter,kernel_size,1, padding), nn.ReLU(inplace=False), # nn.BatchNorm2d(output_filter), nn.MaxPool2d(kernel_size_pooling) ]) return layers '''klkeep this function''' @staticmethod def conv_relu_unpool(input_filter, output_filter, kernel_size, padding,unpooling_factor): layers = nn.Sequential(*[ nn.Upsample(scale_factor=unpooling_factor, mode='bilinear'), nn.Conv2d(input_filter,output_filter,kernel_size,1, padding), nn.ReLU(inplace=False), # nn.BatchNorm2d(output_filter), # nn.UpsamplingBilinear2d(unpooling_size,scale_factor=unpooling_size[0]) ]) return layers ================================================ FILE: networks/DAIN_slowmotion.py ================================================ # -*- coding: utf-8 -*- import torch import torch.nn as nn from my_package.FilterInterpolation import FilterInterpolationModule from my_package.FlowProjection import FlowProjectionModule #,FlowFillholeModule from my_package.DepthFlowProjection import DepthFlowProjectionModule from Stack import Stack import PWCNet import S2D_models import Resblock import MegaDepth import time class DAIN_slowmotion(torch.nn.Module): def __init__(self, channel = 3, filter_size = 4, timestep=0.5, training=True): 
# base class initialization super(DAIN_slowmotion, self).__init__() self.filter_size = filter_size self.training = training self.timestep = timestep self.numFrames =int(1.0/timestep) - 1 print("Interpolate " +str( self.numFrames )+ " frames") i = 0 self.initScaleNets_filter,self.initScaleNets_filter1,self.initScaleNets_filter2 = \ self.get_MonoNet5(channel if i == 0 else channel + filter_size * filter_size, filter_size * filter_size, "filter") self.ctxNet = S2D_models.__dict__['S2DF_3dense']() self.ctx_ch = 3 * 64 + 3 self.rectifyNet = Resblock.__dict__['MultipleBasicBlock_4'](3 + 3 + 3 +2*1+ 2*2 +16*2+ 2 * self.ctx_ch,128) self._initialize_weights() if self.training: self.flownets = PWCNet.__dict__['pwc_dc_net']("PWCNet/pwc_net.pth.tar") else: self.flownets = PWCNet.__dict__['pwc_dc_net']() self.div_flow = 20.0 #extract depth information if self.training: self.depthNet=MegaDepth.__dict__['HourGlass']("MegaDepth/checkpoints/test_local/best_generalization_net_G.pth") else: self.depthNet=MegaDepth.__dict__['HourGlass']() return def _initialize_weights(self): count = 0 for m in self.modules(): if isinstance(m, nn.Conv2d): # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels # m.weight.data.normal_(0, math.sqrt(2. 
/ n)) # print(m) count+=1 # print(count) # weight_init.xavier_uniform(m.weight.data) nn.init.xavier_uniform_(m.weight.data) # weight_init.kaiming_uniform(m.weight.data, a = 0, mode='fan_in') if m.bias is not None: m.bias.data.zero_() elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() elif isinstance(m, nn.Linear): m.weight.data.normal_(0, 0.01) m.bias.data.zero_() # else: # print(m) def forward(self, input): """ Parameters ---------- input: shape (3, batch, 3, width, height) ----------- """ losses = [] offsets= [] filters = [] occlusions = [] device = torch.cuda.current_device() # s1 = torch.cuda.Stream(device=device, priority=5) # s2 = torch.cuda.Stream(device=device, priority=10) #PWC-Net is slow, need to have higher priority s1 = torch.cuda.current_stream() s2 = torch.cuda.current_stream() ''' STEP 1: sequeeze the input ''' if self.training == True: assert input.size(0) == 3 input_0,input_1,input_2 = torch.squeeze(input,dim=0) else: assert input.size(0) ==2 input_0,input_2 = torch.squeeze(input,dim=0) #prepare the input data of current scale cur_input_0 = input_0 if self.training == True: cur_input_1 = input_1 cur_input_2 = input_2 ''' STEP 3.2: concatenating the inputs. 
''' cur_offset_input = torch.cat((cur_input_0, cur_input_2), dim=1) cur_filter_input = cur_offset_input # torch.cat((cur_input_0, cur_input_2), dim=1) ''' STEP 3.3: perform the estimation by the Three subpath Network ''' time_offsets = [ kk * self.timestep for kk in range(1, 1+self.numFrames,1)] with torch.cuda.stream(s1): temp = self.depthNet(torch.cat((cur_filter_input[:, :3, ...], cur_filter_input[:, 3:, ...]),dim=0)) log_depth = [temp[:cur_filter_input.size(0)], temp[cur_filter_input.size(0):]] cur_ctx_output = [ torch.cat((self.ctxNet(cur_filter_input[:, :3, ...]), log_depth[0].detach()), dim=1), torch.cat((self.ctxNet(cur_filter_input[:, 3:, ...]), log_depth[1].detach()), dim=1) ] temp = self.forward_singlePath(self.initScaleNets_filter, cur_filter_input, 'filter') cur_filter_output = [self.forward_singlePath(self.initScaleNets_filter1, temp, name=None), self.forward_singlePath(self.initScaleNets_filter2, temp, name=None)] depth_inv = [1e-6 + 1 / torch.exp(d) for d in log_depth] with torch.cuda.stream(s2): for _ in range(1): cur_offset_outputs = [ self.forward_flownets(self.flownets, cur_offset_input, time_offsets=time_offsets), self.forward_flownets(self.flownets, torch.cat((cur_offset_input[:, 3:, ...], cur_offset_input[:, 0:3, ...]), dim=1), time_offsets=[1 - t for t in time_offsets]) ] torch.cuda.synchronize() #synchronize s1 and s2 cur_offset_outputs = [ self.FlowProject(cur_offset_outputs[0],depth_inv[0]), self.FlowProject(cur_offset_outputs[1],depth_inv[1]) ] ''' STEP 3.4: perform the frame interpolation process ''' cur_output_rectified = [] cur_output = [] for temp_0,temp_1, timeoffset in zip(cur_offset_outputs[0], cur_offset_outputs[1], time_offsets): cur_offset_output = [temp_0,temp_1] #[cur_offset_outputs[0][0], cur_offset_outputs[1][0]] ctx0,ctx2 = self.FilterInterpolate_ctx(cur_ctx_output[0],cur_ctx_output[1], cur_offset_output,cur_filter_output, timeoffset) cur_output_temp ,ref0,ref2 = self.FilterInterpolate(cur_input_0, 
cur_input_2,cur_offset_output, cur_filter_output,self.filter_size**2, timeoffset) cur_output.append(cur_output_temp) rectify_input = torch.cat((cur_output_temp,ref0,ref2, cur_offset_output[0],cur_offset_output[1], cur_filter_output[0],cur_filter_output[1], ctx0,ctx2 ),dim =1) cur_output_rectified_temp = self.rectifyNet(rectify_input) + cur_output_temp cur_output_rectified.append(cur_output_rectified_temp) ''' STEP 3.5: for training phase, we collect the variables to be penalized. ''' if self.training == True: losses +=[cur_output - cur_input_1] losses += [cur_output_rectified - cur_input_1] offsets +=[cur_offset_output] filters += [cur_filter_output] ''' STEP 4: return the results ''' if self.training == True: # if in the training phase, we output the losses to be minimized. # return losses, loss_occlusion return losses, offsets,filters,occlusions else: cur_outputs = [cur_output,cur_output_rectified] return cur_outputs,cur_offset_output,cur_filter_output def forward_flownets(self, model, input, time_offsets = None): if time_offsets == None : time_offsets = [0.5] elif type(time_offsets) == float: time_offsets = [time_offsets] elif type(time_offsets) == list: pass temp = model(input) # this is a single direction motion results, but not a bidirectional one temps = [self.div_flow * temp * time_offset for time_offset in time_offsets]# single direction to bidirection should haven it. temps = [nn.Upsample(scale_factor=4, mode='bilinear')(temp) for temp in temps]# nearest interpolation won't be better i think return temps '''keep this function''' def forward_singlePath(self, modulelist, input, name): stack = Stack() k = 0 temp = [] for layers in modulelist: # self.initScaleNets_offset: # print(type(layers).__name__) # print(k) # if k == 27: # print(k) # pass # use the pop-pull logic, looks like a stack. 
if k == 0: temp = layers(input) else: # met a pooling layer, take its input if isinstance(layers, nn.AvgPool2d) or isinstance(layers,nn.MaxPool2d): stack.push(temp) temp = layers(temp) # met a unpooling layer, take its output if isinstance(layers, nn.Upsample): if name == 'offset': temp = torch.cat((temp,stack.pop()),dim=1) # short cut here, but optical flow should concat instead of add else: temp += stack.pop() # short cut here, but optical flow should concat instead of add k += 1 return temp '''keep this funtion''' def get_MonoNet5(self, channel_in, channel_out, name): ''' Generally, the MonoNet is aimed to provide a basic module for generating either offset, or filter, or occlusion. :param channel_in: number of channels that composed of multiple useful information like reference frame, previous coarser-scale result :param channel_out: number of output the offset or filter or occlusion :param name: to distinguish between offset, filter and occlusion, since they should use different activations in the last network layer :return: output the network model ''' model = [] # block1 model += self.conv_relu(channel_in * 2, 16, (3, 3), (1, 1)) model += self.conv_relu_maxpool(16, 32, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.5 # block2 model += self.conv_relu_maxpool(32, 64, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.4 # block3 model += self.conv_relu_maxpool(64, 128, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.3 # block4 model += self.conv_relu_maxpool(128, 256, (3, 3), (1, 1), (2, 2)) # THE OUTPUT No.2 # block5 model += self.conv_relu_maxpool(256, 512, (3, 3), (1, 1), (2, 2)) # intermediate block5_5 model += self.conv_relu(512, 512, (3, 3), (1, 1)) # block 6 model += self.conv_relu_unpool(512, 256, (3, 3), (1, 1), 2) # THE OUTPUT No.1 UP # block 7 model += self.conv_relu_unpool(256, 128, (3, 3), (1, 1), 2) # THE OUTPUT No.2 UP # block 8 model += self.conv_relu_unpool(128, 64, (3, 3), (1, 1), 2) # THE OUTPUT No.3 UP # block 9 model += self.conv_relu_unpool(64, 32, (3, 3), (1, 1), 
2) # THE OUTPUT No.4 UP # block 10 model += self.conv_relu_unpool(32, 16, (3, 3), (1, 1), 2) # THE OUTPUT No.5 UP # output our final purpose branch1 = [] branch2 = [] branch1 += self.conv_relu_conv(16, channel_out, (3, 3), (1, 1)) branch2 += self.conv_relu_conv(16, channel_out, (3, 3), (1, 1)) return (nn.ModuleList(model), nn.ModuleList(branch1), nn.ModuleList(branch2)) '''keep this function''' @staticmethod def FlowProject(inputs, depth = None): if depth is not None: outputs = [DepthFlowProjectionModule(input.requires_grad)(input,depth) for input in inputs] else: outputs = [ FlowProjectionModule(input.requires_grad)(input) for input in inputs] return outputs '''keep this function''' @staticmethod def FilterInterpolate_ctx(ctx0,ctx2,offset,filter, timeoffset): ##TODO: which way should I choose ctx0_offset = FilterInterpolationModule()(ctx0,offset[0].detach(),filter[0].detach()) ctx2_offset = FilterInterpolationModule()(ctx2,offset[1].detach(),filter[1].detach()) return ctx0_offset, ctx2_offset # ctx0_offset = FilterInterpolationModule()(ctx0.detach(), offset[0], filter[0]) # ctx2_offset = FilterInterpolationModule()(ctx2.detach(), offset[1], filter[1]) # # return ctx0_offset, ctx2_offset '''Keep this function''' @staticmethod def FilterInterpolate(ref0, ref2, offset, filter,filter_size2, time_offset): ref0_offset = FilterInterpolationModule()(ref0, offset[0],filter[0]) ref2_offset = FilterInterpolationModule()(ref2, offset[1],filter[1]) # occlusion0, occlusion2 = torch.split(occlusion, 1, dim=1) # print((occlusion0[0,0,1,1] + occlusion2[0,0,1,1])) # output = (occlusion0 * ref0_offset + occlusion2 * ref2_offset) / (occlusion0 + occlusion2) # output = * ref0_offset + occlusion[1] * ref2_offset # automatically broadcasting the occlusion to the three channels of and image. 
# return output # return ref0_offset/2.0 + ref2_offset/2.0, ref0_offset,ref2_offset return ref0_offset*(1.0 - time_offset) + ref2_offset*(time_offset), ref0_offset, ref2_offset '''keep this function''' @staticmethod def conv_relu_conv(input_filter, output_filter, kernel_size, padding): # we actually don't need to use so much layer in the last stages. layers = nn.Sequential( nn.Conv2d(input_filter, input_filter, kernel_size, 1, padding), nn.ReLU(inplace=False), nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding), # nn.ReLU(inplace=False), # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding), # nn.ReLU(inplace=False), # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding), ) return layers '''keep this fucntion''' @staticmethod def conv_relu(input_filter, output_filter, kernel_size, padding): layers = nn.Sequential(*[ nn.Conv2d(input_filter,output_filter,kernel_size,1, padding), nn.ReLU(inplace=False) ]) return layers '''keep this function''' @staticmethod def conv_relu_maxpool(input_filter, output_filter, kernel_size, padding,kernel_size_pooling): layers = nn.Sequential(*[ nn.Conv2d(input_filter,output_filter,kernel_size,1, padding), nn.ReLU(inplace=False), # nn.BatchNorm2d(output_filter), nn.MaxPool2d(kernel_size_pooling) ]) return layers '''klkeep this function''' @staticmethod def conv_relu_unpool(input_filter, output_filter, kernel_size, padding,unpooling_factor): layers = nn.Sequential(*[ nn.Upsample(scale_factor=unpooling_factor, mode='bilinear'), nn.Conv2d(input_filter,output_filter,kernel_size,1, padding), nn.ReLU(inplace=False), # nn.BatchNorm2d(output_filter), # nn.UpsamplingBilinear2d(unpooling_size,scale_factor=unpooling_size[0]) ]) return layers ================================================ FILE: networks/__init__.py ================================================ from .DAIN import DAIN from .DAIN_slowmotion import DAIN_slowmotion __all__ = ( 'DAIN', 'DAIN_slowmotion' ) 
================================================ FILE: train.py ================================================ import sys import os import threading import torch from torch.autograd import Variable import torch.utils.data from lr_scheduler import * import numpy from AverageMeter import * from loss_function import * import datasets import balancedsampler import networks from my_args import args def train(): torch.manual_seed(args.seed) model = networks.__dict__[args.netName](channel=args.channels, filter_size = args.filter_size , timestep=args.time_step, training=True) if args.use_cuda: print("Turn the model into CUDA") model = model.cuda() if not args.SAVED_MODEL==None: # args.SAVED_MODEL ='../model_weights/'+ args.SAVED_MODEL + "/best" + ".pth" args.SAVED_MODEL ='./model_weights/best.pth' print("Fine tuning on " + args.SAVED_MODEL) if not args.use_cuda: pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage) # model.load_state_dict(torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)) else: pretrained_dict = torch.load(args.SAVED_MODEL) # model.load_state_dict(torch.load(args.SAVED_MODEL)) #print([k for k,v in pretrained_dict.items()]) model_dict = model.state_dict() # 1. filter out unnecessary keys pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} # 2. overwrite entries in the existing state dict model_dict.update(pretrained_dict) # 3. 
load the new state dict model.load_state_dict(model_dict) pretrained_dict = None if type(args.datasetName) == list: train_sets, test_sets = [],[] for ii, jj in zip(args.datasetName, args.datasetPath): tr_s, te_s = datasets.__dict__[ii](jj, split = args.dataset_split,single = args.single_output, task = args.task) train_sets.append(tr_s) test_sets.append(te_s) train_set = torch.utils.data.ConcatDataset(train_sets) test_set = torch.utils.data.ConcatDataset(test_sets) else: train_set, test_set = datasets.__dict__[args.datasetName](args.datasetPath) train_loader = torch.utils.data.DataLoader( train_set, batch_size = args.batch_size, sampler=balancedsampler.RandomBalancedSampler(train_set, int(len(train_set) / args.batch_size )), num_workers= args.workers, pin_memory=True if args.use_cuda else False) val_loader = torch.utils.data.DataLoader(test_set, batch_size=args.batch_size, num_workers=args.workers, pin_memory=True if args.use_cuda else False) print('{} samples found, {} train samples and {} test samples '.format(len(test_set)+len(train_set), len(train_set), len(test_set))) # if not args.lr == 0: print("train the interpolation net") optimizer = torch.optim.Adamax([ {'params': model.initScaleNets_filter.parameters(), 'lr': args.filter_lr_coe * args.lr}, {'params': model.initScaleNets_filter1.parameters(), 'lr': args.filter_lr_coe * args.lr}, {'params': model.initScaleNets_filter2.parameters(), 'lr': args.filter_lr_coe * args.lr}, {'params': model.ctxNet.parameters(), 'lr': args.ctx_lr_coe * args.lr}, {'params': model.flownets.parameters(), 'lr': args.flow_lr_coe * args.lr}, {'params': model.depthNet.parameters(), 'lr': args.depth_lr_coe * args.lr}, {'params': model.rectifyNet.parameters(), 'lr': args.rectify_lr} ], lr=args.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=args.weight_decay) scheduler = ReduceLROnPlateau(optimizer, 'min',factor=args.factor, patience=args.patience,verbose=True) print("*********Start Training********") print("LR is: "+ 
str(float(optimizer.param_groups[0]['lr']))) print("EPOCH is: "+ str(int(len(train_set) / args.batch_size ))) print("Num of EPOCH is: "+ str(args.numEpoch)) def count_network_parameters(model): parameters = filter(lambda p: p.requires_grad, model.parameters()) N = sum([numpy.prod(p.size()) for p in parameters]) return N print("Num. of model parameters is :" + str(count_network_parameters(model))) if hasattr(model,'flownets'): print("Num. of flow model parameters is :" + str(count_network_parameters(model.flownets))) if hasattr(model,'initScaleNets_occlusion'): print("Num. of initScaleNets_occlusion model parameters is :" + str(count_network_parameters(model.initScaleNets_occlusion) + count_network_parameters(model.initScaleNets_occlusion1) + count_network_parameters(model.initScaleNets_occlusion2))) if hasattr(model,'initScaleNets_filter'): print("Num. of initScaleNets_filter model parameters is :" + str(count_network_parameters(model.initScaleNets_filter) + count_network_parameters(model.initScaleNets_filter1) + count_network_parameters(model.initScaleNets_filter2))) if hasattr(model, 'ctxNet'): print("Num. of ctxNet model parameters is :" + str(count_network_parameters(model.ctxNet))) if hasattr(model, 'depthNet'): print("Num. of depthNet model parameters is :" + str(count_network_parameters(model.depthNet))) if hasattr(model,'rectifyNet'): print("Num. 
of rectifyNet model parameters is :" + str(count_network_parameters(model.rectifyNet))) training_losses = AverageMeter() auxiliary_data = [] saved_total_loss = 10e10 saved_total_PSNR = -1 ikk = 0 for kk in optimizer.param_groups: if kk['lr'] > 0: ikk = kk break for t in range(args.numEpoch): print("The id of this in-training network is " + str(args.uid)) print(args) #Turn into training mode model = model.train() for i, (X0_half,X1_half, y_half) in enumerate(train_loader): if i >= int(len(train_set) / args.batch_size ): #(0 if t == 0 else EPOCH):# break X0_half = X0_half.cuda() if args.use_cuda else X0_half X1_half = X1_half.cuda() if args.use_cuda else X1_half y_half = y_half.cuda() if args.use_cuda else y_half X0 = Variable(X0_half, requires_grad= False) X1 = Variable(X1_half, requires_grad= False) y = Variable(y_half,requires_grad= False) diffs, offsets,filters,occlusions = model(torch.stack((X0,y,X1),dim = 0)) pixel_loss, offset_loss, sym_loss = part_loss(diffs,offsets,occlusions, [X0,X1],epsilon=args.epsilon) total_loss = sum(x*y if x > 0 else 0 for x,y in zip(args.alpha, pixel_loss)) training_losses.update(total_loss.item(), args.batch_size) if i % max(1, int(int(len(train_set) / args.batch_size )/500.0)) == 0: print("Ep [" + str(t) +"/" + str(i) + "]\tl.r.: " + str(round(float(ikk['lr']),7))+ "\tPix: " + str([round(x.item(),5) for x in pixel_loss]) + "\tTV: " + str([round(x.item(),4) for x in offset_loss]) + "\tSym: " + str([round(x.item(), 4) for x in sym_loss]) + "\tTotal: " + str([round(x.item(),5) for x in [total_loss]]) + "\tAvg. 
Loss: " + str([round(training_losses.avg, 5)])) optimizer.zero_grad() total_loss.backward() optimizer.step() if t == 1: # delete the pre validation weights for cleaner workspace if os.path.exists(args.save_path + "/epoch" + str(0) +".pth" ): os.remove(args.save_path + "/epoch" + str(0) +".pth") if os.path.exists(args.save_path + "/epoch" + str(t-1) +".pth"): os.remove(args.save_path + "/epoch" + str(t-1) +".pth") torch.save(model.state_dict(), args.save_path + "/epoch" + str(t) +".pth") # print("\t\t**************Start Validation*****************") #Turn into evaluation mode val_total_losses = AverageMeter() val_total_pixel_loss = AverageMeter() val_total_PSNR_loss = AverageMeter() val_total_tv_loss = AverageMeter() val_total_pws_loss = AverageMeter() val_total_sym_loss = AverageMeter() for i, (X0,X1,y) in enumerate(val_loader): if i >= int(len(test_set)/ args.batch_size): break with torch.no_grad(): X0 = X0.cuda() if args.use_cuda else X0 X1 = X1.cuda() if args.use_cuda else X1 y = y.cuda() if args.use_cuda else y diffs, offsets,filters,occlusions = model(torch.stack((X0,y,X1),dim = 0)) pixel_loss, offset_loss,sym_loss = part_loss(diffs, offsets, occlusions, [X0,X1],epsilon=args.epsilon) val_total_loss = sum(x * y for x, y in zip(args.alpha, pixel_loss)) per_sample_pix_error = torch.mean(torch.mean(torch.mean(diffs[args.save_which] ** 2, dim=1),dim=1),dim=1) per_sample_pix_error = per_sample_pix_error.data # extract tensor psnr_loss = torch.mean(20 * torch.log(1.0/torch.sqrt(per_sample_pix_error)))/torch.log(torch.Tensor([10])) # val_total_losses.update(val_total_loss.item(),args.batch_size) val_total_pixel_loss.update(pixel_loss[args.save_which].item(), args.batch_size) val_total_tv_loss.update(offset_loss[0].item(), args.batch_size) val_total_sym_loss.update(sym_loss[0].item(), args.batch_size) val_total_PSNR_loss.update(psnr_loss[0],args.batch_size) print(".",end='',flush=True) print("\nEpoch " + str(int(t)) + "\tlearning rate: " + str(float(ikk['lr'])) + 
"\tAvg Training Loss: " + str(round(training_losses.avg,5)) + "\tValidate Loss: " + str([round(float(val_total_losses.avg), 5)]) + "\tValidate PSNR: " + str([round(float(val_total_PSNR_loss.avg), 5)]) + "\tPixel Loss: " + str([round(float(val_total_pixel_loss.avg), 5)]) + "\tTV Loss: " + str([round(float(val_total_tv_loss.avg), 4)]) + "\tPWS Loss: " + str([round(float(val_total_pws_loss.avg), 4)]) + "\tSym Loss: " + str([round(float(val_total_sym_loss.avg), 4)]) ) auxiliary_data.append([t, float(ikk['lr']), training_losses.avg, val_total_losses.avg, val_total_pixel_loss.avg, val_total_tv_loss.avg,val_total_pws_loss.avg,val_total_sym_loss.avg]) numpy.savetxt(args.log, numpy.array(auxiliary_data), fmt='%.8f', delimiter=',') training_losses.reset() print("\t\tFinished an epoch, Check and Save the model weights") # we check the validation loss instead of training loss. OK~ if saved_total_loss >= val_total_losses.avg: saved_total_loss = val_total_losses.avg torch.save(model.state_dict(), args.save_path + "/best"+".pth") print("\t\tBest Weights updated for decreased validation loss\n") else: print("\t\tWeights Not updated for undecreased validation loss\n") #schdule the learning rate scheduler.step(val_total_losses.avg) print("*********Finish Training********") if __name__ == '__main__': sys.setrecursionlimit(100000)# 0xC00000FD exception for the recursive detach of gradients. threading.stack_size(200000000)# 0xC00000FD exception for the recursive detach of gradients. thread = threading.Thread(target=train) thread.start() thread.join() exit(0)