Repository: baowenbo/DAIN
Branch: master
Commit: 7c727aca5676
Files: 123
Total size: 560.1 KB
Directory structure:
gitextract_7t87l58_/
├── .gitignore
├── AverageMeter.py
├── Colab_DAIN.ipynb
├── LICENSE
├── MegaDepth/
│ ├── LICENSE
│ ├── MegaDepth_model.py
│ ├── README.md
│ ├── SDR_compute.py
│ ├── __init__.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── aligned_data_loader.py
│ │ ├── base_data_loader.py
│ │ ├── data_loader.py
│ │ └── image_folder.py
│ ├── models/
│ │ ├── HG_model.py
│ │ ├── __init__.py
│ │ ├── base_model.py
│ │ └── models.py
│ ├── options/
│ │ ├── __init__.py
│ │ ├── base_options.py
│ │ ├── test_options.py
│ │ └── train_options.py
│ ├── pytorch_DIW_scratch.py
│ ├── rmse_error_main.py
│ └── util/
│ ├── __init__.py
│ ├── html.py
│ ├── image_pool.py
│ ├── png.py
│ ├── util.py
│ └── visualizer.py
├── PWCNet/
│ ├── PWCNet.py
│ ├── __init__.py
│ ├── correlation_package_pytorch1_0/
│ │ ├── __init__.py
│ │ ├── build.sh
│ │ ├── clean.sh
│ │ ├── correlation.py
│ │ ├── correlation_cuda.cc
│ │ ├── correlation_cuda_kernel.cu
│ │ ├── correlation_cuda_kernel.cuh
│ │ └── setup.py
│ └── models/
│ ├── PWCNet.py
│ └── __init__.py
├── README.md
├── Resblock/
│ ├── BasicBlock.py
│ └── __init__.py
├── S2D_models/
│ ├── S2DF.py
│ └── __init__.py
├── Stack.py
├── balancedsampler.py
├── colab_interpolate.py
├── datasets/
│ ├── Vimeo_90K_interp.py
│ ├── __init__.py
│ └── listdatasets.py
├── demo_MiddleBury.py
├── demo_MiddleBury_slowmotion.py
├── environment.yaml
├── loss_function.py
├── lr_scheduler.py
├── my_args.py
├── my_package/
│ ├── DepthFlowProjection/
│ │ ├── DepthFlowProjectionLayer.py
│ │ ├── DepthFlowProjectionModule.py
│ │ ├── __init__.py
│ │ ├── depthflowprojection_cuda.cc
│ │ ├── depthflowprojection_cuda_kernel.cu
│ │ ├── depthflowprojection_cuda_kernel.cuh
│ │ └── setup.py
│ ├── FilterInterpolation/
│ │ ├── FilterInterpolationLayer.py
│ │ ├── FilterInterpolationModule.py
│ │ ├── __init__.py
│ │ ├── filterinterpolation_cuda.cc
│ │ ├── filterinterpolation_cuda_kernel.cu
│ │ ├── filterinterpolation_cuda_kernel.cuh
│ │ └── setup.py
│ ├── FlowProjection/
│ │ ├── FlowProjectionLayer.py
│ │ ├── FlowProjectionModule.py
│ │ ├── __init__.py
│ │ ├── flowprojection_cuda.cc
│ │ ├── flowprojection_cuda_kernel.cu
│ │ ├── flowprojection_cuda_kernel.cuh
│ │ └── setup.py
│ ├── Interpolation/
│ │ ├── InterpolationLayer.py
│ │ ├── InterpolationModule.py
│ │ ├── __init__.py
│ │ ├── interpolation_cuda.cc
│ │ ├── interpolation_cuda_kernel.cu
│ │ ├── interpolation_cuda_kernel.cuh
│ │ └── setup.py
│ ├── InterpolationCh/
│ │ ├── InterpolationChLayer.py
│ │ ├── InterpolationChModule.py
│ │ ├── __init__.py
│ │ ├── interpolationch_cuda.cc
│ │ ├── interpolationch_cuda_kernel.cu
│ │ ├── interpolationch_cuda_kernel.cuh
│ │ └── setup.py
│ ├── MinDepthFlowProjection/
│ │ ├── __init__.py
│ │ ├── minDepthFlowProjectionLayer.py
│ │ ├── minDepthFlowProjectionModule.py
│ │ ├── mindepthflowprojection_cuda.cc
│ │ ├── mindepthflowprojection_cuda_kernel.cu
│ │ ├── mindepthflowprojection_cuda_kernel.cuh
│ │ └── setup.py
│ ├── SeparableConv/
│ │ ├── SeparableConvLayer.py
│ │ ├── SeparableConvModule.py
│ │ ├── __init__.py
│ │ ├── separableconv_cuda.cc
│ │ ├── separableconv_cuda_kernel.cu
│ │ ├── separableconv_cuda_kernel.cuh
│ │ └── setup.py
│ ├── SeparableConvFlow/
│ │ ├── SeparableConvFlowLayer.py
│ │ ├── SeparableConvFlowModule.py
│ │ ├── __init__.py
│ │ ├── separableconvflow_cuda.cc
│ │ ├── separableconvflow_cuda_kernel.cu
│ │ ├── separableconvflow_cuda_kernel.cuh
│ │ └── setup.py
│ ├── build.sh
│ ├── clean.sh
│ ├── compiler_args.py
│ └── test_module.py
├── networks/
│ ├── DAIN.py
│ ├── DAIN_slowmotion.py
│ └── __init__.py
└── train.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Ignore Git here
.git
# But not these files...
# !.gitignore
checkpoints/test_local/opt.txt
PWCNet/pwc_net.pth.tar
MegaDepth/checkpoints/*
model_weights/*
MiddleBurySet/*
.nfs*
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff:
.idea/workspace.xml
.idea/tasks.xml
.idea/dictionaries
.idea/vcs.xml
.idea/jsLibraryMappings.xml
# Sensitive or high-churn files:
.idea/dataSources.ids
.idea/dataSources.xml
.idea/dataSources.local.xml
.idea/sqlDataSources.xml
.idea/dynamic.xml
.idea/uiDesigner.xml
# Gradle:
.idea/gradle.xml
.idea/libraries
# Mongo Explorer plugin:
.idea/mongoSettings.xml
.idea/
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
================================================
FILE: AverageMeter.py
================================================
class AverageMeter(object):
    """Track the most recent value and the running weighted average.

    Public attributes (read directly by callers):
        val:   the value passed to the latest ``update`` call.
        sum:   weighted sum of every value seen since the last ``reset``.
        count: total weight (sum of ``n``) since the last ``reset``.
        avg:   ``sum / count``, or 0 while ``count`` is still zero.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Reset every statistic to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average.

        Args:
            val: the new measurement.
            n: weight of the measurement (e.g. a batch size). Defaults to 1.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. update(x, n=0) on a fresh meter) used to
        # raise ZeroDivisionError; keep the average at 0 in that case.
        self.avg = self.sum / self.count if self.count else 0
================================================
FILE: Colab_DAIN.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Colab_DAIN_new.ipynb",
"private_outputs": true,
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "1pIo4r_Y8cMo"
},
"source": [
"# DAIN Colab"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iGPHW5SOpPe3"
},
"source": [
"*DAIN Colab, v1.6.0*\n",
"\n",
"Based on the [original Colab file](https://github.com/baowenbo/DAIN/issues/44) by btahir. \n",
"\n",
"Enhancements by [Styler00Dollar](https://github.com/styler00dollar) aka \"sudo rm -rf / --no-preserve-root#8353\" on discord and [Alpha](https://github.com/AlphaGit), (Alpha#6137 on Discord). Please do not run this command in your linux terminal. It's rather meant as a joke.\n",
"\n",
"[Styler00Dollar's fork](https://github.com/styler00dollar/DAIN) / [Alpha's fork](https://github.com/AlphaGit/DAIN)\n",
"\n",
"A simple guide:\n",
"- Upload this ` .ipynb` file to your Google Colab.\n",
"- Create a folder inside of Google Drive named \"DAIN\"\n",
"- Change the configurations in the next cell\n",
"- Run cells one by one\n",
"\n",
"Stuff that should be improved:\n",
"- Alpha channel will be removed automatically and won't be added back. Anything related to alpha will be converted to black.\n",
"- Adding configuration to select speed\n",
"- Detect scenes to avoid interpolating scene-changes\n",
"- Auto-resume\n",
"- Copy `start_frame` - `end_frame` audio from original input to final output\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "enKoi0TR2fOD",
"cellView": "form"
},
"source": [
"################# Required Configurations ############################\n",
"\n",
"#@markdown # Required Configuration\n",
"#@markdown Use the values in here to configure what you'd like DAIN to do.\n",
"\n",
"#@markdown ## Input file\n",
"#@markdown Path (relative to the root of your Google Drive) to the input file. For instance, if you save your `example.mkv` file in your Google Drive, inside a `videos` folder, the path would be: `videos/example.mkv`. Currenly videos and gifs are supported.\n",
"INPUT_FILEPATH = \"DAIN/input.mp4\" #@param{type:\"string\"}\n",
"\n",
"#@markdown ## Output file\n",
"#@markdown Output file path: path (relative to the root of your Google Drive) for the output file. It will also determine the filetype in the destination. `.mp4` is recommended for video input, `.gif` for gif inputs.\n",
"OUTPUT_FILE_PATH = \"DAIN/output.mp4\" #@param{type:\"string\"}\n",
"\n",
"################# Optional configurations ############################\n",
"\n",
"#@markdown # Optional Configuration\n",
"#@markdown Parameters below can be left with their defaults, but feel free to adapt them to your needs.\n",
"\n",
"#@markdown ## Target FPS\n",
"#@markdown how many frames per second should the result have. This will determine how many intermediate images are interpolated.\n",
"TARGET_FPS = 60 #@param{type:\"number\"}\n",
"\n",
"#@markdown ## Frame input directory\n",
"#@markdown A path, relative to your GDrive root, where you already have the list of frames in the format 00001.png, 00002.png, etc.\n",
"FRAME_INPUT_DIR = '/content/DAIN/input_frames' #@param{type:\"string\"}\n",
"\n",
"#@markdown ## Frame output directory\n",
"#@markdown A path, relative to your GDrive root, where you want the generated frame.\n",
"FRAME_OUTPUT_DIR = '/content/DAIN/output_frames' #@param{type:\"string\"}\n",
"\n",
"#@markdown ## Start Frame\n",
"#@markdown First frame to consider from the video when processing.\n",
"START_FRAME = 1 #@param{type:\"number\"}\n",
"\n",
"#@markdown ## End Frame\n",
"#@markdown Last frame to consider from the video when processing. To use the whole video use `-1`.\n",
"END_FRAME = -1 #@param{type:\"number\"}\n",
"\n",
"#@markdown ## Seamless playback\n",
"#@markdown Creates a seamless loop by using the first frame as last one as well. Set this to True this if loop is intended.\n",
"SEAMLESS = False #@param{type:\"boolean\"}\n",
"\n",
"#@markdown ## Auto-remove PNG directory\n",
"#@markdown Auto-delete output PNG dir after ffmpeg video creation. Set this to `False` if you want to keep the PNG files.\n",
"AUTO_REMOVE = True #@param{type:\"boolean\"}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "N9cGwalNeyk9",
"cellView": "form"
},
"source": [
"#@title Connect Google Drive\n",
"from google.colab import drive\n",
"drive.mount('/content/gdrive')\n",
"print('Google Drive connected.')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "irzjv1x4e3S4",
"cellView": "form"
},
"source": [
"#@title Check your current GPU\n",
"# If you are lucky, you get 16GB VRAM. If you are not lucky, you get less. VRAM is important. The more VRAM, the higher the maximum resolution will go.\n",
"\n",
"# 16GB: Can handle 720p. 1080p will procude an out-of-memory error. \n",
"# 8GB: Can handle 480p. 720p will produce an out-of-memory error.\n",
"\n",
"!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "UYHTTP91oMvh"
},
"source": [
"# Install dependencies.\n",
"\n",
"This next step may take somewhere between 15-20 minutes. Run this only once at startup.\n",
"\n",
"Look for the \"Finished installing dependencies\" message."
]
},
{
"cell_type": "code",
"metadata": {
"id": "e5AHGetTRacZ",
"cellView": "form"
},
"source": [
"#@title Setup everything. This takes a while. Just wait ~20 minutes in total.\n",
"\n",
"# Install old pytorch to avoid faulty output\n",
"%cd /content/\n",
"!wget -c https://repo.anaconda.com/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n",
"!chmod +x Miniconda3-4.5.4-Linux-x86_64.sh\n",
"!bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local\n",
"!conda install pytorch==1.1 cudatoolkit torchvision -c pytorch -y\n",
"!conda install ipykernel -y\n",
"\n",
"!pip install scipy==1.1.0\n",
"!pip install imageio\n",
"!CUDA_VISIBLE_DEVICES=0\n",
"!sudo apt-get install imagemagick imagemagick-doc\n",
"print(\"Finished installing dependencies.\")\n",
"\n",
"# Clone DAIN sources\n",
"%cd /content\n",
"!git clone -b master --depth 1 https://github.com/baowenbo/DAIN /content/DAIN\n",
"%cd /content/DAIN\n",
"!git log -1\n",
"\n",
"# Building DAIN\n",
"%cd /content/DAIN/my_package/\n",
"!./build.sh\n",
"print(\"Building #1 done.\")\n",
"\n",
"# Building DAIN PyTorch correlation package.\n",
"%cd /content/DAIN/PWCNet/correlation_package_pytorch1_0\n",
"!./build.sh\n",
"print(\"Building #2 done.\")\n",
"\n",
"# Downloading pre-trained model\n",
"%cd /content/DAIN\n",
"!mkdir model_weights\n",
"!wget -O model_weights/best.pth http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zm5kn6vTncL4",
"cellView": "form"
},
"source": [
"#@title Detecting FPS of input file.\n",
"%shell yes | cp -f /content/gdrive/My\\ Drive/{INPUT_FILEPATH} /content/DAIN/\n",
"\n",
"import os\n",
"filename = os.path.basename(INPUT_FILEPATH)\n",
"\n",
"import cv2\n",
"cap = cv2.VideoCapture(f'/content/DAIN/{filename}')\n",
"\n",
"fps = cap.get(cv2.CAP_PROP_FPS)\n",
"print(f\"Input file has {fps} fps\")\n",
"\n",
"if(fps/TARGET_FPS>0.5):\n",
" print(\"Define a higher fps, because there is not enough time for new frames. (Old FPS)/(New FPS) should be lower than 0.5. Interpolation will fail if you try.\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9YNva-GuKq4Y",
"cellView": "form"
},
"source": [
"#@title ffmpeg extract - Generating individual frame PNGs from the source file.\n",
"%shell rm -rf '{FRAME_INPUT_DIR}'\n",
"%shell mkdir -p '{FRAME_INPUT_DIR}'\n",
"\n",
"if (END_FRAME==-1):\n",
" %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=gte(n\\,{START_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\n",
"else:\n",
" %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=between(n\\,{START_FRAME}\\,{END_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\n",
"\n",
"from IPython.display import clear_output\n",
"clear_output()\n",
"\n",
"png_generated_count_command_result = %shell ls '{FRAME_INPUT_DIR}' | wc -l\n",
"frame_count = int(png_generated_count_command_result.output.strip())\n",
"\n",
"import shutil\n",
"if SEAMLESS:\n",
" frame_count += 1\n",
" first_frame = f\"{FRAME_INPUT_DIR}/00001.png\"\n",
" new_last_frame = f\"{FRAME_INPUT_DIR}/{frame_count.zfill(5)}.png\"\n",
" shutil.copyfile(first_frame, new_last_frame)\n",
"\n",
"print(f\"{frame_count} frame PNGs generated.\")\n",
"\n",
"#Checking if PNGs do have alpha\n",
"import subprocess as sp\n",
"%cd {FRAME_INPUT_DIR}\n",
"channels = sp.getoutput('identify -format %[channels] 00001.png')\n",
"print (f\"{channels} detected\")\n",
"\n",
"# Removing alpha if detected\n",
"if \"a\" in channels:\n",
" print(\"Alpha channel detected and will be removed.\")\n",
" print(sp.getoutput('find . -name \"*.png\" -exec convert \"{}\" -alpha off PNG24:\"{}\" \\;'))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "W3rrE7L824gL",
"cellView": "form"
},
"source": [
"#@title Interpolation\n",
"%shell mkdir -p '{FRAME_OUTPUT_DIR}'\n",
"%cd /content/DAIN\n",
"\n",
"!python -W ignore colab_interpolate.py --netName DAIN_slowmotion --time_step {fps/TARGET_FPS} --start_frame 1 --end_frame {frame_count} --frame_input_dir '{FRAME_INPUT_DIR}' --frame_output_dir '{FRAME_OUTPUT_DIR}'"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "TKREDli2IDMV",
"cellView": "form"
},
"source": [
"#@title Create output video\n",
"%cd {FRAME_OUTPUT_DIR}\n",
"%shell ffmpeg -y -r {TARGET_FPS} -f image2 -pattern_type glob -i '*.png' '/content/gdrive/My Drive/{OUTPUT_FILE_PATH}'\n",
"\n",
"if(AUTO_REMOVE):\n",
" !rm -rf {FRAME_OUTPUT_DIR}/*\n",
"\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "UF5TEo5N374o",
"cellView": "form"
},
"source": [
"#@title [Experimental] Create video with sound\n",
"# Only run this, if the original had sound.\n",
"%cd {FRAME_OUTPUT_DIR}\n",
"%shell ffmpeg -i '/content/DAIN/{filename}' -acodec copy output-audio.aac\n",
"%shell ffmpeg -y -r {TARGET_FPS} -f image2 -pattern_type glob -i '*.png' -i output-audio.aac -shortest '/content/gdrive/My Drive/{OUTPUT_FILE_PATH}'\n",
"\n",
"if (AUTO_REMOVE):\n",
" !rm -rf {FRAME_OUTPUT_DIR}/*\n",
" !rm -rf output-audio.aac"
],
"execution_count": null,
"outputs": []
}
]
}
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2019 Wenbo Bao
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MegaDepth/LICENSE
================================================
MIT License
Copyright (c) 2018 Zhengqi Li
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MegaDepth/MegaDepth_model.py
================================================
import torch
import sys
from torch.autograd import Variable
import numpy as np
from .options.train_options import TrainOptions
from .models.models import create_model
__all__ = ['HourGlass']
def HourGlass(pretrained=None):
    """Build the MegaDepth model and return its ``netG`` network.

    Options are obtained from ``TrainOptions().parse()`` and passed to
    ``create_model`` together with ``pretrained``.

    Args:
        pretrained: forwarded unchanged to ``create_model``; defaults to
            ``None``. NOTE(review): presumably selects pretrained weights -
            confirm against ``models.models.create_model``.

    Returns:
        The ``netG`` attribute of the object returned by ``create_model``.
    """
    opt = TrainOptions().parse()
    model = create_model(opt, pretrained)
    # netG is the real nn.Module
    return model.netG
================================================
FILE: MegaDepth/README.md
================================================
# MegaDepth: Learning Single-View Depth Prediction from Internet Photos
This is the code for the algorithm described in "MegaDepth: Learning Single-View Depth Prediction from Internet Photos, Z. Li and N. Snavely, CVPR 2018". The code skeleton is based on "https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix". If you use our code or models for academic purposes, please consider citing:
@inproceedings{MDLi18,
title={MegaDepth: Learning Single-View Depth Prediction from Internet Photos},
author={Zhengqi Li and Noah Snavely},
booktitle={Computer Vision and Pattern Recognition (CVPR)},
year={2018}
}
#### Examples of single-view depth predictions on the photos we randomly downloaded from Internet:
#### Dependencies:
* The code was written in Pytorch 0.2 and Python 2.7, but it should be easy to adapt it to Python 3 and latest Pytorch version if needed.
* You might need skimage, h5py libraries installed for python before running the code.
#### Single-view depth prediction on any Internet photo:
* Download pretrained model from: http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth and put it in "checkpoints/test_local/best_generalization_net_G.pth"
* In python file "models/HG_model.py", in init function, change to "model_parameters = self.load_network(model, 'G', 'best_generalization')"
* run demo code
```bash
python demo.py
```
You should see an inverse depth prediction saved as demo.png from an original photo demo.jpg. If you want to use RGB maps for visualization, like the figures in our paper, you have to install/run semantic segmentation from https://github.com/kazuto1011/pspnet-pytorch trained on ADE20K to mask out sky, because inconsistent depth prediction of unmasked sky will not make RGB visualization reasonable.
#### Evaluation on the MegaDepth test splits:
* Download MegaDepth V1 dataset from project website: http://www.cs.cornell.edu/projects/megadepth/.
* Download pretrained model (specific for MD dataset) from http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_vanila_net_G.pth and put it in "checkpoints/test_local/best_vanila_net_G.pth"
* Download test list files from http://www.cs.cornell.edu/projects/megadepth/dataset/data_lists/test_lists.tar.gz, it should include two folders corresponding to images with landscape and portrait orientations.
* To compute scale-invariant RMSE on the MD test set, change the variable "dataset_root" in python file "rmse_error_main.py" to the root directory of MegaDepth_v1 folder, and change variable "test_list_dir_l" and "test_list_dir_p" to corresponding folder paths of test lists, and run:
```bash
python rmse_error_main.py
```
* To compute Structure from Motion Disagreement Rate (SDR), change the variable "dataset_root" in python file "SDR_compute.py" to the root directory of MegaDepth_v1 folder, and change variable "test_list_dir_l" and "test_list_dir_p" to corresponding folder paths of test lists, and run:
```bash
python SDR_compute.py
```
* If you want to run our model on arbitrary Internet photos, please download pretrained model from http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth, which has much better generalization ability (qualitatively speaking) to completely unknown scenes.
================================================
FILE: MegaDepth/SDR_compute.py
================================================
import time
import torch
import sys
from options.train_options import TrainOptions
opt = TrainOptions().parse() # set CUDA_VISIBLE_DEVICES before import torch
from data.data_loader import CreateDataLoader_TEST
from models.models import create_model
dataset_root = "/phoenix/S6/zl548/"
test_list_dir_l = dataset_root + '/MegaDpeth_code/test_list/landscape/'
input_height = 240
input_width = 320
test_data_loader_l = CreateDataLoader_TEST(dataset_root, test_list_dir_l, input_height, input_width)
test_dataset_l = test_data_loader_l.load_data()
test_dataset_size_l = len(test_data_loader_l)
print('========================= test L images = %d' % test_dataset_size_l)
test_list_dir_p = dataset_root + '/MegaDpeth_code/test_list/portrait/'
input_height = 320
input_width = 240
test_data_loader_p = CreateDataLoader_TEST(dataset_root, test_list_dir_p, input_height, input_width)
test_dataset_p = test_data_loader_p.load_data()
test_dataset_size_p = len(test_data_loader_p)
print('========================= test P images = %d' % test_dataset_size_p)
model = create_model(opt)
batch_size = 32
diw_index = 0
total_steps = 0
best_loss = 100
error_list = [0 , 0, 0]
total_list = [0 , 0, 0]
list_l = range(test_dataset_size_l)
list_p = range(test_dataset_size_p)
def test_SDR(model):
    """Run SDR evaluation over the landscape and portrait sets and print it.

    Switches ``model`` to eval mode, then calls ``model.evaluate_SDR`` on each
    batch of the module-level ``test_dataset_l`` followed by
    ``test_dataset_p``. The three error and sample counts it returns are added
    to the module-level ``error_list`` / ``total_list``. Running ratios are
    printed after every batch and a summary is printed at the end.

    Args:
        model: object providing ``switch_to_eval()`` and
            ``evaluate_SDR(images, targets)`` returning ``(error, samples)``,
            each indexable at positions 0, 1 and 2.
    """
    print("============================= TEST SDR============================")
    model.switch_to_eval()

    # Landscape first, then portrait; both feed the same accumulators.
    for split in (test_dataset_l, test_dataset_p):
        for batch in split:
            error, samples = model.evaluate_SDR(batch['img_1'], batch['target_1'])

            for slot in range(3):
                error_list[slot] += error[slot]
                total_list[slot] += samples[slot]

            # Running ratios after each batch.
            print("EQUAL ", error_list[0]/float(total_list[0]))
            print("INEQUAL ", error_list[1]/float(total_list[1]))
            print("TOTAL ",error_list[2]/float(total_list[2]))

    print("=========================================================SDR Summary =====================")
    print("Equal SDR:\t" , float(error_list[0])/ float(total_list[0]))
    print("Unequal SDR:\t" , float(error_list[1])/ float(total_list[1]))
    print("SDR:\t" , float(error_list[2])/ float(total_list[2]))
print("WE ARE TESTING SDR!!!!")
test_SDR(model)
================================================
FILE: MegaDepth/__init__.py
================================================
from .MegaDepth_model import *
================================================
FILE: MegaDepth/data/__init__.py
================================================
================================================
FILE: MegaDepth/data/aligned_data_loader.py
================================================
import random
import numpy as np
import torch.utils.data
from data.base_data_loader import BaseDataLoader
from data.image_folder import ImageFolder
from data.image_folder import ImageFolder_TEST
from builtins import object
import sys
import h5py
class PairedData(object):
    """Iterator adapter that turns ``(image, target)`` batches into dicts.

    Each item produced by the wrapped ``data_loader`` is unpacked into two
    values and returned as ``{'img_1': ..., 'target_1': ...}``. ``iter``
    counts how many times ``__next__`` has been entered since iteration was
    last (re)started.
    """

    def __init__(self, data_loader, flip):
        self.data_loader = data_loader
        # Stored only; nothing in this class reads it.
        self.flip = flip
        self._rewind()

    def _rewind(self):
        """Start a fresh pass over the wrapped loader and zero the counter."""
        self.data_loader_iter = iter(self.data_loader)
        self.iter = 0

    def __iter__(self):
        self._rewind()
        return self

    def __next__(self):
        # The counter is bumped before fetching, so it also counts the final
        # call that ends with StopIteration.
        self.iter += 1
        batch = next(self.data_loader_iter)
        final_img, target_1 = batch
        return dict(img_1=final_img, target_1=target_1)
class AlignedDataLoader(BaseDataLoader):
    """Loader that serves an ``ImageFolder`` dataset in batches of 16.

    The dataset is wrapped in a ``torch.utils.data.DataLoader`` (3 workers,
    shuffling controlled by ``_shuffle``) and exposed through ``PairedData``.
    """

    def __init__(self,_root, _list_dir, _input_height, _input_width, _is_flip, _shuffle):
        image_set = ImageFolder(
            root=_root,
            list_dir=_list_dir,
            input_height=_input_height,
            input_width=_input_width,
            transform=None,
            is_flip=_is_flip
        )
        batch_loader = torch.utils.data.DataLoader(
            image_set, batch_size=16, shuffle=_shuffle, num_workers=3)

        self.dataset = image_set
        # PairedData is always created with flip disabled here.
        self.paired_data = PairedData(batch_loader, False)

    def name(self):
        """Return the identifier string of this loader."""
        return 'RMSEDataLoader'

    def load_data(self):
        """Return the ``PairedData`` iterable over the batches."""
        return self.paired_data

    def __len__(self):
        """Number of samples in the underlying dataset (not batches)."""
        return len(self.dataset)
class AlignedDataLoader_TEST(BaseDataLoader):
    """Loader that serves an ``ImageFolder_TEST`` dataset one sample at a time.

    The dataset is wrapped in a non-shuffling ``torch.utils.data.DataLoader``
    with batch size 1 and 3 workers, and exposed through ``PairedData``.
    """

    def __init__(self,_root, _list_dir, _input_height, _input_width):
        image_set = ImageFolder_TEST(
            root=_root,
            list_dir=_list_dir,
            _input_height=_input_height,
            _input_width=_input_width
        )
        batch_loader = torch.utils.data.DataLoader(
            image_set, batch_size=1, shuffle=False, num_workers=3)

        self.dataset = image_set
        # PairedData is always created with flip disabled here.
        self.paired_data = PairedData(batch_loader, False)

    def name(self):
        """Return the identifier string of this loader."""
        return 'TestSDRDataLoader'

    def load_data(self):
        """Return the ``PairedData`` iterable over the batches."""
        return self.paired_data

    def __len__(self):
        """Number of samples in the underlying dataset (not batches)."""
        return len(self.dataset)
================================================
FILE: MegaDepth/data/base_data_loader.py
================================================
class BaseDataLoader():
    """Minimal base class for the data loaders in this package.

    Subclasses are expected to override ``load_data``.
    """

    def __init__(self):
        pass

    def load_data(self):
        """Return the loaded data; the base implementation returns ``None``.

        The method previously lacked ``self`` and therefore raised
        ``TypeError`` when called on an instance.
        """
        return None
================================================
FILE: MegaDepth/data/data_loader.py
================================================
def CreateDataLoader(_root, _list_dir, _input_height, _input_width, is_flip = True, shuffle = True):
    """Create an ``AlignedDataLoader`` for the given dataset root and list dir.

    Args:
        _root: dataset root passed through to the loader.
        _list_dir: directory (prefix) holding the image/target list files.
        _input_height: height passed through to the loader.
        _input_width: width passed through to the loader.
        is_flip: forwarded as the loader's ``_is_flip`` argument.
        shuffle: forwarded as the loader's ``_shuffle`` argument.

    Returns:
        The constructed ``AlignedDataLoader``.
    """
    # Imported at call time, as in the original implementation.
    from data.aligned_data_loader import AlignedDataLoader
    return AlignedDataLoader(
        _root, _list_dir, _input_height, _input_width,
        _is_flip=is_flip, _shuffle=shuffle)
def CreateDataLoader_TEST(_root, _list_dir, _input_height, _input_width):
    """Create an ``AlignedDataLoader_TEST`` for the given root and list dir.

    All four arguments are passed through unchanged to the loader.

    Returns:
        The constructed ``AlignedDataLoader_TEST``.
    """
    # Imported at call time, as in the original implementation.
    from data.aligned_data_loader import AlignedDataLoader_TEST
    return AlignedDataLoader_TEST(
        _root=_root,
        _list_dir=_list_dir,
        _input_height=_input_height,
        _input_width=_input_width)
================================================
FILE: MegaDepth/data/image_folder.py
================================================
################################################################################
# Code from
# https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py
# Modified the original code so that it also loads images from the current
# directory as well as the subdirectories
################################################################################
import h5py
import torch.utils.data as data
import pickle
import numpy as np
import torch
import os, os.path
import math, random
import sys
from skimage.transform import resize
from skimage import io
def make_dataset(list_dir):
    """Load the pickled image and target lists found under ``list_dir``.

    ``list_dir`` is used as a plain string prefix, so it must already end
    with a path separator: the files read are ``list_dir + "imgs_MD.p"`` and
    ``list_dir + "targets_MD.p"``.

    Args:
        list_dir: directory prefix containing the two pickle files.

    Returns:
        ``(images_list, targets_list)`` exactly as stored in the pickles.

    Raises:
        IOError/OSError: if either file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # this at list files from a trusted source.
    # The with-blocks close each file even if unpickling fails.
    with open(list_dir + "imgs_MD.p", "rb") as images_file:
        images_list = pickle.load(images_file)

    with open(list_dir + "targets_MD.p", "rb") as targets_file:
        targets_list = pickle.load(targets_file)

    return images_list, targets_list
# test for si-RMSE
class ImageFolder(data.Dataset):
    """Dataset of image / depth pairs listed in the pickle files of ``list_dir``.

    Each item is ``(final_img, targets)`` where ``final_img`` is a float
    tensor obtained by moving the last image axis to the front, and
    ``targets`` is a dict with keys ``'path'``, ``'gt_0'`` and ``'mask_0'``.
    """

    def __init__(self, root, list_dir, input_height, input_width, transform=None,
                 loader=None, is_flip = True):
        """Load the image/target lists and remember the resize geometry.

        Args:
            root: dataset root; ``"/MegaDepth_v1/"`` plus the listed relative
                path is appended to it when an item is loaded.
            list_dir: directory prefix handed to ``make_dataset``.
            input_height: height every image and depth map is resized to.
            input_width: width every image and depth map is resized to.
            transform: stored; not used by the methods of this class.
            loader: accepted for interface compatibility; unused.
            is_flip: stored; not used by the methods of this class.

        Raises:
            RuntimeError: if the image list is empty.
        """
        # The lists come from the pickle files read by make_dataset.
        img_list, targets_list = make_dataset(list_dir)
        if len(img_list) == 0:
            # The previous message referenced an undefined IMG_EXTENSIONS and
            # so failed with NameError instead of raising this RuntimeError.
            raise RuntimeError("Found 0 images in: " + root + "\n"
                               "Image list was loaded from: " + list_dir)

        self.root = root
        self.list_dir = list_dir
        self.img_list = img_list
        self.targets_list = targets_list
        self.transform = transform
        self.input_height = input_height
        self.input_width = input_width
        self.is_flip = is_flip

    def load_MD(self, img_path, depth_path):
        """Read one image and its depth map and resize both.

        Args:
            img_path: path of the image file read with ``skimage.io.imread``.
            depth_path: path of the HDF5 file whose ``/depth`` entry holds
                the depth map.

        Returns:
            ``(color_rgb, gt, mask)`` as contiguous arrays: the resized image
            scaled by 1/255, the resized depth map, and a float32 mask that
            is 1 where the resized depth exceeds 1e-8.
        """
        MD_img = np.float32(io.imread(img_path))/255.0

        # Read the depth map fully into memory; the with-block closes the
        # file even when an error occurs (it used to leak in that case).
        with h5py.File(depth_path, 'r') as hdf5_file_read:
            gt = np.array(hdf5_file_read.get('/depth'))

        assert(gt.shape[0] == MD_img.shape[0])
        assert(gt.shape[1] == MD_img.shape[1])

        color_rgb = np.zeros((self.input_height,self.input_width,3))
        MD_img = resize(MD_img, (self.input_height, self.input_width), order = 1)

        if len(MD_img.shape) == 2:
            # Two-dimensional image: replicate it into the three channels.
            color_rgb[:,:,0] = MD_img.copy()
            color_rgb[:,:,1] = MD_img.copy()
            color_rgb[:,:,2] = MD_img.copy()
        else:
            color_rgb = MD_img.copy()

        if np.sum(gt > 1e-8) > 10:
            # Zero out depth values above the 98th / below the 1st percentile
            # of the currently positive depths.
            gt[ gt > np.percentile(gt[gt > 1e-8], 98)] = 0
            gt[ gt < np.percentile(gt[gt > 1e-8], 1)] = 0

        # Normalise by the maximum before resizing and scale back afterwards.
        max_depth = np.max(gt) + 1e-9
        gt = gt/max_depth
        gt = resize(gt, (self.input_height, self.input_width), order = 0)
        gt = gt*max_depth

        mask = np.float32(gt > 1e-8)

        color_rgb = np.ascontiguousarray(color_rgb)
        gt = np.ascontiguousarray(gt)
        mask = np.ascontiguousarray(mask)

        return color_rgb, gt, mask

    def __getitem__(self, index):
        """Return ``(final_img, targets)`` for the sample at ``index``."""
        img_path_suff = self.img_list[index]
        targets_path_suff = self.targets_list[index]

        img_path = self.root + "/MegaDepth_v1/" + img_path_suff
        depth_path = self.root + "/MegaDepth_v1/" + targets_path_suff

        img, gt, mask = self.load_MD(img_path, depth_path)

        # Pixels outside the mask get a depth of 1.0.
        gt[mask < 0.1] = 1.0

        targets_1 = {
            'path': targets_path_suff,
            'gt_0': torch.from_numpy(gt).float(),
            'mask_0': torch.from_numpy(mask).float(),
        }

        final_img = torch.from_numpy( np.transpose(img, (2,0,1)) ).contiguous().float()

        return final_img, targets_1

    def __len__(self):
        """Number of listed images."""
        return len(self.img_list)
# Test for SDR
class ImageFolder_TEST(data.Dataset):
    """Dataset yielding resized images with sparse ordinal point pairs (SDR test).

    Images are read from ``<root>/MegaDepth_v1/`` and the point pairs from
    ``<root>/sparse_features/<folder>/<image name>.h5``.
    """

    def __init__(self, root, list_dir, _input_height, _input_width):
        """Load the image list and remember the resize target.

        Args:
            root: dataset root directory (string).
            list_dir: prefix passed to ``make_dataset``.
            _input_height: height every image is resized to.
            _input_width: width every image is resized to.

        Raises:
            RuntimeError: if the image list is empty.
        """
        # Only the image list is used by this dataset.
        img_list, _ = make_dataset(list_dir)
        if len(img_list) == 0:
            # The old message joined IMG_EXTENSIONS, a name that is not defined
            # in this module, so this path raised NameError instead.
            raise RuntimeError("Found 0 images in: " + root + "\n"
                               "Image list was loaded from: " + list_dir + "imgs_MD.p")

        self.root = root
        self.list_dir = list_dir
        self.img_list = img_list
        self.input_height = _input_height
        self.input_width = _input_width
        self.half_window = 1

    def load_SfM_ORD(self, img_path, targets_path):
        """Read one image and its sparse ordinal annotations.

        Args:
            img_path: path of the image file.
            targets_path: path of the HDF5 file holding ``/SfM_features``.

        Returns:
            ``(color_rgb, y_A, x_A, y_B, x_B, ord_)``. For a 4-channel image the
            annotation file is not read and the five trailing values are the
            integer 0.
        """
        sfm_image = np.float32(io.imread(img_path))/255.0
        resized_sfm_img = resize(sfm_image, (self.input_height, self.input_width), order = 1)

        color_rgb = np.zeros((self.input_height, self.input_width,3))

        if len(sfm_image.shape) == 2:
            # Grayscale input: replicate the single channel three times.
            color_rgb[:,:,0] = resized_sfm_img.copy()
            color_rgb[:,:,1] = resized_sfm_img.copy()
            color_rgb[:,:,2] = resized_sfm_img.copy()
        else:
            color_rgb = resized_sfm_img.copy()

        if color_rgb.shape[2] == 4:
            # Sentinel return for 4-channel images; callers must check for it.
            return color_rgb, 0, 0 ,0, 0, 0

        # np.array copies the dataset into memory; ``with`` closes the file
        # even if reading fails.
        with h5py.File(targets_path,'r') as hdf5_file_read:
            gt = np.array(hdf5_file_read.get('/SfM_features'))

        # Rows 0-3 are scaled by the target size and rounded — presumably they
        # hold normalised (y, x) coordinates of the two points; TODO confirm.
        y_A = np.round( gt[0,:] * float(self.input_height) )
        x_A = np.round( gt[1,:] * float(self.input_width) )
        y_B = np.round( gt[2,:] * float(self.input_height) )
        x_B = np.round( gt[3,:] * float(self.input_width) )
        ord_ = gt[4,:]

        return color_rgb, y_A, x_A ,y_B, x_B, ord_

    def __getitem__(self, index):
        """Return ``(image, targets)`` for sample *index*.

        ``targets['has_SfM_feature']`` is False, the ``sdr_*`` lists are empty
        and the image is all zeros when the image file or its annotation file
        is missing, or when the image has four channels.
        """
        targets_1 = {
            'path': [],
            'sdr_xA': [],
            'sdr_yA': [],
            'sdr_xB': [],
            'sdr_yB': [],
            'sdr_gt': [],
        }

        img_path_suff = self.img_list[index]
        img_path = self.root + "/MegaDepth_v1/" + img_path_suff

        path_parts = img_path_suff.split('/')
        folder_name = path_parts[-4]
        img_name = path_parts[-1]
        sparse_sift_path = self.root + "/sparse_features/" + folder_name + "/" + img_name + ".h5"

        has_feature = False
        img = None
        if os.path.isfile(sparse_sift_path) and os.path.isfile(img_path):
            img, y_A, x_A ,y_B, x_B, ordinal = self.load_SfM_ORD(img_path, sparse_sift_path)
            # A 4-channel image comes back with integer sentinels instead of
            # arrays; feeding those to torch.from_numpy raised TypeError.
            if img.shape[2] != 4:
                targets_1['sdr_xA'].append(torch.from_numpy(x_A).long())
                targets_1['sdr_yA'].append(torch.from_numpy(y_A).long())
                targets_1['sdr_xB'].append(torch.from_numpy(x_B).long())
                targets_1['sdr_yB'].append(torch.from_numpy(y_B).long())
                targets_1['sdr_gt'].append(torch.from_numpy(ordinal).float())
                has_feature = True

        if not has_feature:
            # Placeholder image with the same 3-channel layout as real samples.
            img = np.zeros((self.input_height, self.input_width,3))
        targets_1['has_SfM_feature'] = has_feature

        final_img = torch.from_numpy( np.transpose(img, (2,0,1)) ).contiguous().float()

        return final_img, targets_1

    def __len__(self):
        """Return the number of images in the list."""
        return len(self.img_list)
================================================
FILE: MegaDepth/models/HG_model.py
================================================
import numpy as np
import torch
import os
from torch.autograd import Variable
from .base_model import BaseModel
import sys
# import pytorch_DIW_scratch
import MegaDepth.pytorch_DIW_scratch as pytorch_DIW_scratch
class HGModel(BaseModel):
    """Wrapper around the ``pytorch_DIW_scratch`` network with evaluation helpers.

    The network output is compared against ``log`` of the ground-truth depth
    and exponentiated before depth ratios are formed, i.e. it is treated as
    log-depth throughout this class.
    """

    def name(self):
        """Return the model name."""
        return 'HGModel'

    def __init__(self, opt,pretrained=None):
        """Set up the network, optionally loading pretrained weights.

        Args:
            opt: options object passed to ``BaseModel.initialize``.
            pretrained: path of a checkpoint to load, or None to keep the
                network's current weights.
        """
        BaseModel.initialize(self, opt)

        model = pytorch_DIW_scratch.pytorch_DIW_scratch

        if pretrained is not None:
            pretrained_dict = torch.load(pretrained)
            model_dict = model.state_dict()

            # Checkpoints saved from a DataParallel wrapper prefix every key
            # with 'module.'. Strip it only where present: slicing off seven
            # characters unconditionally would mangle unprefixed keys.
            prefix = 'module.'
            pretrained_dict = {
                (k[len(prefix):] if k.startswith(prefix) else k): v
                for k, v in pretrained_dict.items()
            }

            # Overwrite the entries of the existing state dict, then load it.
            model_dict.update(pretrained_dict)
            model.load_state_dict(model_dict)
            pretrained_dict = None

        self.netG = model

    def batch_classify(self, z_A_arr, z_B_arr, ground_truth ):
        """Classify depth-ratio ordinal relations and count the mistakes.

        A pair is labelled 1 when ``z_A / z_B`` exceeds 1.1, -1 when it is
        below ``1 / 1.1`` and 0 otherwise.

        Args:
            z_A_arr: predicted depth at the first point of every pair.
            z_B_arr: predicted depth at the second point of every pair.
            ground_truth: ordinal labels (0 meaning "equal"); not modified.

        Returns:
            ``(error_list, count_list)``, each ordered as
            ``[equal, inequal, total]``.
        """
        threshold = 1.1
        depth_ratio = torch.div(z_A_arr, z_B_arr).cpu()

        estimated_labels = torch.zeros(depth_ratio.size(0))
        estimated_labels[depth_ratio > (threshold)] = 1
        estimated_labels[depth_ratio < (1/threshold)] = -1

        # 1 where the estimate disagrees with the label, 0 elsewhere.
        diff = estimated_labels - ground_truth
        diff[diff != 0] = 1

        # errors
        inequal_error_count = torch.sum(diff[ground_truth != 0])
        error_count = torch.sum(diff)
        equal_error_count = error_count - inequal_error_count

        # totals
        total_count = depth_ratio.size(0)
        # Count the non-zero labels without writing into the caller's tensor
        # (previously the labels were overwritten with 1 in place).
        inequal_count_total = torch.sum((ground_truth != 0).type_as(ground_truth))
        equal_total_count = total_count - inequal_count_total

        error_list = [equal_error_count, inequal_error_count, error_count]
        count_list = [equal_total_count, inequal_count_total, total_count]

        return error_list, count_list

    def computeSDR(self, prediction_d, targets):
        """Accumulate ordinal-relation errors over a batch.

        Args:
            prediction_d: batched network output; ``exp`` is applied per sample.
            targets: dict with ``'has_SfM_feature'`` and the ``'sdr_*'`` entries.

        Returns:
            ``(total_error, total_samples)``, each ``[equal, inequal, total]``.
        """
        total_error = [0,0,0]
        total_samples = [0,0,0]

        for i in range(0, prediction_d.size(0)):
            # Samples without annotations contribute nothing.
            if not targets['has_SfM_feature'][i]:
                continue

            x_A_arr = targets["sdr_xA"][i].squeeze(0)
            x_B_arr = targets["sdr_xB"][i].squeeze(0)
            y_A_arr = targets["sdr_yA"][i].squeeze(0)
            y_B_arr = targets["sdr_yB"][i].squeeze(0)

            predict_depth = torch.exp(prediction_d[i,:,:])
            predict_depth = predict_depth.squeeze(0)
            ground_truth = targets["sdr_gt"][i]

            # index_select picks column x_j for every pair j, then gather picks
            # row y_j out of column j (assumes predict_depth is 2-D here —
            # TODO confirm), giving the depth at (y_j, x_j).
            z_A_arr = torch.gather( torch.index_select(predict_depth, 1 ,x_A_arr.cuda()) , 0, y_A_arr.view(1, -1).cuda())
            z_B_arr = torch.gather( torch.index_select(predict_depth, 1 ,x_B_arr.cuda()) , 0, y_B_arr.view(1, -1).cuda())

            z_A_arr = z_A_arr.squeeze(0)
            z_B_arr = z_B_arr.squeeze(0)

            error_list, count_list = self.batch_classify(z_A_arr, z_B_arr,ground_truth)

            for j in range(0,3):
                total_error[j] += error_list[j]
                total_samples[j] += count_list[j]

        return total_error, total_samples

    def evaluate_SDR(self, input_, targets):
        """Run the network on ``input_`` (moved to the GPU) and return ``computeSDR``'s result."""
        input_images = Variable(input_.cuda() )
        prediction_d = self.netG.forward(input_images)

        total_error, total_samples = self.computeSDR(prediction_d.data, targets)

        return total_error, total_samples

    def rmse_Loss(self, log_prediction_d, mask, log_gt):
        """Return ``sqrt(mean(d^2) - mean(d)^2)`` of the masked log-depth difference.

        ``d`` is ``(log_prediction_d - log_gt) * mask`` and the means divide by
        ``sum(mask)``.
        """
        N = torch.sum(mask)
        log_d_diff = log_prediction_d - log_gt
        log_d_diff = torch.mul(log_d_diff, mask)
        s1 = torch.sum( torch.pow(log_d_diff,2) )/N
        s2 = torch.pow(torch.sum(log_d_diff),2)/(N*N)
        data_loss = s1 - s2

        data_loss = torch.sqrt(data_loss)

        return data_loss

    def evaluate_RMSE(self, input_images, prediction_d, targets):
        """Sum ``rmse_Loss`` over a batch.

        Args:
            input_images: unused.
            prediction_d: batched network output.
            targets: dict providing ``'mask_0'`` and ``'gt_0'``.

        Returns:
            ``(summed loss, number of samples)``.
        """
        count = 0
        total_loss = Variable(torch.cuda.FloatTensor(1))
        total_loss[0] = 0
        mask_0 = Variable(targets['mask_0'].cuda(), requires_grad = False)
        d_gt_0 = torch.log(Variable(targets['gt_0'].cuda(), requires_grad = False))

        for i in range(0, mask_0.size(0)):
            total_loss += self.rmse_Loss(prediction_d[i,:,:], mask_0[i,:,:], d_gt_0[i,:,:])
            count += 1

        return total_loss.data[0], count

    def evaluate_sc_inv(self, input_, targets):
        """Run the network on ``input_`` (moved to the GPU) and return ``evaluate_RMSE``'s result."""
        input_images = Variable(input_.cuda() )
        prediction_d = self.netG.forward(input_images)
        rmse_loss , count= self.evaluate_RMSE(input_images, prediction_d, targets)

        return rmse_loss, count

    def switch_to_train(self):
        """Put the network into training mode."""
        self.netG.train()

    def switch_to_eval(self):
        """Put the network into evaluation mode."""
        self.netG.eval()
================================================
FILE: MegaDepth/models/__init__.py
================================================
================================================
FILE: MegaDepth/models/base_model.py
================================================
import os
import torch
class BaseModel():
    """Base class for models: stores options and provides checkpoint helpers.

    Most methods are no-op hooks meant to be overridden by subclasses.
    """

    def name(self):
        """Return the model name."""
        return 'BaseModel'

    def initialize(self, opt):
        """Copy the relevant settings from *opt* onto the instance.

        Reads ``opt.gpu_ids``, ``opt.isTrain``, ``opt.checkpoints_dir`` and
        ``opt.name``.
        """
        self.opt = opt
        self.gpu_ids = opt.gpu_ids
        self.isTrain = opt.isTrain
        # CUDA tensor type when any gpu id is configured, CPU type otherwise.
        self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor
        self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)

    def set_input(self, input):
        """Store *input* on the instance."""
        self.input = input

    def forward(self):
        """Hook; does nothing here."""
        pass

    # used in test time, no backprop
    def test(self):
        """Hook; does nothing here."""
        pass

    def get_image_paths(self):
        """Hook; does nothing here."""
        pass

    def optimize_parameters(self):
        """Hook; does nothing here."""
        pass

    def get_current_visuals(self):
        """Return the input stored by ``set_input``."""
        return self.input

    def get_current_errors(self):
        """Return an empty dict."""
        return {}

    def save(self, label):
        """Hook; does nothing here."""
        pass

    # helper saving function that can be used by subclasses
    def save_network(self, network, network_label, epoch_label, gpu_ids):
        """Save *network*'s state dict to ``save_dir``.

        The network is moved to the CPU for saving and moved back to the first
        configured GPU afterwards when one is available.

        NOTE(review): the file name written here starts with an underscore
        ('_<epoch>_net_<label>.pth') whereas ``load_network`` reads
        '<epoch>_net_<label>.pth'; confirm whether that is intentional.
        """
        save_filename = '_%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        torch.save(network.cpu().state_dict(), save_path)
        if len(gpu_ids) and torch.cuda.is_available():
            # Pass the device positionally: the keyword was ``device_id`` in
            # early PyTorch releases and is ``device`` in current ones.
            network.cuda(gpu_ids[0])

    # helper loading function that can be used by subclasses
    def load_network(self, network, network_label, epoch_label):
        """Load and return the object stored in the checkpoint file.

        *network* is not modified; the caller is expected to apply the
        returned object itself.
        """
        save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        print(save_path)
        model = torch.load(save_path)
        return model

    @staticmethod
    def update_learning_rate():
        """Hook; does nothing here.

        Declared static because the original definition took no ``self``,
        which made ``instance.update_learning_rate()`` raise TypeError. As a
        static method both the instance call and the class call work.
        """
        pass
================================================
FILE: MegaDepth/models/models.py
================================================
def create_model(opt,pretrained=None):
    """Instantiate and return an ``HGModel``.

    Args:
        opt: options object forwarded to ``HGModel``.
        pretrained: optional second argument forwarded to ``HGModel``.
    """
    # The model class is imported at call time rather than at module import.
    from .HG_model import HGModel

    return HGModel(opt, pretrained)
================================================
FILE: MegaDepth/options/__init__.py
================================================
================================================
FILE: MegaDepth/options/base_options.py
================================================
import argparse
import os
from ..util import util
class BaseOptions():
    """Command-line options shared by the train and test option classes.

    ``parse`` reads ``self.isTrain``, which this class does not set itself;
    the option classes deriving from it are expected to provide it.
    """

    def __init__(self):
        """Create the argument parser; options are registered lazily."""
        self.parser = argparse.ArgumentParser()
        self.initialized = False

    def initialize(self):
        """Register the shared options on the parser."""
        add = self.parser.add_argument
        add('--batchSize', type=int, default=1, help='input batch size')
        add('--loadSize', type=int, default=286, help='scale images to this size')
        add('--fineSize', type=int, default=256, help='then crop to this size')
        add('--input_nc', type=int, default=3, help='# of input image channels')
        add('--output_nc', type=int, default=3, help='# of output image channels')
        add('--ngf', type=int, default=64, help='# of gen filters in first conv layer')
        add('--ndf', type=int, default=64, help='# of discrim filters in first conv layer')
        add('--which_model_netG', type=str, default='unet_256', help='selects model to use for netG')
        add('--gpu_ids', type=str, default='0,1', help='gpu ids: e.g. 0 0,1,2, 0,2')
        add('--name', type=str, default='test_local', help='name of the experiment. It decides where to store samples and models')
        add('--model', type=str, default='pix2pix',
            help='chooses which model to use. cycle_gan, one_direction_test, pix2pix, ...')
        add('--nThreads', default=2, type=int, help='# threads for loading data')
        add('--checkpoints_dir', type=str, default='./checkpoints/', help='models are saved here')
        add('--norm', type=str, default='instance', help='instance normalization or batch normalization')
        add('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly')
        add('--display_winsize', type=int, default=256, help='display window size')
        add('--display_id', type=int, default=1, help='window id of the web display')
        add('--identity', type=float, default=0.0, help='use identity mapping. Setting identity other than 1 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set optidentity = 0.1')
        add('--use_dropout', action='store_true', help='use dropout for the generator')
        add('--max_dataset_size', type=int, default=float("inf"), help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.')

        self.initialized = True

    @staticmethod
    def _parse_gpu_ids(gpu_ids):
        """Turn a comma-separated id string into a list of non-negative ints.

        Empty tokens (e.g. from a trailing comma) are skipped instead of
        crashing ``int()``; negative ids are dropped.
        """
        parsed = []
        for token in gpu_ids.split(','):
            token = token.strip()
            if not token:
                continue
            gpu_id = int(token)
            if gpu_id >= 0:
                parsed.append(gpu_id)
        return parsed

    def parse(self):
        """Parse the command line, write the options to disk and return them.

        Unknown arguments are ignored. The options are written to
        ``<checkpoints_dir>/<name>/opt.txt``.
        """
        if not self.initialized:
            self.initialize()
        # parse_known_args tolerates arguments this parser does not know.
        self.opt = self.parser.parse_known_args()[0]
        self.opt.isTrain = self.isTrain   # train or test

        self.opt.gpu_ids = self._parse_gpu_ids(self.opt.gpu_ids)

        args = vars(self.opt)

        # save to the disk
        expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.name)
        util.mkdirs(expr_dir)
        file_name = os.path.join(expr_dir, 'opt.txt')
        with open(file_name, 'wt') as opt_file:
            opt_file.write('------------ Options -------------\n')
            for k, v in sorted(args.items()):
                opt_file.write('%s: %s\n' % (str(k), str(v)))
            opt_file.write('-------------- End ----------------\n')
        return self.opt
================================================
FILE: MegaDepth/options/test_options.py
================================================
from .base_options import BaseOptions
class TestOptions(BaseOptions):
    """Options used at test time; extends the shared options."""

    def initialize(self):
        """Register the shared options, then the test-only ones."""
        BaseOptions.initialize(self)

        register = self.parser.add_argument
        register('--ntest', type=int, default=float("inf"), help='# of test examples.')
        register('--results_dir', type=str, default='./results/', help='saves results here.')
        register('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images')
        register('--phase', type=str, default='test', help='train, val, test, etc')
        register('--which_epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')
        register('--how_many', type=int, default=50, help='how many test images to run')

        # Read back by BaseOptions.parse.
        self.isTrain = False
================================================
FILE: MegaDepth/options/train_options.py
================================================
from .base_options import BaseOptions
class TrainOptions(BaseOptions):
    """Options used at training time; extends the shared options."""

    def initialize(self):
        """Register the shared options, then the training-only ones."""
        BaseOptions.initialize(self)

        register = self.parser.add_argument
        register('--display_freq', type=int, default=100, help='frequency of showing training results on screen')
        register('--print_freq', type=int, default=100, help='frequency of showing training results on console')
        register('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results')
        register('--save_epoch_freq', type=int, default=5, help='frequency of saving checkpoints at the end of epochs')
        register('--continue_train', action='store_true', help='continue training: load the latest model')
        register('--phase', type=str, default='train', help='train, val, test, etc')
        register('--which_epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')
        register('--niter', type=int, default=100, help='# of iter at starting learning rate')
        register('--niter_decay', type=int, default=100, help='# of iter to linearly decay learning rate to zero')
        register('--beta1', type=float, default=0.5, help='momentum term of adam')
        register('--lr', type=float, default=0.0002, help='initial learning rate for adam')
        register('--no_lsgan', action='store_true', help='do *not* use least square GAN, if false, use vanilla GAN')
        register('--lambda_A', type=float, default=10.0, help='weight for cycle loss (A -> B -> A)')
        register('--lambda_B', type=float, default=10.0, help='weight for cycle loss (B -> A -> B)')
        register('--pool_size', type=int, default=50, help='the size of image buffer that stores previously generated images')
        register('--no_html', action='store_true', help='do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/')
        register('--no_flip', action='store_true', help='if specified, do not flip the images for data argumentation')
        # NOT-IMPLEMENTED: a '--preprocessing' option (resizing/cropping strategy).

        # Read back by BaseOptions.parse.
        self.isTrain = True
================================================
FILE: MegaDepth/pytorch_DIW_scratch.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from functools import reduce
class LambdaBase(nn.Sequential):
    """Container that pairs a callable with a set of child modules.

    The child modules are registered through ``nn.Sequential``; the callable
    is kept in ``lambda_func`` for subclasses to apply.
    """

    def __init__(self, fn, *args):
        """Register *args* as child modules and remember *fn*."""
        super(LambdaBase, self).__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        """Feed *input* to every child module and collect the results.

        Returns the list of child outputs, or *input* itself when that list
        is empty (no children).
        """
        branch_outputs = [child(input) for child in self._modules.values()]
        if branch_outputs:
            return branch_outputs
        return input
class Lambda(LambdaBase):
    """Apply the stored callable once to the result of ``forward_prepare``."""

    def forward(self, input):
        """Return ``lambda_func(forward_prepare(input))``."""
        prepared = self.forward_prepare(input)
        return self.lambda_func(prepared)
class LambdaMap(LambdaBase):
    """Apply the stored callable to each element of ``forward_prepare``'s result."""

    def forward(self, input):
        """Return the list of ``lambda_func(item)`` for every prepared item."""
        return [self.lambda_func(item) for item in self.forward_prepare(input)]
class LambdaReduce(LambdaBase):
    """Fold ``forward_prepare``'s result with the stored two-argument callable."""

    def forward(self, input):
        """Return ``reduce(lambda_func, forward_prepare(input))``."""
        prepared = self.forward_prepare(input)
        return reduce(self.lambda_func, prepared)
pytorch_DIW_scratch = nn.Sequential( # Sequential,
nn.Conv2d(3,128,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.Sequential( # Sequential,
LambdaMap(lambda x: x, # ConcatTable,
nn.Sequential( # Sequential,
nn.MaxPool2d((2, 2),(2, 2)),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.Sequential( # Sequential,
LambdaMap(lambda x: x, # ConcatTable,
nn.Sequential( # Sequential,
nn.MaxPool2d((2, 2),(2, 2)),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.Sequential( # Sequential,
LambdaMap(lambda x: x, # ConcatTable,
nn.Sequential( # Sequential,
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
),
nn.Sequential( # Sequential,
nn.AvgPool2d((2, 2),(2, 2)),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.Sequential( # Sequential,
LambdaMap(lambda x: x, # ConcatTable,
nn.Sequential( # Sequential,
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
),
nn.Sequential( # Sequential,
nn.AvgPool2d((2, 2),(2, 2)),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.UpsamplingNearest2d(scale_factor=2),
),
),
LambdaReduce(lambda x,y: x+y), # CAddTable,
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.UpsamplingNearest2d(scale_factor=2),
),
),
LambdaReduce(lambda x,y: x+y), # CAddTable,
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(256,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.UpsamplingNearest2d(scale_factor=2),
),
nn.Sequential( # Sequential,
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,32,(11, 11),(1, 1),(5, 5)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
),
),
LambdaReduce(lambda x,y: x+y), # CAddTable,
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,32,(5, 5),(1, 1),(2, 2)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
),
),
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,16,(1, 1)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,16,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,16,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,32,(1, 1)),
nn.BatchNorm2d(32,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(32,16,(11, 11),(1, 1),(5, 5)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
),
nn.UpsamplingNearest2d(scale_factor=2),
),
nn.Sequential( # Sequential,
LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
nn.Sequential( # Sequential,
nn.Conv2d(128,16,(1, 1)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,16,(3, 3),(1, 1),(1, 1)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,16,(7, 7),(1, 1),(3, 3)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
nn.Sequential( # Sequential,
nn.Conv2d(128,64,(1, 1)),
nn.BatchNorm2d(64,1e-05,0.1,False),
nn.ReLU(),
nn.Conv2d(64,16,(11, 11),(1, 1),(5, 5)),
nn.BatchNorm2d(16,1e-05,0.1,False),
nn.ReLU(),
),
),
),
),
LambdaReduce(lambda x,y: x+y), # CAddTable,
),
nn.Conv2d(64,1,(3, 3),(1, 1),(1, 1)),
)
================================================
FILE: MegaDepth/rmse_error_main.py
================================================
import time
import torch
import sys
from options.train_options import TrainOptions
opt = TrainOptions().parse() # set CUDA_VISIBLE_DEVICES before import torch
from data.data_loader import CreateDataLoader
from models.models import create_model
# Evaluation setup: build one data loader per image orientation and create the model.
# The dataset root and the test-list directories are hard-coded absolute paths.
dataset_root = "/phoenix/S6/zl548/"
test_list_dir_l = '/phoenix/S6/zl548/MegaDpeth_code/test_list/landscape/'
# Landscape split: inputs are 240 high by 320 wide.
input_height = 240
input_width = 320
# Both flags are forwarded positionally to CreateDataLoader for both splits.
is_flipped = False
shuffle = False
test_data_loader_l = CreateDataLoader(dataset_root, test_list_dir_l, input_height, input_width, is_flipped, shuffle)
test_dataset_l = test_data_loader_l.load_data()
test_dataset_size_l = len(test_data_loader_l)
print('========================= test images = %d' % test_dataset_size_l)
test_list_dir_p = '/phoenix/S6/zl548/MegaDpeth_code/test_list/portrait/'
# Portrait split: height and width are swapped relative to the landscape split.
input_height = 320
input_width = 240
test_data_loader_p = CreateDataLoader(dataset_root, test_list_dir_p, input_height, input_width, is_flipped, shuffle)
test_dataset_p = test_data_loader_p.load_data()
test_dataset_size_p = len(test_data_loader_p)
print('========================= test images = %d' % test_dataset_size_p)
# `opt` comes from TrainOptions().parse() earlier in this script.
model = create_model(opt)
def test(model):
    """Evaluate ``model`` on the landscape and then the portrait test set.

    Every batch is passed to ``model.evaluate_sc_inv``, which returns a
    ``(loss, count)`` pair.  Both values are accumulated across the two
    datasets; the running ratio is printed after every batch and the final
    ratio is printed once at the end.

    Reads the module-level ``test_dataset_l`` and ``test_dataset_p``.
    """
    print("============================= TEST ============================")
    model.switch_to_eval()

    loss_sum = 0
    sample_count = 0
    # Landscape first, portrait second; the accumulators are shared.
    for dataset in (test_dataset_l, test_dataset_p):
        for batch in dataset:
            batch_loss, batch_count = model.evaluate_sc_inv(batch['img_1'], batch['target_1'])
            loss_sum += batch_loss
            sample_count += batch_count
            print('RMSE loss is', loss_sum / float(sample_count))

    print('average RMSE loss is', loss_sum / float(sample_count))
# Script entry: run the evaluation once on the model built above, with
# progress banners printed before and after.
print("WE ARE IN TESTING RMSE!!!!")
test(model)
print("WE ARE DONE TESTING!!!")
print("We are done")
================================================
FILE: MegaDepth/util/__init__.py
================================================
================================================
FILE: MegaDepth/util/html.py
================================================
import dominate
from dominate.tags import *
import os
class HTML:
    """Small builder for a static HTML report made of image tables.

    The page is assembled with ``dominate`` and written to
    ``<web_dir>/index.html``; images are expected to live in
    ``<web_dir>/images``.
    """

    def __init__(self, web_dir, title, reflesh=0):
        """Create the output directories and an empty document.

        web_dir: directory that receives ``index.html`` (created if missing).
        title:   document title.
        reflesh: auto-refresh interval passed as the meta tag's ``content``;
                 values <= 0 disable the tag.  The misspelt parameter name is
                 kept because callers pass it by keyword.
        """
        self.title = title
        self.web_dir = web_dir
        self.img_dir = os.path.join(self.web_dir, 'images')
        if not os.path.exists(self.web_dir):
            os.makedirs(self.web_dir)
        if not os.path.exists(self.img_dir):
            os.makedirs(self.img_dir)

        self.doc = dominate.document(title=title)
        if reflesh > 0:
            with self.doc.head:
                # The pragma browsers recognise is "refresh"; the previous
                # value "reflesh" was ignored, so the page never reloaded.
                meta(http_equiv="refresh", content=str(reflesh))

    def get_image_dir(self):
        """Return the directory in which the page's images are stored."""
        return self.img_dir

    def add_header(self, str):
        """Append an ``<h3>`` heading containing ``str`` to the document."""
        with self.doc:
            h3(str)

    def add_table(self, border=1):
        """Append a new fixed-layout table and remember it as ``self.t``."""
        self.t = table(border=border, style="table-layout: fixed;")
        self.doc.add(self.t)

    def add_images(self, ims, txts, links, width=400):
        """Append a one-row table with one cell per ``(im, txt, link)`` triple.

        Each cell shows the image ``images/<im>`` at ``width`` pixels, wrapped
        in a link to ``images/<link>``, followed by the caption ``txt``.
        """
        self.add_table()
        with self.t:
            with tr():
                for im, txt, link in zip(ims, txts, links):
                    with td(style="word-wrap: break-word;", halign="center", valign="top"):
                        with p():
                            with a(href=os.path.join('images', link)):
                                img(style="width:%dpx" % width, src=os.path.join('images', im))
                            br()
                            p(txt)

    def save(self):
        """Render the document and write it to ``<web_dir>/index.html``."""
        html_file = '%s/index.html' % self.web_dir
        # The context manager closes the file even if rendering raises.
        with open(html_file, 'wt') as f:
            f.write(self.doc.render())
if __name__ == '__main__':
    # Demo: write a page under web/ with a header and one row of four
    # image cells, each linking to its own image file.
    html = HTML('web/', 'test_html')
    html.add_header('hello world')

    indices = range(4)
    ims = ['image_%d.png' % n for n in indices]
    txts = ['text_%d' % n for n in indices]
    links = ['image_%d.png' % n for n in indices]

    html.add_images(ims, txts, links)
    html.save()
================================================
FILE: MegaDepth/util/image_pool.py
================================================
import random
import numpy as np
import torch
from pdb import set_trace as st
from torch.autograd import Variable
class ImagePool():
    """Fixed-capacity history of previously seen images.

    ``query`` returns a batch in which some of the incoming images may be
    swapped for images stored earlier.  A ``pool_size`` of 0 disables the
    pool entirely.
    """

    def __init__(self, pool_size):
        self.pool_size = pool_size
        # The storage attributes only exist when the pool is enabled.
        if pool_size > 0:
            self.num_imgs = 0
            self.images = []

    def query(self, images):
        """Return a batch built from ``images`` and the stored history.

        With ``pool_size == 0`` the input is returned untouched.  Otherwise
        each image of ``images.data`` is handled in turn:

        * while the pool is not full, the image is stored and returned;
        * once full, with probability 0.5 a randomly chosen stored image is
          returned and replaced by the new one, else the new image is
          returned and the pool is left unchanged.
        """
        if self.pool_size == 0:
            return images

        batch = []
        for sample in images.data:
            sample = torch.unsqueeze(sample, 0)

            if self.num_imgs < self.pool_size:
                # Still filling the pool: keep the image and pass it through.
                self.num_imgs += 1
                self.images.append(sample)
                batch.append(sample)
                continue

            if random.uniform(0, 1) > 0.5:
                # Hand out an old image and store the new one in its slot.
                slot = random.randint(0, self.pool_size - 1)
                stored = self.images[slot].clone()
                self.images[slot] = sample
                batch.append(stored)
            else:
                batch.append(sample)

        return Variable(torch.cat(batch, 0))
================================================
FILE: MegaDepth/util/png.py
================================================
import struct
import zlib
def encode(buf, width, height):
    """Encode a raw RGB buffer as an 8-bit RGB PNG and return the file bytes.

    buf must be bytes or a bytearray in py3 (a regular string in py2),
    laid out as RGBRGB... with exactly ``width * height * 3`` bytes.
    Rows are written in reverse vertical order, i.e. the last row of
    ``buf`` becomes the first scanline of the PNG.
    """
    assert (width * height * 3 == len(buf))

    stride = width * 3  # bytes per row at 3 bytes per pixel

    # Walk the rows from last to first, prefixing each scanline with a
    # null filter-type byte.
    scanlines = []
    for start in range(stride * (height - 1), -1, -stride):
        scanlines.append(b'\x00')
        scanlines.append(buf[start:start + stride])

    def packed_chunk(tag, data):
        # Chunk layout: length, tag, payload, CRC-32 over tag + payload.
        checksum = 0xFFFFFFFF & zlib.crc32(data, zlib.crc32(tag))
        return struct.pack("!I", len(data)) + tag + data + struct.pack("!I", checksum)

    signature = b'\x89PNG\r\n\x1a\n'
    color_type_rgb = 2
    bit_depth = 8

    header = struct.pack("!2I5B", width, height, bit_depth, color_type_rgb, 0, 0, 0)
    pixels = zlib.compress(b''.join(scanlines), 9)

    return (signature
            + packed_chunk(b'IHDR', header)
            + packed_chunk(b'IDAT', pixels)
            + packed_chunk(b'IEND', b''))
================================================
FILE: MegaDepth/util/util.py
================================================
from __future__ import print_function
import torch
import numpy as np
from PIL import Image
import inspect, re
import numpy as np
import os
import collections
# Turns the first entry of a tensor batch into a numpy image.
# |imtype|: dtype of the returned numpy array
def tensor2im(image_tensor, imtype=np.uint8):
    """Return ``image_tensor[0]`` as a channel-last numpy array.

    The entry is moved to the CPU, cast to float, transposed with axes
    (1, 2, 0), mapped through ``(x + 1) / 2 * 255`` and cast to ``imtype``.
    NOTE(review): the mapping suggests inputs in [-1, 1] -- confirm with callers.
    """
    channels_first = image_tensor[0].cpu().float().numpy()
    channels_last = np.transpose(channels_first, (1, 2, 0))
    scaled = (channels_last + 1) / 2.0 * 255.0
    return scaled.astype(imtype)
def diagnose_network(net, name='network'):
    """Print ``name`` followed by the average of the per-parameter mean
    absolute gradients of ``net``.

    Parameters whose ``grad`` is None are ignored; when no parameter has a
    gradient the printed value is ``0.0``.
    """
    grad_means = [torch.mean(torch.abs(param.grad.data))
                  for param in net.parameters()
                  if param.grad is not None]

    # Start from 0.0 so the no-gradient case prints a plain float.
    mean = sum(grad_means, 0.0)
    if grad_means:
        mean = mean / len(grad_means)

    print(name)
    print(mean)
def save_image(image_numpy, image_path):
    """Write the numpy image ``image_numpy`` to ``image_path`` using PIL."""
    Image.fromarray(image_numpy).save(image_path)
def info(object, spacing=10, collapse=1):
    """Print methods and doc strings.

    Takes module, class, list, dictionary, or string.

    object:   the value whose callable attributes are listed.
    spacing:  column width the attribute name is left-justified to.
    collapse: when truthy, runs of whitespace in each docstring are
              collapsed to single spaces so every entry fits on one line.
    """
    # The builtin callable() replaces collections.Callable, an alias that
    # was removed from the `collections` namespace in Python 3.10.
    method_names = [attr for attr in dir(object) if callable(getattr(object, attr))]

    def format_doc(text):
        return " ".join(text.split()) if collapse else text

    print("\n".join(
        "%s %s" % (method.ljust(spacing),
                   format_doc(str(getattr(object, method).__doc__)))
        for method in method_names))
def varname(p):
    """Return the identifier written as the argument of the ``varname(...)``
    call in the caller's source line.

    The caller's source context is searched for ``varname(<identifier>)``.
    If no line matches, the function falls through and returns None.
    """
    caller_frame = inspect.currentframe().f_back
    call_pattern = re.compile(r'\bvarname\s*\(\s*([A-Za-z_][A-Za-z0-9_]*)\s*\)')
    for source_line in inspect.getframeinfo(caller_frame)[3]:
        match = call_pattern.search(source_line)
        if match:
            return match.group(1)
def print_numpy(x, val=True, shp=False):
    """Print summary information about the array ``x``.

    x is converted to float64 first.  With ``shp`` the shape is printed;
    with ``val`` the mean, min, max, median and standard deviation of the
    flattened values are printed on a single line.
    """
    data = x.astype(np.float64)
    if shp:
        print('shape,', data.shape)
    if not val:
        return

    flat = data.flatten()
    stats = (np.mean(flat), np.min(flat), np.max(flat), np.median(flat), np.std(flat))
    print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % stats)
def mkdirs(paths):
    """Create one directory, or every directory in a list, via ``mkdir``.

    Only a ``list`` is treated as a collection of paths; any other value is
    forwarded to ``mkdir`` as a single path.
    """
    if not isinstance(paths, list) or isinstance(paths, str):
        mkdir(paths)
        return
    for single_path in paths:
        mkdir(single_path)
def mkdir(path):
    """Create ``path`` (including missing parents) unless it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)
================================================
FILE: MegaDepth/util/visualizer.py
================================================
import numpy as np
import os
import ntpath
import time
from . import util
from . import html
class Visualizer():
    """Displays and saves training visuals and error values.

    Images can be pushed to a visdom server (when ``opt.display_id > 0``)
    and/or written to an HTML page under
    ``<checkpoints_dir>/<name>/web`` (when training and HTML output is not
    disabled).
    """

    def __init__(self, opt):
        self.display_id = opt.display_id
        self.use_html = opt.isTrain and not opt.no_html
        self.win_size = opt.display_winsize
        self.name = opt.name

        if self.display_id > 0:
            # visdom is imported only when the live display is requested
            import visdom
            self.vis = visdom.Visdom()

        if self.use_html:
            web_root = os.path.join(opt.checkpoints_dir, opt.name, 'web')
            self.web_dir = web_root
            self.img_dir = os.path.join(web_root, 'images')
            print('create web directory %s...' % self.web_dir)
            util.mkdirs([self.web_dir, self.img_dir])

    # |visuals|: mapping from label to the image to display or save
    def display_current_results(self, visuals, epoch):
        """Show ``visuals`` in visdom and/or save them and rebuild the web page."""
        if self.display_id > 0:
            # One visdom window per image, numbered from display_id + 1.
            for offset, (label, image_numpy) in enumerate(visuals.items(), 1):
                self.vis.image(image_numpy.transpose([2, 0, 1]),
                               opts=dict(title=label),
                               win=self.display_id + offset)

        if not self.use_html:
            return

        # Write this epoch's images to disk.
        for label, image_numpy in visuals.items():
            file_name = 'epoch%.3d_%s.png' % (epoch, label)
            util.save_image(image_numpy, os.path.join(self.img_dir, file_name))

        # Regenerate the page, listing epochs from newest to oldest.
        webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, reflesh=1)
        for n in range(epoch, 0, -1):
            webpage.add_header('epoch [%d]' % n)
            labels = [label for label, _ in visuals.items()]
            file_names = ['epoch%.3d_%s.png' % (n, label) for label in labels]
            webpage.add_images(file_names, labels, list(file_names), width=self.win_size)
        webpage.save()

    # errors: mapping from error label to value
    def plot_current_errors(self, epoch, counter_ratio, opt, errors):
        """Append the current errors to the history and redraw the visdom plot."""
        if not hasattr(self, 'plot_data'):
            # The legend is fixed by the first set of errors received.
            self.plot_data = {'X': [], 'Y': [], 'legend': list(errors.keys())}

        history = self.plot_data
        legend = history['legend']
        history['X'].append(epoch + counter_ratio)
        history['Y'].append([errors[key] for key in legend])

        # Repeat the x values once per curve so X and Y have matching shapes.
        x_values = np.stack([np.array(history['X'])] * len(legend), 1)
        y_values = np.array(history['Y'])
        plot_opts = {
            'title': self.name + ' loss over time',
            'legend': legend,
            'xlabel': 'epoch',
            'ylabel': 'loss'}
        self.vis.line(X=x_values, Y=y_values, opts=plot_opts, win=self.display_id)

    # errors: same format as |errors| of plot_current_errors
    def print_current_errors(self, epoch, i, errors, t):
        """Print one line with the epoch, iteration, time and every error value."""
        pieces = ['(epoch: %d, iters: %d, time: %.3f) ' % (epoch, i, t)]
        for key, value in errors.items():
            pieces.append('%s: %.3f ' % (key, value))
        print(''.join(pieces))

    # write the images to disk and reference them from the page
    def save_images(self, webpage, visuals, image_path):
        """Save every visual into the page's image directory and add a row for them.

        File names are derived from the base name (without extension) of
        ``image_path[0]`` combined with each visual's label.
        """
        image_dir = webpage.get_image_dir()
        name = os.path.splitext(ntpath.basename(image_path[0]))[0]
        webpage.add_header(name)

        ims, txts, links = [], [], []
        for label, image_numpy in visuals.items():
            image_name = '%s_%s.png' % (name, label)
            util.save_image(image_numpy, os.path.join(image_dir, image_name))
            ims.append(image_name)
            txts.append(label)
            links.append(image_name)
        webpage.add_images(ims, txts, links, width=self.win_size)
================================================
FILE: PWCNet/PWCNet.py
================================================
"""
implementation of the PWC-DC network for optical flow estimation by Sun et al., 2018
Jinwei Gu and Zhile Ren
"""
import torch
import torch.nn as nn
from torch.autograd import Variable
import os
os.environ['PYTHON_EGG_CACHE'] = 'tmp/' # a writable directory
#from .correlation_package.modules.corr import Correlation
# from PWCNet.correlation_package_pytorch0_4.correlation import Correlation #pytorch0.4 version
from PWCNet.correlation_package_pytorch1_0.correlation import Correlation #pytorch0.4 version
import numpy as np
# Names exported by a star-import of this module.
# NOTE(review): both names are presumably factory functions defined further
# down in this file -- confirm they exist there.
__all__ = [
'pwc_dc_net', 'pwc_dc_net_old'
]
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Return a biased ``nn.Conv2d`` followed by ``LeakyReLU(0.1)`` as an
    ``nn.Sequential``.

    All arguments are forwarded to ``nn.Conv2d`` unchanged.
    """
    convolution = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=True,
    )
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(convolution, activation)
def predict_flow(in_planes):
    """Return a biased 3x3, stride-1, padding-1 convolution that maps
    ``in_planes`` channels to 2 output channels."""
    flow_head = nn.Conv2d(in_planes, 2, kernel_size=3, stride=1, padding=1, bias=True)
    return flow_head
def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
    """Return a biased ``nn.ConvTranspose2d`` built from the given arguments."""
    upsampler = nn.ConvTranspose2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=True,
    )
    return upsampler
import time
class PWCDCNet(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    The network takes two RGB images stacked along the channel axis, builds
    a six-level feature pyramid for each image and estimates flow from the
    coarsest level (6) down to level 2.  At every level the features of the
    second image are warped with the up-sampled flow of the previous level,
    correlated with the features of the first image, and fed through a
    densely connected stack of convolutions.  The level-2 flow is finally
    refined by a stack of dilated convolutions.

    The constructor allocates a coordinate grid on the GPU (``.cuda()``),
    so the module can only be built when CUDA is available.
    """
    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warpping
        """
        super(PWCDCNet,self).__init__()

        # Feature pyramid: every level starts with a stride-2 convolution.
        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)
        self.conv1aa = conv(16,  16, kernel_size=3, stride=1)
        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)
        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)
        self.conv2aa = conv(32,  32, kernel_size=3, stride=1)
        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)
        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)
        self.conv3aa = conv(64,  64, kernel_size=3, stride=1)
        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)
        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)
        self.conv4aa = conv(96,  96, kernel_size=3, stride=1)
        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)
        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)
        self.conv5aa = conv(128,128, kernel_size=3, stride=1)
        self.conv5b  = conv(128,128, kernel_size=3, stride=1)
        self.conv6aa = conv(128,196, kernel_size=3, stride=2)
        self.conv6a  = conv(196,196, kernel_size=3, stride=1)
        self.conv6b  = conv(196,196, kernel_size=3, stride=1)

        self.corr = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # NOTE(review): nd is used as the channel count of the correlation
        # output, presumably one channel per displacement in a
        # (2*md+1) x (2*md+1) window -- confirm against Correlation.
        nd = (2*md+1)**2
        # Cumulative widths of the dense block: after the k-th convolution the
        # concatenated tensor has dd[k] more channels than the block input.
        # Plain Python ints are used; the previous `.astype(np.int)` relied on
        # an alias that was removed in NumPy 1.24 and was redundant anyway.
        dd = [int(d) for d in np.cumsum([128,128,96,64,32])]

        # Level 6: the dense block sees only the correlation volume.
        od = nd
        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # Levels 5..2: correlation + first-image features + up-sampled flow (2)
        # + up-sampled features (2), matching the concatenation in forward().
        od = nd+128+4
        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # Refinement head: dilated convolutions applied to the level-2 features.
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv7 = predict_flow(32)

        # Kaiming-normal weights and zero biases for every (transposed) convolution.
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()

        # Pre-compute the pixel-coordinate grid once, at the largest supported
        # size, to save the time of allocating a grid in every forward pass.
        # warp() slices it down to the actual batch / height / width.
        W_MAX = 2048
        H_MAX = 1024
        B_MAX = 3
        xx = torch.arange(0, W_MAX).view(1,-1).cuda().repeat(H_MAX,1)
        yy = torch.arange(0, H_MAX).view(-1,1).cuda().repeat(1,W_MAX)
        xx = xx.view(1,1,H_MAX,W_MAX).repeat(B_MAX,1,1,1)
        yy = yy.view(1,1,H_MAX,W_MAX).repeat(B_MAX,1,1,1)
        grid = torch.cat((xx,yy),1).float()

        self.W_MAX = W_MAX
        self.H_MAX = H_MAX
        self.B_MAX = B_MAX
        # Stored as a plain attribute (neither parameter nor registered buffer).
        self.grid = Variable(grid, requires_grad=False)

    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        Returns the warped tensor with every position whose sampling
        footprint is not (almost) fully inside ``x`` set to zero.  B, H and W
        must not exceed B_MAX, H_MAX and W_MAX of the pre-computed grid.
        """
        B, C, H, W = x.size()
        assert(B <= self.B_MAX and H <= self.H_MAX and W <= self.W_MAX)

        # Absolute sampling positions = pixel coordinates + flow.
        vgrid = self.grid[:B,:,:H,:W] + flo

        # scale grid to [-1,1]
        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:].clone()/max(W-1,1)-1.0
        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:].clone()/max(H-1,1)-1.0

        vgrid = vgrid.permute(0,2,3,1)
        output = nn.functional.grid_sample(x, vgrid)

        # Warp an all-ones tensor the same way; values below ~1 mark samples
        # that fell (partly) outside the input and are masked out.
        mask = torch.autograd.Variable(torch.cuda.FloatTensor().resize_(x.size()).zero_() + 1, requires_grad = False)
        mask = nn.functional.grid_sample(mask, vgrid)

        mask[mask<0.9999] = 0
        mask[mask>0] = 1

        return output*mask

    def forward(self,x, output_more = False):
        """Estimate the flow between the two images stacked in ``x``.

        x: channels 0-2 are the first image, the remaining channels the second.
        output_more: when False only the level-2 flow is returned; when True
                     the list ``[flow2, flow3, flow4, flow5, flow6]`` is returned.

        NOTE(review): the factors 0.625 / 1.25 / 2.5 / 5.0 applied to the
        up-sampled flow before warping presumably equal 20 / 2**level as in
        the reference PWC-Net implementation -- confirm.
        """
        im1 = x[:,:3,:,:]
        im2 = x[:,3:,:,:]

        # Shared-weight feature pyramids (c1k: first image, c2k: second image).
        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))
        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))
        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))
        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))
        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))
        c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))
        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))
        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))
        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))
        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))
        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))
        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))

        # Level 6: no previous flow, correlate the raw features.
        corr6 = self.corr(c16, c26)
        corr6 = self.leakyRELU(corr6)

        # Dense block: each convolution's output is concatenated in front of its input.
        x = torch.cat((self.conv6_0(corr6), corr6),1)
        x = torch.cat((self.conv6_1(x), x),1)
        x = torch.cat((self.conv6_2(x), x),1)
        x = torch.cat((self.conv6_3(x), x),1)
        x = torch.cat((self.conv6_4(x), x),1)
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        # Level 5
        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.corr(c15, warp5)
        corr5 = self.leakyRELU(corr5)
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = torch.cat((self.conv5_0(x), x),1)
        x = torch.cat((self.conv5_1(x), x),1)
        x = torch.cat((self.conv5_2(x), x),1)
        x = torch.cat((self.conv5_3(x), x),1)
        x = torch.cat((self.conv5_4(x), x),1)
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        # Level 4
        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.corr(c14, warp4)
        corr4 = self.leakyRELU(corr4)
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = torch.cat((self.conv4_0(x), x),1)
        x = torch.cat((self.conv4_1(x), x),1)
        x = torch.cat((self.conv4_2(x), x),1)
        x = torch.cat((self.conv4_3(x), x),1)
        x = torch.cat((self.conv4_4(x), x),1)
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        # Level 3
        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.corr(c13, warp3)
        corr3 = self.leakyRELU(corr3)
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = torch.cat((self.conv3_0(x), x),1)
        x = torch.cat((self.conv3_1(x), x),1)
        x = torch.cat((self.conv3_2(x), x),1)
        x = torch.cat((self.conv3_3(x), x),1)
        x = torch.cat((self.conv3_4(x), x),1)
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        # Level 2
        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.corr(c12, warp2)
        corr2 = self.leakyRELU(corr2)
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = torch.cat((self.conv2_0(x), x),1)
        x = torch.cat((self.conv2_1(x), x),1)
        x = torch.cat((self.conv2_2(x), x),1)
        x = torch.cat((self.conv2_3(x), x),1)
        x = torch.cat((self.conv2_4(x), x),1)
        flow2 = self.predict_flow2(x)

        # Refinement: add the residual predicted by the dilated-convolution head.
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        # we don't have the gt for flow, we just fine tune it on flownets
        if not output_more:
            return flow2
        else:
            return [flow2,flow3,flow4,flow5,flow6]
class PWCDCNet_old(nn.Module):
"""
PWC-DC net. add dilation convolution and densenet connections
"""
def __init__(self, md=4):
"""
input: md --- maximum displacement (for correlation. default: 4), after warpping
"""
super(PWCDCNet_old,self).__init__()
self.conv1a = conv(3, 16, kernel_size=3, stride=2)
self.conv1b = conv(16, 16, kernel_size=3, stride=1)
self.conv2a = conv(16, 32, kernel_size=3, stride=2)
self.conv2b = conv(32, 32, kernel_size=3, stride=1)
self.conv3a = conv(32, 64, kernel_size=3, stride=2)
self.conv3b = conv(64, 64, kernel_size=3, stride=1)
self.conv4a = conv(64, 96, kernel_size=3, stride=2)
self.conv4b = conv(96, 96, kernel_size=3, stride=1)
self.conv5a = conv(96, 128, kernel_size=3, stride=2)
self.conv5b = conv(128,128, kernel_size=3, stride=1)
self.conv6a = conv(128,196, kernel_size=3, stride=2)
self.conv6b = conv(196,196, kernel_size=3, stride=1)
self.corr = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
self.leakyRELU = nn.LeakyReLU(0.1)
nd = (2*md+1)**2
dd = np.cumsum([128,128,96,64,32])
od = nd
self.conv6_0 = conv(od, 128, kernel_size=3, stride=1)
self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
self.conv6_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
self.conv6_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
self.conv6_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
self.predict_flow6 = predict_flow(od+dd[4])
self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)
od = nd+128+4
self.conv5_0 = conv(od, 128, kernel_size=3, stride=1)
self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
self.conv5_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
self.conv5_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
self.conv5_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
self.predict_flow5 = predict_flow(od+dd[4])
self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)
od = nd+96+4
self.conv4_0 = conv(od, 128, kernel_size=3, stride=1)
self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
self.conv4_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
self.conv4_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
self.conv4_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
self.predict_flow4 = predict_flow(od+dd[4])
self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)
od = nd+64+4
self.conv3_0 = conv(od, 128, kernel_size=3, stride=1)
self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
self.conv3_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
self.conv3_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
self.conv3_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
self.predict_flow3 = predict_flow(od+dd[4])
self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)
od = nd+32+4
self.conv2_0 = conv(od, 128, kernel_size=3, stride=1)
self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
self.conv2_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
self.conv2_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
self.conv2_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
self.predict_flow2 = predict_flow(od+dd[4])
self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1, dilation=1)
self.dc_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2)
self.dc_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4)
self.dc_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8)
self.dc_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16)
self.dc_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1)
self.dc_conv7 = predict_flow(32)
for m in self.modules():
if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
nn.init.kaiming_normal(m.weight.data, mode='fan_in')
if m.bias is not None:
m.bias.data.zero_()
def warp(self, x, flo):
"""
warp an image/tensor (im2) back to im1, according to the optical flow
x: [B, C, H, W] (im2)
flo: [B, 2, H, W] flow
"""
B, C, H, W = x.size()
# mesh grid
xx = torch.arange(0, W).view(1,-1).repeat(H,1)
yy = torch.arange(0, H).view(-1,1).repeat(1,W)
xx = xx.view(1,1,H,W).repeat(B,1,1,1)
yy = yy.view(1,1,H,W).repeat(B,1,1,1)
grid = torch.cat((xx,yy),1).float()
if x.is_cuda:
grid = grid.cuda()
vgrid = Variable(grid) + flo
# scale grid to [-1,1]
vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0
vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0
vgrid = vgrid.permute(0,2,3,1)
output = nn.functional.grid_sample(x, vgrid)
mask = torch.autograd.Variable(torch.ones(x.size())).cuda()
mask = nn.functional.grid_sample(mask, vgrid)
mask[mask<0.999] = 0
mask[mask>0] = 1
return output*mask
def forward(self,x):
"""
Estimate optical flow for an image pair with a coarse-to-fine pyramid.

x: tensor whose first 3 channels are im1 and whose remaining channels are im2.

Returns (flow2, flow3, flow4, flow5, flow6) while self.training is set,
otherwise only flow2, i.e. the finest estimate after the output of the
dilated-convolution context network has been added to it.
"""
# Split the stacked pair into the two frames.
im1 = x[:,:3,:,:]
im2 = x[:,3:,:,:]
# Six-level feature pyramid; both frames go through the same layers.
c11 = self.conv1b(self.conv1a(im1))
c21 = self.conv1b(self.conv1a(im2))
c12 = self.conv2b(self.conv2a(c11))
c22 = self.conv2b(self.conv2a(c21))
c13 = self.conv3b(self.conv3a(c12))
c23 = self.conv3b(self.conv3a(c22))
c14 = self.conv4b(self.conv4a(c13))
c24 = self.conv4b(self.conv4a(c23))
c15 = self.conv5b(self.conv5a(c14))
c25 = self.conv5b(self.conv5a(c24))
c16 = self.conv6b(self.conv6a(c15))
c26 = self.conv6b(self.conv6a(c25))
# Level 6 (coarsest): correlate the raw features, no warping yet.
corr6 = self.corr(c16, c26)
corr6 = self.leakyRELU(corr6)
# Dense block: each conv output is concatenated with everything before it.
# NOTE(review): conv*_1 places its output BEFORE the running tensor while the
# other layers append AFTER it; this fixes the channel layout the following
# layers consume, presumably to match the "old" pretrained checkpoint —
# do not reorder without confirming.
x = torch.cat((corr6, self.conv6_0(corr6)),1)
x = torch.cat((self.conv6_1(x), x),1)
x = torch.cat((x, self.conv6_2(x)),1)
x = torch.cat((x, self.conv6_3(x)),1)
x = torch.cat((x, self.conv6_4(x)),1)
flow6 = self.predict_flow6(x)
# Upsample flow and features for use at the next finer level.
up_flow6 = self.deconv6(flow6)
up_feat6 = self.upfeat6(x)
# Level 5: warp frame-2 features with the upsampled flow, then correlate.
# NOTE(review): the factors 0.625 / 1.25 / 2.5 / 5.0 double per level;
# presumably 20 / 2**level, i.e. flow is predicted divided by 20 — TODO confirm.
warp5 = self.warp(c25, up_flow6*0.625)
corr5 = self.corr(c15, warp5)
corr5 = self.leakyRELU(corr5)
x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
x = torch.cat((x, self.conv5_0(x)),1)
x = torch.cat((self.conv5_1(x), x),1)
x = torch.cat((x, self.conv5_2(x)),1)
x = torch.cat((x, self.conv5_3(x)),1)
x = torch.cat((x, self.conv5_4(x)),1)
flow5 = self.predict_flow5(x)
up_flow5 = self.deconv5(flow5)
up_feat5 = self.upfeat5(x)
# Level 4
warp4 = self.warp(c24, up_flow5*1.25)
corr4 = self.corr(c14, warp4)
corr4 = self.leakyRELU(corr4)
x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
x = torch.cat((x, self.conv4_0(x)),1)
x = torch.cat((self.conv4_1(x), x),1)
x = torch.cat((x, self.conv4_2(x)),1)
x = torch.cat((x, self.conv4_3(x)),1)
x = torch.cat((x, self.conv4_4(x)),1)
flow4 = self.predict_flow4(x)
up_flow4 = self.deconv4(flow4)
up_feat4 = self.upfeat4(x)
# Level 3
warp3 = self.warp(c23, up_flow4*2.5)
corr3 = self.corr(c13, warp3)
corr3 = self.leakyRELU(corr3)
x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
x = torch.cat((x, self.conv3_0(x)),1)
x = torch.cat((self.conv3_1(x), x),1)
x = torch.cat((x, self.conv3_2(x)),1)
x = torch.cat((x, self.conv3_3(x)),1)
x = torch.cat((x, self.conv3_4(x)),1)
flow3 = self.predict_flow3(x)
up_flow3 = self.deconv3(flow3)
up_feat3 = self.upfeat3(x)
# Level 2 (finest level that is estimated)
warp2 = self.warp(c22, up_flow3*5.0)
corr2 = self.corr(c12, warp2)
corr2 = self.leakyRELU(corr2)
x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
x = torch.cat((x, self.conv2_0(x)),1)
x = torch.cat((self.conv2_1(x), x),1)
x = torch.cat((x, self.conv2_2(x)),1)
x = torch.cat((x, self.conv2_3(x)),1)
x = torch.cat((x, self.conv2_4(x)),1)
flow2 = self.predict_flow2(x)
# Context network: a chain of dilated convolutions over the level-2 features
# whose 2-channel output is added to flow2 in place.
x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))
# Training exposes every pyramid level; evaluation only the finest flow.
if self.training:
return flow2,flow3,flow4,flow5,flow6
else:
return flow2
def pwc_dc_net(path=None):
    """
    Build a PWCDCNet and optionally initialise it from a checkpoint.

    path: checkpoint file readable by torch.load. The file may hold either a
          bare state dict or a dict that stores it under the 'state_dict'
          key. With None the freshly constructed weights are kept.

    Returns the model.
    """
    model = PWCDCNet()
    if path is None:
        return model

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location keeps every loaded tensor on the CPU so that a checkpoint
    # saved from a GPU can also be read on a machine without CUDA;
    # load_state_dict copies the values into the model's own parameters.
    data = torch.load(path, map_location=lambda storage, loc: storage)
    if 'state_dict' in data:
        model.load_state_dict(data['state_dict'])
    else:
        model.load_state_dict(data)
    return model
def pwc_dc_net_old(path=None):
    """
    Build a PWCDCNet_old and optionally initialise it from a checkpoint.

    path: checkpoint file readable by torch.load. The file may hold either a
          bare state dict or a dict that stores it under the 'state_dict'
          key. With None the freshly constructed weights are kept.

    Returns the model.
    """
    model = PWCDCNet_old()
    if path is None:
        return model

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location keeps every loaded tensor on the CPU so that a checkpoint
    # saved from a GPU can also be read on a machine without CUDA;
    # load_state_dict copies the values into the model's own parameters.
    data = torch.load(path, map_location=lambda storage, loc: storage)
    if 'state_dict' in data:
        model.load_state_dict(data['state_dict'])
    else:
        model.load_state_dict(data)
    return model
================================================
FILE: PWCNet/__init__.py
================================================
from .PWCNet import *
================================================
FILE: PWCNet/correlation_package_pytorch1_0/__init__.py
================================================
================================================
FILE: PWCNet/correlation_package_pytorch1_0/build.sh
================================================
#!/usr/bin/env bash
# Builds and installs the correlation_cuda extension via setup.py.
# Run it from inside this directory: every path below is relative to the
# current working directory.
echo "Need pytorch>=1.0.0"
# Presumably activates a conda environment named pytorch1.0.0 — adjust to
# the local setup.
source activate pytorch1.0.0
# setup.py imports the module compiler_args; presumably it lives in
# my_package, hence the extra PYTHONPATH entry — TODO confirm.
export PYTHONPATH=$PYTHONPATH:$(pwd)/../../my_package
# Remove the artefacts of earlier builds before installing again.
rm -rf build *.egg-info dist
python setup.py install
================================================
FILE: PWCNet/correlation_package_pytorch1_0/clean.sh
================================================
#!/usr/bin/env bash
# Removes the build artefacts (build/, *.egg-info, dist/) found in the
# current working directory; it does not rebuild anything.
echo "Need pytorch>=1.0.0"
# Presumably activates a conda environment named pytorch1.0.0 — adjust to
# the local setup.
source activate pytorch1.0.0
rm -rf build *.egg-info dist
# The install step is intentionally left disabled in this script.
#python setup.py install
================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation.py
================================================
import torch
from torch.nn.modules.module import Module
from torch.autograd import Function
import correlation_cuda
class CorrelationFunction(Function):
    """
    Autograd function that delegates to the `correlation_cuda` extension.

    The correlation settings are stored on the instance by __init__, and
    forward/backward are instance methods (the legacy autograd style).
    NOTE(review): newer PyTorch releases expect static forward/backward that
    take a ctx argument — confirm the targeted PyTorch version before use.
    """

    def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        super().__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        """Return the correlation of input1 with input2 computed by the extension."""
        self.save_for_backward(input1, input2)

        with torch.cuda.device_of(input1):
            # Empty tensors of the inputs' type; the extension resizes and
            # fills them (padded scratch copies and the result).
            scratch1 = input1.new()
            scratch2 = input2.new()
            result = input1.new()

            correlation_cuda.forward(
                input1, input2, scratch1, scratch2, result,
                self.pad_size, self.kernel_size, self.max_displacement,
                self.stride1, self.stride2, self.corr_multiply)

        return result

    def backward(self, grad_output):
        """Return the gradients with respect to input1 and input2."""
        first, second = self.saved_tensors

        with torch.cuda.device_of(first):
            # Scratch buffers and gradient holders, resized by the extension.
            scratch1 = first.new()
            scratch2 = second.new()
            grad_first = first.new()
            grad_second = second.new()

            correlation_cuda.backward(
                first, second, scratch1, scratch2, grad_output, grad_first, grad_second,
                self.pad_size, self.kernel_size, self.max_displacement,
                self.stride1, self.stride2, self.corr_multiply)

        return grad_first, grad_second
class Correlation(Module):
    """
    nn.Module front-end for CorrelationFunction.

    The constructor only records the correlation settings; every forward
    call creates a new CorrelationFunction with those settings and applies
    it to the two inputs.
    """

    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):
        super().__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        """Correlate input1 with input2 using the stored settings."""
        correlate = CorrelationFunction(
            self.pad_size, self.kernel_size, self.max_displacement,
            self.stride1, self.stride2, self.corr_multiply)
        return correlate(input1, input2)
================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "correlation_cuda_kernel.cuh"
// Host entry point of the correlation forward pass (bound to Python as "forward").
// Resizes and zero-fills rInput1/rInput2 to (batch, paddedHeight, paddedWidth, channels)
// and output to (batch, nOutputChannels, outputHeight, outputWidth), then hands
// everything to correlation_forward_cuda_kernel together with the current CUDA stream.
// Returns 1 on success; a failure reported by the kernel wrapper is raised via AT_ERROR.
// NOTE(review): the static_cast calls below carry no target type — presumably the
// template argument was lost when this listing was produced; confirm against the repository.
int correlation_forward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& output,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2,
int corr_type_multiply)
{
int batchSize = input1.size(0);
int nInputChannels = input1.size(1);
int inputHeight = input1.size(2);
int inputWidth = input1.size(3);
int kernel_radius = (kernel_size - 1) / 2;
// Margin that is excluded on every side of the padded input.
int border_radius = kernel_radius + max_displacement;
int paddedInputHeight = inputHeight + 2 * pad_size;
int paddedInputWidth = inputWidth + 2 * pad_size;
// One output channel per displacement on a square grid sampled every stride2 pixels.
int nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1);
int outputHeight = ceil(static_cast(paddedInputHeight - 2 * border_radius) / static_cast(stride1));
int outputwidth = ceil(static_cast(paddedInputWidth - 2 * border_radius) / static_cast(stride1));
rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth});
// Zero fill so that the padding border of the rearranged inputs stays zero.
rInput1.fill_(0);
rInput2.fill_(0);
output.fill_(0);
int success = correlation_forward_cuda_kernel(
output,
output.size(0),
output.size(1),
output.size(2),
output.size(3),
output.stride(0),
output.stride(1),
output.stride(2),
output.stride(3),
input1,
input1.size(1),
input1.size(2),
input1.size(3),
input1.stride(0),
input1.stride(1),
input1.stride(2),
input1.stride(3),
input2,
input2.size(1),
input2.stride(0),
input2.stride(1),
input2.stride(2),
input2.stride(3),
rInput1,
rInput2,
pad_size,
kernel_size,
max_displacement,
stride1,
stride2,
corr_type_multiply,
// at::globalContext().getCurrentCUDAStream() //works for 0.4.1
at::cuda::getCurrentCUDAStream() //works for 1.0.0
);
//check for errors
if (!success) {
AT_ERROR("CUDA call failed");
}
return 1;
}
// Host entry point of the correlation backward pass (bound to Python as "backward").
// Resizes and zero-fills the padded scratch tensors rInput1/rInput2 and the gradient
// tensors gradInput1/gradInput2 (same shape as input1), then calls
// correlation_backward_cuda_kernel with the current CUDA stream.
// Returns 1 on success; a failure reported by the kernel wrapper is raised via AT_ERROR.
int correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput,
at::Tensor& gradInput1, at::Tensor& gradInput2,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2,
int corr_type_multiply)
{
int batchSize = input1.size(0);
int nInputChannels = input1.size(1);
int paddedInputHeight = input1.size(2)+ 2 * pad_size;
int paddedInputWidth = input1.size(3)+ 2 * pad_size;
int height = input1.size(2);
int width = input1.size(3);
rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
gradInput1.resize_({batchSize, nInputChannels, height, width});
gradInput2.resize_({batchSize, nInputChannels, height, width});
// The backward kernels return early for some positions and rely on the
// gradients already being zero there.
rInput1.fill_(0);
rInput2.fill_(0);
gradInput1.fill_(0);
gradInput2.fill_(0);
int success = correlation_backward_cuda_kernel(gradOutput,
gradOutput.size(0),
gradOutput.size(1),
gradOutput.size(2),
gradOutput.size(3),
gradOutput.stride(0),
gradOutput.stride(1),
gradOutput.stride(2),
gradOutput.stride(3),
input1,
input1.size(1),
input1.size(2),
input1.size(3),
input1.stride(0),
input1.stride(1),
input1.stride(2),
input1.stride(3),
input2,
input2.stride(0),
input2.stride(1),
input2.stride(2),
input2.stride(3),
gradInput1,
gradInput1.stride(0),
gradInput1.stride(1),
gradInput1.stride(2),
gradInput1.stride(3),
gradInput2,
gradInput2.size(1),
gradInput2.stride(0),
gradInput2.stride(1),
gradInput2.stride(2),
gradInput2.stride(3),
rInput1,
rInput2,
pad_size,
kernel_size,
max_displacement,
stride1,
stride2,
corr_type_multiply,
// at::globalContext().getCurrentCUDAStream() //works for 0.4.1
at::cuda::getCurrentCUDAStream() //works for 1.0.0
);
if (!success) {
AT_ERROR("CUDA call failed");
}
return 1;
}
// Python bindings of the extension module: "forward" and "backward" map to the
// two host entry points defined above.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &correlation_forward_cuda, "Correlation forward (CUDA)");
m.def("backward", &correlation_backward_cuda, "Correlation backward (CUDA)");
}
================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cu
================================================
#include
#include "correlation_cuda_kernel.cuh"
#define CUDA_NUM_THREADS 1024
#define THREADS_PER_BLOCK 32
#define FULL_MASK 0xffffffff
#include
#include
#include
#include
using at::Half;
// Sums val across a warp with shuffle-down steps of 16, 8, 4, 2 and 1 lanes;
// after the loop lane 0 holds the total of all 32 lanes.
// NOTE(review): the template parameter list is missing after `template` —
// presumably lost when this listing was produced; confirm against the repository.
template
__forceinline__ __device__ scalar_t warpReduceSum(scalar_t val) {
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(FULL_MASK, val, offset);
return val;
}
// Sums val across a thread block: every warp reduces its own values, lane 0 of
// each warp stores the partial sum in shared memory, and the first warp then
// reduces those partial sums.
// NOTE(review): the 32-entry shared array presumably assumes at most 32 warps
// per block and blockDim.x being a multiple of warpSize — TODO confirm.
template
__forceinline__ __device__ scalar_t blockReduceSum(scalar_t val) {
static __shared__ scalar_t shared[32];
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val);
if (lane == 0)
shared[wid] = val;
// All partial sums must be written before any of them is read back.
__syncthreads();
// Only as many threads as there are warps pick up a partial sum; the rest contribute 0.
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid == 0)
val = warpReduceSum(val);
return val;
}
// Copies input, indexed as (n, c, y, x), into rinput, indexed as
// (n, y + pad_size, x + pad_size, c): the spatial position is shifted by the
// padding and the channel becomes the fastest-varying index.
// The block index supplies n (blockIdx.x), y (blockIdx.y) and x (blockIdx.z);
// the threads of a block step over the channels in strides of THREADS_PER_BLOCK.
// The padding border of rinput is not written here.
template
__global__ void channels_first(const scalar_t* __restrict__ input, scalar_t* rinput, int channels, int height, int width, int pad_size)
{
// n (batch size), c (num of channels), y (height), x (width)
int n = blockIdx.x;
int y = blockIdx.y;
int x = blockIdx.z;
int ch_off = threadIdx.x;
scalar_t value;
// Element counts per batch item / per channel plane of the unpadded input.
int dimcyx = channels * height * width;
int dimyx = height * width;
// Sizes of the padded, channel-last destination.
int p_dimx = (width + 2 * pad_size);
int p_dimy = (height + 2 * pad_size);
int p_dimyxc = channels * p_dimy * p_dimx;
int p_dimxc = p_dimx * channels;
for (int c = ch_off; c < channels; c += THREADS_PER_BLOCK) {
value = input[n * dimcyx + c * dimyx + y * width + x];
rinput[n * p_dimyxc + (y + pad_size) * p_dimxc + (x + pad_size) * channels + c] = value;
}
}
// Forward correlation kernel. Each block handles one output position:
// blockIdx.x is the batch item, blockIdx.y the output row and blockIdx.z the
// output column. For every displacement (tj, ti) the threads accumulate the
// products of the rInput1 patch around (y1, x1) and the rInput2 patch around
// the displaced position over the kernel window and all channels, reduce the
// partial sums across the block, and thread 0 writes the sum divided by
// nelems (kernel_size * kernel_size * channels).
// rInput1/rInput2 are indexed as padded (n, y, x, c) buffers.
// NOTE(review): static_cast below has no target type — presumably lost when
// this listing was produced; confirm against the repository.
template
__global__ void correlation_forward(scalar_t* __restrict__ output, const int nOutputChannels,
const int outputHeight, const int outputWidth, const scalar_t* __restrict__ rInput1,
const int nInputChannels, const int inputHeight, const int inputWidth,
const scalar_t* __restrict__ rInput2, const int pad_size, const int kernel_size,
const int max_displacement, const int stride1, const int stride2) {
int32_t pInputWidth = inputWidth + 2 * pad_size;
int32_t pInputHeight = inputHeight + 2 * pad_size;
int32_t kernel_rad = (kernel_size - 1) / 2;
int32_t displacement_rad = max_displacement / stride2;
int32_t displacement_size = 2 * displacement_rad + 1;
int32_t n = blockIdx.x;
// Centre of the first-input patch in padded coordinates.
int32_t y1 = blockIdx.y * stride1 + max_displacement;
int32_t x1 = blockIdx.z * stride1 + max_displacement;
int32_t c = threadIdx.x;
// Element counts used to index the padded (n, y, x, c) inputs ...
int32_t pdimyxc = pInputHeight * pInputWidth * nInputChannels;
int32_t pdimxc = pInputWidth * nInputChannels;
int32_t pdimc = nInputChannels;
// ... and the (n, c, y, x) output.
int32_t tdimcyx = nOutputChannels * outputHeight * outputWidth;
int32_t tdimyx = outputHeight * outputWidth;
int32_t tdimx = outputWidth;
int32_t nelems = kernel_size * kernel_size * pdimc;
// element-wise product along channel axis
for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
// Centre of the displaced patch in the second input.
int x2 = x1 + ti * stride2;
int y2 = y1 + tj * stride2;
float acc0 = 0.0f;
for (int j = -kernel_rad; j <= kernel_rad; ++j) {
for (int i = -kernel_rad; i <= kernel_rad; ++i) {
// THREADS_PER_BLOCK
#pragma unroll
for (int ch = c; ch < pdimc; ch += blockDim.x) {
int indx1 = n * pdimyxc + (y1 + j) * pdimxc
+ (x1 + i) * pdimc + ch;
int indx2 = n * pdimyxc + (y2 + j) * pdimxc
+ (x2 + i) * pdimc + ch;
acc0 += static_cast(rInput1[indx1] * rInput2[indx2]);
}
}
}
// A single-warp block only needs the warp-level reduction.
if (blockDim.x == warpSize) {
__syncwarp();
acc0 = warpReduceSum(acc0);
} else {
__syncthreads();
acc0 = blockReduceSum(acc0);
}
if (threadIdx.x == 0) {
// Output channel index of this displacement (row-major over tj, ti).
int tc = (tj + displacement_rad) * displacement_size
+ (ti + displacement_rad);
const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx
+ blockIdx.z;
output[tindx] = static_cast(acc0 / nelems);
}
}
}
}
// Gradient of the correlation with respect to the first input, for batch item
// `item`. Each block handles one input element: blockIdx.x is the row,
// blockIdx.y the column and blockIdx.z the channel. The threads step over the
// output channels (displacements) in strides of THREADS_PER_BLOCK, accumulate
// gradOutput * rInput2 over the output positions [ymin..ymax] x [xmin..xmax],
// and thread 0 sums the per-thread partials and writes the result divided by
// nelems. Relies on gradInput1 being zero-filled beforehand (early returns).
template
__global__ void correlation_backward_input1(int item, scalar_t* gradInput1, int nInputChannels, int inputHeight, int inputWidth,
const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
const scalar_t* __restrict__ rInput2,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2)
{
// n (batch size), c (num of channels), y (height), x (width)
int n = item;
// Position in padded coordinates.
int y = blockIdx.x * stride1 + pad_size;
int x = blockIdx.y * stride1 + pad_size;
int c = blockIdx.z;
int tch_off = threadIdx.x;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
// Range of output positions whose kernel window covers (y, x).
int xmin = (x - kernel_rad - max_displacement) / stride1;
int ymin = (y - kernel_rad - max_displacement) / stride1;
int xmax = (x + kernel_rad - max_displacement) / stride1;
int ymax = (y + kernel_rad - max_displacement) / stride1;
if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
// assumes gradInput1 is pre-allocated and zero filled
return;
}
if (xmin > xmax || ymin > ymax) {
// assumes gradInput1 is pre-allocated and zero filled
return;
}
// Clamp the range to the valid output area.
xmin = max(0,xmin);
xmax = min(outputWidth-1,xmax);
ymin = max(0,ymin);
ymax = min(outputHeight-1,ymax);
int pInputWidth = inputWidth + 2 * pad_size;
int pInputHeight = inputHeight + 2 * pad_size;
// Element counts for the padded (n, y, x, c) input, the (n, c, y, x)
// gradOutput and the (n, c, y, x) gradInput1.
int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
int pdimxc = pInputWidth * nInputChannels;
int pdimc = nInputChannels;
int tdimcyx = nOutputChannels * outputHeight * outputWidth;
int tdimyx = outputHeight * outputWidth;
int tdimx = outputWidth;
int odimcyx = nInputChannels * inputHeight* inputWidth;
int odimyx = inputHeight * inputWidth;
int odimx = inputWidth;
scalar_t nelems = kernel_size * kernel_size * nInputChannels;
// One partial sum per thread, kept in shared memory.
__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
prod_sum[tch_off] = 0;
for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
// Displacement encoded by output channel tc.
int i2 = (tc % displacement_size - displacement_rad) * stride2;
int j2 = (tc / displacement_size - displacement_rad) * stride2;
int indx2 = n * pdimyxc + (y + j2)* pdimxc + (x + i2) * pdimc + c;
scalar_t val2 = rInput2[indx2];
for (int j = ymin; j <= ymax; ++j) {
for (int i = xmin; i <= xmax; ++i) {
int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
prod_sum[tch_off] += gradOutput[tindx] * val2;
}
}
}
__syncthreads();
// Thread 0 adds up the partial sums serially.
if(tch_off == 0) {
scalar_t reduce_sum = 0;
for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
reduce_sum += prod_sum[idx];
}
const int indx1 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
gradInput1[indx1] = reduce_sum / nelems;
}
}
// Gradient of the correlation with respect to the second input, for batch item
// `item`. Each block handles one input element: blockIdx.x is the row,
// blockIdx.y the column and blockIdx.z the channel. Unlike the input1 kernel,
// the range of contributing output positions depends on the displacement, so
// it is recomputed per output channel and the rInput1 value is read at the
// position shifted by the negated displacement. Thread 0 sums the per-thread
// partials and writes the result divided by nelems.
template
__global__ void correlation_backward_input2(int item, scalar_t* gradInput2, int nInputChannels, int inputHeight, int inputWidth,
const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
const scalar_t* __restrict__ rInput1,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2)
{
// n (batch size), c (num of channels), y (height), x (width)
int n = item;
// Position in padded coordinates.
int y = blockIdx.x * stride1 + pad_size;
int x = blockIdx.y * stride1 + pad_size;
int c = blockIdx.z;
int tch_off = threadIdx.x;
int kernel_rad = (kernel_size - 1) / 2;
int displacement_rad = max_displacement / stride2;
int displacement_size = 2 * displacement_rad + 1;
int pInputWidth = inputWidth + 2 * pad_size;
int pInputHeight = inputHeight + 2 * pad_size;
// Element counts for the padded (n, y, x, c) input, the (n, c, y, x)
// gradOutput and the (n, c, y, x) gradInput2.
int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
int pdimxc = pInputWidth * nInputChannels;
int pdimc = nInputChannels;
int tdimcyx = nOutputChannels * outputHeight * outputWidth;
int tdimyx = outputHeight * outputWidth;
int tdimx = outputWidth;
int odimcyx = nInputChannels * inputHeight* inputWidth;
int odimyx = inputHeight * inputWidth;
int odimx = inputWidth;
scalar_t nelems = kernel_size * kernel_size * nInputChannels;
// One partial sum per thread, kept in shared memory.
__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
prod_sum[tch_off] = 0;
for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
// Displacement encoded by output channel tc.
int i2 = (tc % displacement_size - displacement_rad) * stride2;
int j2 = (tc / displacement_size - displacement_rad) * stride2;
// Range of output positions that read (y, x) of input2 under this displacement.
int xmin = (x - kernel_rad - max_displacement - i2) / stride1;
int ymin = (y - kernel_rad - max_displacement - j2) / stride1;
int xmax = (x + kernel_rad - max_displacement - i2) / stride1;
int ymax = (y + kernel_rad - max_displacement - j2) / stride1;
if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
// assumes gradInput2 is pre-allocated and zero filled
continue;
}
if (xmin > xmax || ymin > ymax) {
// assumes gradInput2 is pre-allocated and zero filled
continue;
}
// Clamp the range to the valid output area.
xmin = max(0,xmin);
xmax = min(outputWidth-1,xmax);
ymin = max(0,ymin);
ymax = min(outputHeight-1,ymax);
int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;
scalar_t val1 = rInput1[indx1];
for (int j = ymin; j <= ymax; ++j) {
for (int i = xmin; i <= xmax; ++i) {
int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
prod_sum[tch_off] += gradOutput[tindx] * val1;
}
}
}
__syncthreads();
// Thread 0 adds up the partial sums serially.
if(tch_off == 0) {
scalar_t reduce_sum = 0;
for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
reduce_sum += prod_sum[idx];
}
const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
gradInput2[indx2] = reduce_sum / nelems;
}
}
// Launches the forward pass: rearranges input1 and input2 into the padded
// scratch tensors rInput1/rInput2 with channels_first, then runs
// correlation_forward to fill output.
// Returns 1 on success, or prints the CUDA error and returns 0.
// Only ob, oc, oh, ow, ic, ih and iw of the size/stride arguments are read in
// this body; the stride arguments, gc and corr_type_multiply are not referenced.
// NOTE(review): the kernel launch configurations (`<<>>`) and the type argument
// of `.data()` appear to have been lost when this listing was produced;
// presumably the launches use the dim3 values and `stream` defined here —
// confirm against the repository.
int correlation_forward_cuda_kernel(at::Tensor& output,
int ob,
int oc,
int oh,
int ow,
int osb,
int osc,
int osh,
int osw,
at::Tensor& input1,
int ic,
int ih,
int iw,
int isb,
int isc,
int ish,
int isw,
at::Tensor& input2,
int gc,
int gsb,
int gsc,
int gsh,
int gsw,
at::Tensor& rInput1,
at::Tensor& rInput2,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2,
int corr_type_multiply,
cudaStream_t stream)
{
int batchSize = ob;
int nInputChannels = ic;
int inputWidth = iw;
int inputHeight = ih;
int nOutputChannels = oc;
int outputWidth = ow;
int outputHeight = oh;
// One block per (batch item, input row, input column) for the rearrangement.
dim3 blocks_grid(batchSize, inputHeight, inputWidth);
dim3 threads_block(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_fwd_1", ([&] {
channels_first<<>>(
input1.data(), rInput1.data(), nInputChannels, inputHeight, inputWidth, pad_size);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_fwd_2", ([&] {
channels_first<<>> (
input2.data(), rInput2.data(), nInputChannels, inputHeight, inputWidth, pad_size);
}));
// One block per (batch item, output row, output column) for the correlation.
dim3 threadsPerBlock(THREADS_PER_BLOCK);
dim3 totalBlocksCorr(batchSize, outputHeight, outputWidth);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "correlation_forward", ([&] {
correlation_forward<<>>
(output.data(), nOutputChannels, outputHeight, outputWidth,
rInput1.data(), nInputChannels, inputHeight, inputWidth,
rInput2.data(),
pad_size,
kernel_size,
max_displacement,
stride1,
stride2);
}));
cudaError_t err = cudaGetLastError();
// check for errors
if (err != cudaSuccess) {
printf("error in correlation_forward_cuda_kernel: %s\n", cudaGetErrorString(err));
return 0;
}
return 1;
}
// Launches the backward pass: rearranges input1 and input2 into the padded
// scratch tensors with channels_first, then, once per batch item, runs
// correlation_backward_input1 (fills gradInput1 using rInput2) and
// correlation_backward_input2 (fills gradInput2 using rInput1).
// Returns 1 on success, or prints the CUDA error and returns 0.
// Only gob, goc, goh, gow, ic, ih and iw of the size/stride arguments are read
// in this body; the stride arguments, ggc and corr_type_multiply are not referenced.
// NOTE(review): the kernel launch configurations (`<<>>`) and the type argument
// of `.data()` appear to have been lost when this listing was produced —
// confirm against the repository.
// NOTE(review): every dispatch below is labelled "lltm_forward_cuda", which
// does not name these kernels; presumably a leftover from another example.
int correlation_backward_cuda_kernel(
at::Tensor& gradOutput,
int gob,
int goc,
int goh,
int gow,
int gosb,
int gosc,
int gosh,
int gosw,
at::Tensor& input1,
int ic,
int ih,
int iw,
int isb,
int isc,
int ish,
int isw,
at::Tensor& input2,
int gsb,
int gsc,
int gsh,
int gsw,
at::Tensor& gradInput1,
int gisb,
int gisc,
int gish,
int gisw,
at::Tensor& gradInput2,
int ggc,
int ggsb,
int ggsc,
int ggsh,
int ggsw,
at::Tensor& rInput1,
at::Tensor& rInput2,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2,
int corr_type_multiply,
cudaStream_t stream)
{
int batchSize = gob;
int num = batchSize;
int nInputChannels = ic;
int inputWidth = iw;
int inputHeight = ih;
int nOutputChannels = goc;
int outputWidth = gow;
int outputHeight = goh;
// One block per (batch item, input row, input column) for the rearrangement.
dim3 blocks_grid(batchSize, inputHeight, inputWidth);
dim3 threads_block(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "lltm_forward_cuda", ([&] {
channels_first<<>>(
input1.data(),
rInput1.data(),
nInputChannels,
inputHeight,
inputWidth,
pad_size
);
}));
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "lltm_forward_cuda", ([&] {
channels_first<<>>(
input2.data(),
rInput2.data(),
nInputChannels,
inputHeight,
inputWidth,
pad_size
);
}));
// One block per (input row, input column, channel) for the gradient kernels.
dim3 threadsPerBlock(THREADS_PER_BLOCK);
dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);
// Gradient with respect to input1, one launch per batch item.
for (int n = 0; n < num; ++n) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "lltm_forward_cuda", ([&] {
correlation_backward_input1<<>> (
n, gradInput1.data(), nInputChannels, inputHeight, inputWidth,
gradOutput.data(), nOutputChannels, outputHeight, outputWidth,
rInput2.data(),
pad_size,
kernel_size,
max_displacement,
stride1,
stride2);
}));
}
// Gradient with respect to input2, one launch per batch item.
for(int n = 0; n < batchSize; n++) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), "lltm_forward_cuda", ([&] {
correlation_backward_input2<<>>(
n, gradInput2.data(), nInputChannels, inputHeight, inputWidth,
gradOutput.data(), nOutputChannels, outputHeight, outputWidth,
rInput1.data(),
pad_size,
kernel_size,
max_displacement,
stride1,
stride2);
}));
}
// check for errors
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in correlation_backward_cuda_kernel: %s\n", cudaGetErrorString(err));
return 0;
}
return 1;
}
================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Declaration of the forward launcher implemented in correlation_cuda_kernel.cu.
// The caller in correlation_cuda.cc passes, in order: the sizes and strides of
// output (ob, oc, oh, ow, osb..osw), the channel/height/width and strides of
// input1 (ic, ih, iw, isb..isw), and the channel count and strides of input2
// (gc, gsb..gsw), followed by the scratch tensors, the correlation settings
// and the CUDA stream.
int correlation_forward_cuda_kernel(at::Tensor& output,
int ob,
int oc,
int oh,
int ow,
int osb,
int osc,
int osh,
int osw,
at::Tensor& input1,
int ic,
int ih,
int iw,
int isb,
int isc,
int ish,
int isw,
at::Tensor& input2,
int gc,
int gsb,
int gsc,
int gsh,
int gsw,
at::Tensor& rInput1,
at::Tensor& rInput2,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2,
int corr_type_multiply,
cudaStream_t stream);
// Declaration of the backward launcher implemented in correlation_cuda_kernel.cu.
// The caller in correlation_cuda.cc passes, in order: the sizes and strides of
// gradOutput (gob, goc, goh, gow, gosb..gosw), the channel/height/width and
// strides of input1 (ic, ih, iw, isb..isw), the strides of input2 (gsb..gsw),
// the strides of gradInput1 (gisb..gisw), and the channel count and strides of
// gradInput2 (ggc, ggsb..ggsw), followed by the scratch tensors, the
// correlation settings and the CUDA stream.
int correlation_backward_cuda_kernel(
at::Tensor& gradOutput,
int gob,
int goc,
int goh,
int gow,
int gosb,
int gosc,
int gosh,
int gosw,
at::Tensor& input1,
int ic,
int ih,
int iw,
int isb,
int isc,
int ish,
int isw,
at::Tensor& input2,
int gsb,
int gsc,
int gsh,
int gsw,
at::Tensor& gradInput1,
int gisb,
int gisc,
int gish,
int gisw,
at::Tensor& gradInput2,
int ggc,
int ggsb,
int ggsc,
int ggsh,
int ggsw,
at::Tensor& rInput1,
at::Tensor& rInput2,
int pad_size,
int kernel_size,
int max_displacement,
int stride1,
int stride2,
int corr_type_multiply,
cudaStream_t stream);
================================================
FILE: PWCNet/correlation_package_pytorch1_0/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# The extension consists of the C++ binding code and the CUDA kernels; the
# compiler flags for both tool chains come from compiler_args.
correlation_extension = CUDAExtension(
    'correlation_cuda',
    ['correlation_cuda.cc', 'correlation_cuda_kernel.cu'],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='correlation_cuda',
    ext_modules=[correlation_extension],
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: PWCNet/models/PWCNet.py
================================================
"""
implementation of the PWC-DC network for optical flow estimation by Sun et al., 2018
Jinwei Gu and Zhile Ren
"""
import torch
import torch.nn as nn
from torch.autograd import Variable
import os
os.environ['PYTHON_EGG_CACHE'] = 'tmp/' # a writable directory
from correlation_package.modules.corr import Correlation
import numpy as np
__all__ = [
'pwc_dc_net', 'pwc_dc_net_old'
]
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Return a biased 2-D convolution followed by LeakyReLU(0.1) as one Sequential."""
    convolution = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=True,
    )
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(convolution, activation)
def predict_flow(in_planes):
    """Return the 3x3 biased convolution that maps in_planes channels to a 2-channel flow."""
    flow_head = nn.Conv2d(in_planes, 2, kernel_size=3, stride=1, padding=1, bias=True)
    return flow_head
def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
    """Return a biased 2-D transposed convolution used for upsampling."""
    upsampler = nn.ConvTranspose2d(
        in_planes, out_planes, kernel_size, stride, padding, bias=True)
    return upsampler
class PWCDCNet(nn.Module):
"""
PWC-DC net. add dilation convolution and densenet connections
"""
def __init__(self, md=4):
    """
    Create the feature pyramid, the correlation layer, one dense flow
    estimator per pyramid level (6 down to 2) and the dilated-convolution
    context network, then initialise all (transposed) convolutions.

    input: md --- maximum displacement (for correlation. default: 4), after warping
    """
    super(PWCDCNet,self).__init__()

    # Feature pyramid: each level halves the resolution (stride-2 conv) and
    # is followed by two stride-1 convs.
    self.conv1a  = conv(3,   16, kernel_size=3, stride=2)
    self.conv1aa = conv(16,  16, kernel_size=3, stride=1)
    self.conv1b  = conv(16,  16, kernel_size=3, stride=1)
    self.conv2a  = conv(16,  32, kernel_size=3, stride=2)
    self.conv2aa = conv(32,  32, kernel_size=3, stride=1)
    self.conv2b  = conv(32,  32, kernel_size=3, stride=1)
    self.conv3a  = conv(32,  64, kernel_size=3, stride=2)
    self.conv3aa = conv(64,  64, kernel_size=3, stride=1)
    self.conv3b  = conv(64,  64, kernel_size=3, stride=1)
    self.conv4a  = conv(64,  96, kernel_size=3, stride=2)
    self.conv4aa = conv(96,  96, kernel_size=3, stride=1)
    self.conv4b  = conv(96,  96, kernel_size=3, stride=1)
    self.conv5a  = conv(96, 128, kernel_size=3, stride=2)
    self.conv5aa = conv(128,128, kernel_size=3, stride=1)
    self.conv5b  = conv(128,128, kernel_size=3, stride=1)
    # At level 6 the names are swapped relative to the other levels: conv6aa
    # is the stride-2 layer and is applied first in forward().
    self.conv6aa = conv(128,196, kernel_size=3, stride=2)
    self.conv6a  = conv(196,196, kernel_size=3, stride=1)
    self.conv6b  = conv(196,196, kernel_size=3, stride=1)

    self.corr = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
    self.leakyRELU = nn.LeakyReLU(0.1)

    # Number of correlation channels: one per displacement in a
    # (2*md+1) x (2*md+1) window.
    nd = (2*md+1)**2
    # Cumulative channel growth of a dense block: every conv output is
    # concatenated onto its input, so layer k sees od + dd[k-1] channels.
    # Plain Python ints are used because np.int is no longer available in
    # current NumPy releases.
    dd = [int(d) for d in np.cumsum([128, 128, 96, 64, 32])]

    # Level 6: the estimator input is the correlation only.
    od = nd
    self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)
    self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
    self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
    self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
    self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
    self.predict_flow6 = predict_flow(od+dd[4])
    self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
    self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

    # Levels 5..2: correlation + that level's features + 2 upsampled flow
    # channels + 2 upsampled feature channels (hence the "+4").
    od = nd+128+4
    self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)
    self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
    self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
    self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
    self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
    self.predict_flow5 = predict_flow(od+dd[4])
    self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
    self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

    od = nd+96+4
    self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)
    self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
    self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
    self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
    self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
    self.predict_flow4 = predict_flow(od+dd[4])
    self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
    self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

    od = nd+64+4
    self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)
    self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
    self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
    self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
    self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
    self.predict_flow3 = predict_flow(od+dd[4])
    self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
    self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

    od = nd+32+4
    self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)
    self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
    self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
    self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
    self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
    self.predict_flow2 = predict_flow(od+dd[4])
    self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

    # Context network: dilated convolutions with growing dilation on top of
    # the level-2 estimator features, ending in a 2-channel flow output.
    self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
    self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
    self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
    self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
    self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
    self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
    self.dc_conv7 = predict_flow(32)

    # Kaiming (fan-in) initialisation for every convolution and transposed
    # convolution; biases start at zero.
    for m in self.modules():
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
            nn.init.kaiming_normal(m.weight.data, mode='fan_in')
            if m.bias is not None:
                m.bias.data.zero_()
def warp(self, x, flo):
    """
    Warp an image/feature tensor (im2) back to im1 according to the optical flow.

    x: [B, C, H, W] (im2)
    flo: [B, 2, H, W] flow in pixels; channel 0 is added to the column (x)
         coordinate and channel 1 to the row (y) coordinate

    Returns a [B, C, H, W] tensor holding x sampled bilinearly at
    (pixel position + flow). Positions whose sampling footprint is not
    (almost) entirely inside the image are set to zero.
    """
    B, C, H, W = x.size()
    # Mesh grid of pixel coordinates, created on x's own device so the method
    # also works for CPU tensors and for GPUs other than the current one.
    xx = torch.arange(0, W, device=x.device).view(1, -1).repeat(H, 1)
    yy = torch.arange(0, H, device=x.device).view(-1, 1).repeat(1, W)
    xx = xx.view(1, 1, H, W).repeat(B, 1, 1, 1)
    yy = yy.view(1, 1, H, W).repeat(B, 1, 1, 1)
    grid = torch.cat((xx, yy), 1).float()
    vgrid = grid + flo

    # Scale sampling positions to [-1, 1]; pixel 0 maps to -1 and pixel W-1
    # (or H-1) maps to +1, i.e. the "align corners" convention.
    vgrid[:, 0, :, :] = 2.0 * vgrid[:, 0, :, :] / max(W - 1, 1) - 1.0
    vgrid[:, 1, :, :] = 2.0 * vgrid[:, 1, :, :] / max(H - 1, 1) - 1.0
    vgrid = vgrid.permute(0, 2, 3, 1)

    def _sample(source):
        # The normalisation above requires align_corners=True. PyTorch
        # releases that do not know the keyword always behave that way, so
        # fall back to the plain call there.
        try:
            return nn.functional.grid_sample(source, vgrid, align_corners=True)
        except TypeError:
            return nn.functional.grid_sample(source, vgrid)

    output = _sample(x)
    # Warping a tensor of ones yields, per position, the share of the bilinear
    # footprint that fell inside the image; anything below 0.9999 is treated
    # as out of bounds.
    mask = _sample(torch.ones(x.size(), dtype=x.dtype, device=x.device))
    mask[mask < 0.9999] = 0
    mask[mask > 0] = 1
    return output * mask
def forward(self,x):
    """
    Coarse-to-fine flow estimation over a six-level feature pyramid.

    x: tensor whose first three channels are used as image 1 and whose
       remaining channels are used as image 2
    returns: (flow2, flow3, flow4, flow5, flow6) while self.training is set,
             otherwise only flow2
    """
    def dense(feat, layers):
        # each layer's output is stacked in front of the features it consumed
        for layer in layers:
            feat = torch.cat((layer(feat), feat), 1)
        return feat

    def cost_volume(first, second):
        return self.leakyRELU(self.corr(first, second))

    im1 = x[:,:3,:,:]
    im2 = x[:,3:,:,:]

    # shared-weight feature pyramids, level by level for both images
    c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))
    c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))
    c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))
    c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))
    c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))
    c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))
    c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))
    c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))
    c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))
    c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))
    # level 6 applies conv6aa first, then conv6a, then conv6b
    c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))
    c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))

    # level 6: no warping, correlation of the raw features
    x = dense(cost_volume(c16, c26),
              (self.conv6_0, self.conv6_1, self.conv6_2, self.conv6_3, self.conv6_4))
    flow6 = self.predict_flow6(x)
    up_flow6 = self.deconv6(flow6)
    up_feat6 = self.upfeat6(x)

    # level 5: warp image-2 features with the scaled upsampled flow
    warp5 = self.warp(c25, up_flow6*0.625)
    x = torch.cat((cost_volume(c15, warp5), c15, up_flow6, up_feat6), 1)
    x = dense(x, (self.conv5_0, self.conv5_1, self.conv5_2, self.conv5_3, self.conv5_4))
    flow5 = self.predict_flow5(x)
    up_flow5 = self.deconv5(flow5)
    up_feat5 = self.upfeat5(x)

    # level 4
    warp4 = self.warp(c24, up_flow5*1.25)
    x = torch.cat((cost_volume(c14, warp4), c14, up_flow5, up_feat5), 1)
    x = dense(x, (self.conv4_0, self.conv4_1, self.conv4_2, self.conv4_3, self.conv4_4))
    flow4 = self.predict_flow4(x)
    up_flow4 = self.deconv4(flow4)
    up_feat4 = self.upfeat4(x)

    # level 3
    warp3 = self.warp(c23, up_flow4*2.5)
    x = torch.cat((cost_volume(c13, warp3), c13, up_flow4, up_feat4), 1)
    x = dense(x, (self.conv3_0, self.conv3_1, self.conv3_2, self.conv3_3, self.conv3_4))
    flow3 = self.predict_flow3(x)
    up_flow3 = self.deconv3(flow3)
    up_feat3 = self.upfeat3(x)

    # level 2
    warp2 = self.warp(c22, up_flow3*5.0)
    x = torch.cat((cost_volume(c12, warp2), c12, up_flow3, up_feat3), 1)
    x = dense(x, (self.conv2_0, self.conv2_1, self.conv2_2, self.conv2_3, self.conv2_4))
    flow2 = self.predict_flow2(x)

    # dc_conv chain: its output is added to flow2 as a residual correction
    context = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
    flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(context)))

    if not self.training:
        return flow2
    return flow2,flow3,flow4,flow5,flow6
class PWCDCNet_old(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    Older variant: each pyramid level uses two convolutions (a, b) and the
    dense blocks concatenate features in a fixed, level-independent but
    irregular order (see _dense_block).
    """
    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warpping
        """
        super(PWCDCNet_old,self).__init__()

        # feature pyramid extractor
        self.conv1a = conv(3, 16, kernel_size=3, stride=2)
        self.conv1b = conv(16, 16, kernel_size=3, stride=1)
        self.conv2a = conv(16, 32, kernel_size=3, stride=2)
        self.conv2b = conv(32, 32, kernel_size=3, stride=1)
        self.conv3a = conv(32, 64, kernel_size=3, stride=2)
        self.conv3b = conv(64, 64, kernel_size=3, stride=1)
        self.conv4a = conv(64, 96, kernel_size=3, stride=2)
        self.conv4b = conv(96, 96, kernel_size=3, stride=1)
        self.conv5a = conv(96, 128, kernel_size=3, stride=2)
        self.conv5b = conv(128,128, kernel_size=3, stride=1)
        self.conv6a = conv(128,196, kernel_size=3, stride=2)
        self.conv6b = conv(196,196, kernel_size=3, stride=1)

        self.corr = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        nd = (2*md+1)**2                      # correlation output channels
        dd = np.cumsum([128,128,96,64,32])    # channels accumulated by the dense block

        # level 6 estimator: correlation only
        od = nd
        self.conv6_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # levels 5..2: correlation + level features + upsampled flow (2) + upsampled feature (2)
        od = nd+128+4
        self.conv5_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od, 128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1],96, kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2],64, kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3],32, kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # dilated-convolution chain applied to the level-2 features
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1, dilation=1)
        self.dc_conv2 = conv(128, 128, kernel_size=3, stride=1, padding=2, dilation=2)
        self.dc_conv3 = conv(128, 128, kernel_size=3, stride=1, padding=4, dilation=4)
        self.dc_conv4 = conv(128, 96, kernel_size=3, stride=1, padding=8, dilation=8)
        self.dc_conv5 = conv(96, 64, kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64, 32, kernel_size=3, stride=1, padding=1, dilation=1)
        self.dc_conv7 = predict_flow(32)

        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                # in-place initialiser; the un-suffixed name is deprecated
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()

    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow; channel 0 is added to the column (x) coordinate,
             channel 1 to the row (y) coordinate, both in pixels
        returns: [B, C, H, W] bilinearly sampled tensor; every position whose
                 sample footprint is not (almost) fully inside x is zeroed
        """
        B, C, H, W = x.size()

        # mesh grid built on x's own device (works on CPU and on any GPU index)
        xx = torch.arange(0, W, device=x.device).view(1, 1, 1, W).expand(B, 1, H, W)
        yy = torch.arange(0, H, device=x.device).view(1, 1, H, 1).expand(B, 1, H, W)
        grid = torch.cat((xx, yy), 1).to(flo.dtype)
        vgrid = grid + flo

        # scale grid to [-1,1]; (size - 1) denominators are the
        # align_corners=True convention
        norm_x = 2.0 * vgrid[:, 0, :, :] / max(W - 1, 1) - 1.0
        norm_y = 2.0 * vgrid[:, 1, :, :] / max(H - 1, 1) - 1.0
        vgrid = torch.stack((norm_x, norm_y), dim=3)

        # mask lives next to x; an unconditional `.cuda()` crashed on CPU
        ones = torch.ones_like(x)
        try:
            # PyTorch >= 1.3 defaults to align_corners=False; ask for the
            # behaviour the normalisation above was written for.
            output = nn.functional.grid_sample(x, vgrid, align_corners=True)
            mask = nn.functional.grid_sample(ones, vgrid, align_corners=True)
        except TypeError:
            # keyword unknown to older PyTorch, which already behaves that way
            output = nn.functional.grid_sample(x, vgrid)
            mask = nn.functional.grid_sample(ones, vgrid)

        mask = (mask >= 0.999).to(output.dtype)
        return output * mask

    def _dense_block(self, x, layers):
        """Grow x through layers, reproducing this class's concatenation order.

        The second layer's output is placed in front of its input; every other
        layer's output is appended behind it. The order fixes the channel
        layout the following layers see, so it is kept exactly as written.
        """
        for idx, layer in enumerate(layers):
            grown = layer(x)
            if idx == 1:
                x = torch.cat((grown, x), 1)
            else:
                x = torch.cat((x, grown), 1)
        return x

    def forward(self,x):
        """
        x: tensor whose first three channels are used as image 1 and whose
           remaining channels are used as image 2
        returns: (flow2, flow3, flow4, flow5, flow6) while self.training is
                 set, otherwise only flow2
        """
        im1 = x[:,:3,:,:]
        im2 = x[:,3:,:,:]

        c11 = self.conv1b(self.conv1a(im1))
        c21 = self.conv1b(self.conv1a(im2))
        c12 = self.conv2b(self.conv2a(c11))
        c22 = self.conv2b(self.conv2a(c21))
        c13 = self.conv3b(self.conv3a(c12))
        c23 = self.conv3b(self.conv3a(c22))
        c14 = self.conv4b(self.conv4a(c13))
        c24 = self.conv4b(self.conv4a(c23))
        c15 = self.conv5b(self.conv5a(c14))
        c25 = self.conv5b(self.conv5a(c24))
        c16 = self.conv6b(self.conv6a(c15))
        c26 = self.conv6b(self.conv6a(c25))

        corr6 = self.corr(c16, c26)
        corr6 = self.leakyRELU(corr6)
        x = self._dense_block(corr6, (self.conv6_0, self.conv6_1, self.conv6_2, self.conv6_3, self.conv6_4))
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.corr(c15, warp5)
        corr5 = self.leakyRELU(corr5)
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = self._dense_block(x, (self.conv5_0, self.conv5_1, self.conv5_2, self.conv5_3, self.conv5_4))
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.corr(c14, warp4)
        corr4 = self.leakyRELU(corr4)
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = self._dense_block(x, (self.conv4_0, self.conv4_1, self.conv4_2, self.conv4_3, self.conv4_4))
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.corr(c13, warp3)
        corr3 = self.leakyRELU(corr3)
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = self._dense_block(x, (self.conv3_0, self.conv3_1, self.conv3_2, self.conv3_3, self.conv3_4))
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.corr(c12, warp2)
        corr2 = self.leakyRELU(corr2)
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = self._dense_block(x, (self.conv2_0, self.conv2_1, self.conv2_2, self.conv2_3, self.conv2_4))
        flow2 = self.predict_flow2(x)

        # dilated chain output is added to flow2 as a residual correction
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        if self.training:
            return flow2,flow3,flow4,flow5,flow6
        else:
            return flow2
def pwc_dc_net(path=None):
    """Create a PWCDCNet and, when path is given, restore its weights.

    path: file readable by torch.load. The loaded object is either the state
          dict itself or a mapping that stores it under 'state_dict'.
          None leaves the freshly initialised weights untouched.
    """
    net = PWCDCNet()
    if path is None:
        return net

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(path)
    weights = checkpoint['state_dict'] if 'state_dict' in checkpoint.keys() else checkpoint
    net.load_state_dict(weights)
    return net
def pwc_dc_net_old(path=None):
    """Create a PWCDCNet_old and, when path is given, restore its weights.

    path: file readable by torch.load. The loaded object is either the state
          dict itself or a mapping that stores it under 'state_dict'.
          None leaves the freshly initialised weights untouched.
    """
    net = PWCDCNet_old()
    if path is None:
        return net

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(path)
    weights = checkpoint['state_dict'] if 'state_dict' in checkpoint.keys() else checkpoint
    net.load_state_dict(weights)
    return net
================================================
FILE: PWCNet/models/__init__.py
================================================
from .PWCNet import *
================================================
FILE: README.md
================================================
# DAIN (Depth-Aware Video Frame Interpolation)
[Project](https://sites.google.com/view/wenbobao/dain) **|** [Paper](http://arxiv.org/abs/1904.00830)
[Wenbo Bao](https://sites.google.com/view/wenbobao/home),
[Wei-Sheng Lai](http://graduatestudents.ucmerced.edu/wlai24/),
[Chao Ma](https://sites.google.com/site/chaoma99/),
Xiaoyun Zhang,
Zhiyong Gao,
and [Ming-Hsuan Yang](http://faculty.ucmerced.edu/mhyang/)
IEEE Conference on Computer Vision and Pattern Recognition, Long Beach, CVPR 2019
This work is developed based on our TPAMI work [MEMC-Net](https://github.com/baowenbo/MEMC-Net), where we propose the adaptive warping layer. Please also consider referring to it.
### Table of Contents
1. [Introduction](#introduction)
1. [Citation](#citation)
1. [Requirements and Dependencies](#requirements-and-dependencies)
1. [Installation](#installation)
1. [Testing Pre-trained Models](#testing-pre-trained-models)
1. [Downloading Results](#downloading-results)
1. [Slow-motion Generation](#slow-motion-generation)
1. [Training New Models](#training-new-models)
1. [Google Colab Demo](#google-colab-demo)
### Introduction
We propose the **D**epth-**A**ware video frame **IN**terpolation (**DAIN**) model to explicitly detect the occlusion by exploring the depth cue.
We develop a depth-aware flow projection layer to synthesize intermediate flows that preferably sample closer objects than farther ones.
Our method achieves state-of-the-art performance on the Middlebury dataset.
We provide videos [here](https://www.youtube.com/watch?v=-f8f0igQi5I&t=5s).
### Citation
If you find the code and datasets useful in your research, please cite:
@inproceedings{DAIN,
author = {Bao, Wenbo and Lai, Wei-Sheng and Ma, Chao and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan},
title = {Depth-Aware Video Frame Interpolation},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2019}
}
@article{MEMC-Net,
title={MEMC-Net: Motion Estimation and Motion Compensation Driven Neural Network for Video Interpolation and Enhancement},
author={Bao, Wenbo and Lai, Wei-Sheng and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
doi={10.1109/TPAMI.2019.2941941},
year={2018}
}
### Requirements and Dependencies
- Ubuntu (We test with Ubuntu = 16.04.5 LTS)
- Python (We test with Python = 3.6.8 in Anaconda3 = 4.1.1)
- Cuda & Cudnn (We test with Cuda = 9.0 and Cudnn = 7.0)
- PyTorch (The customized depth-aware flow projection and other layers require ATen API in PyTorch = 1.0.0)
- GCC (Compiling PyTorch 1.0.0 extension files (.c/.cu) requires gcc = 4.9.1 and nvcc = 9.0 compilers)
- NVIDIA GPU (We use Titan X (Pascal) with compute = 6.1, but we support compute_50/52/60/61 devices, should you have devices with higher compute capability, please revise [this](https://github.com/baowenbo/DAIN/blob/master/my_package/DepthFlowProjection/setup.py))
### Installation
Download repository:
$ git clone https://github.com/baowenbo/DAIN.git
Before building Pytorch extensions, be sure you have `pytorch >= 1.0.0`:
$ python -c "import torch; print(torch.__version__)"
Generate our PyTorch extensions:
$ cd DAIN
$ cd my_package
$ ./build.sh
Generate the Correlation package required by [PWCNet](https://github.com/NVlabs/PWC-Net/tree/master/PyTorch/external_packages/correlation-pytorch-master):
$ cd ../PWCNet/correlation_package_pytorch1_0
$ ./build.sh
### Testing Pre-trained Models
Make model weights dir and Middlebury dataset dir:
$ cd DAIN
$ mkdir model_weights
$ mkdir MiddleBurySet
Download pretrained models,
$ cd model_weights
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth
and Middlebury dataset:
$ cd ../MiddleBurySet
$ wget http://vision.middlebury.edu/flow/data/comp/zip/other-color-allframes.zip
$ unzip other-color-allframes.zip
$ wget http://vision.middlebury.edu/flow/data/comp/zip/other-gt-interp.zip
$ unzip other-gt-interp.zip
$ cd ..
preinstallations:
$ cd PWCNet/correlation_package_pytorch1_0
$ sh build.sh
$ cd ../../my_package
$ sh build.sh
$ cd ..
We are good to go by:
$ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py
The interpolated results are under `MiddleBurySet/other-result-author/[random number]/`, where the `random number` is used to distinguish different runs.
### Downloading Results
Our DAIN model achieves the state-of-the-art performance on the UCF101, Vimeo90K, and Middlebury ([*eval*](http://vision.middlebury.edu/flow/eval/results/results-n1.php) and *other*).
Download our interpolated results with:
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/UCF101_DAIN.zip
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Vimeo90K_interp_DAIN.zip
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_eval_DAIN.zip
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_other_DAIN.zip
### Slow-motion Generation
Our model is fully capable of generating slow-motion effect with minor modification on the network architecture.
Run the following code by specifying `time_step = 0.25` to generate x4 slow-motion effect:
$ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.25
or set `time_step` to `0.125` or `0.1` as follows
$ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.125
$ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.1
to generate x8 and x10 slow-motion respectively. Or, for a little fun, generate x100 slow-motion:
$ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.01
You may also want to create gif animations by:
$ cd MiddleBurySet/other-result-author/[random number]/Beanbags
$ convert -delay 1 *.png -loop 0 Beanbags.gif //1*10ms delay
Have fun and enjoy yourself!
### Training New Models
Download the Vimeo90K triplet dataset for video frame interpolation task, also see [here](https://github.com/anchen1011/toflow/blob/master/download_dataset.sh) by [Xue et al., IJCV19](https://arxiv.org/abs/1711.09078).
$ cd DAIN
$ mkdir /path/to/your/dataset && cd /path/to/your/dataset
$ wget http://data.csail.mit.edu/tofu/dataset/vimeo_triplet.zip
$ unzip vimeo_triplet.zip
$ rm vimeo_triplet.zip
Download the pretrained MegaDepth and PWCNet models
$ cd MegaDepth/checkpoints/test_local
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best_generalization_net_G.pth
$ cd ../../../PWCNet
$ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/pwc_net.pth.tar
$ cd ..
Run the training script:
$ CUDA_VISIBLE_DEVICES=0 python train.py --datasetPath /path/to/your/dataset --batch_size 1 --save_which 1 --lr 0.0005 --rectify_lr 0.0005 --flow_lr_coe 0.01 --occ_lr_coe 0.0 --filter_lr_coe 1.0 --ctx_lr_coe 1.0 --alpha 0.0 1.0 --patience 4 --factor 0.2
The optimized models will be saved to the `model_weights/[random number]` directory, where [random number] is generated for different runs.
Replace the pre-trained `model_weights/best.pth` model with the newly trained `model_weights/[random number]/best.pth` model.
Then test the new model by executing:
$ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py
### Google Colab Demo
This is a modification of DAIN that allows the usage of Google Colab and is able to do a full demo interpolation from a source video to a target video.
Original Notebook File by btahir can be found [here](https://github.com/baowenbo/DAIN/issues/44).
To use the Colab, follow these steps:
- Download the `Colab_DAIN.ipynb` file ([link](https://raw.githubusercontent.com/baowenbo/DAIN/master/Colab_DAIN.ipynb)).
- Visit Google Colaboratory ([link](https://colab.research.google.com/))
- Select the "Upload" option, and upload the `.ipynb` file
- Start running the cells one by one, following the instructions.
Colab file authors: [Styler00Dollar](https://github.com/styler00dollar) and [Alpha](https://github.com/AlphaGit).
### Contact
[Wenbo Bao](mailto:bwb0813@gmail.com); [Wei-Sheng (Jason) Lai](mailto:phoenix104104@gmail.com)
### License
See [MIT License](https://github.com/baowenbo/DAIN/blob/master/LICENSE)
================================================
FILE: Resblock/BasicBlock.py
================================================
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch.nn.init as weight_init
import torch
# Names picked up by `from .BasicBlock import *`. BasicBlock and conv3x3 are
# not listed, so star-imports of this module do not re-export them.
__all__ = ['MultipleBasicBlock','MultipleBasicBlock_4']
def conv3x3(in_planes, out_planes, dilation = 1, stride=1):
    "Bias-free 3x3 convolution whose padding grows with the dilation"
    pad = int(dilation * (3 - 1) / 2)
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=pad,
        dilation=dilation,
        bias=False,
    )
class BasicBlock(nn.Module):
    """Two 3x3 convolutions with a skip connection and no normalisation layers.

    Only the first convolution receives the dilation and stride; the second
    one always uses the conv3x3 defaults.
    """
    expansion = 1

    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, dilation, stride)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.downsample = downsample
        self.stride = stride

        # Re-initialise every convolution reachable from this block (including
        # any inside `downsample`) with N(0, sqrt(2 / (kh * kw * out_channels))).
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                kernel_h, kernel_w = layer.kernel_size
                fan = kernel_h * kernel_w * layer.out_channels
                layer.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()

    def forward(self, x):
        """Return relu(conv2(relu(conv1(x))) + shortcut(x))."""
        out = self.conv2(self.relu(self.conv1(x)))
        # the shortcut is x itself unless a downsample module was supplied
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class MultipleBasicBlock(nn.Module):
    """7x7 stem, up to three `block` stages and a 3x3 projection to 3 channels.

    input_feature: channel count of the input tensor
    block: callable `block(in_channels, out_channels, dilation=...)` returning
           the module used for stages 2-4
    num_blocks: how many stages to build; stage k (k = 2, 3, 4) exists when
                num_blocks >= k
    intermediate_feature: channel width between the stem and the projection
    dense: stored on the instance; not consulted by this class
    """
    def __init__(self,input_feature,
                 block, num_blocks,
                 intermediate_feature = 64, dense = True):
        super(MultipleBasicBlock, self).__init__()
        self.dense = dense
        self.num_block = num_blocks
        self.intermediate_feature = intermediate_feature

        self.block1= nn.Sequential(*[
            nn.Conv2d(input_feature, intermediate_feature,
                      kernel_size=7, stride=1, padding=3, bias=True),
            nn.ReLU(inplace=True)
        ])

        self.block2 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=2 else None
        self.block3 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=3 else None
        self.block4 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=4 else None
        self.block5 = nn.Sequential(*[nn.Conv2d(intermediate_feature, 3 , (3, 3), 1, (1, 1))])

        # weights ~ N(0, sqrt(2 / (kh * kw * out_channels))) for every convolution
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x):
        """Run the stem, every stage that was built, then the projection."""
        x = self.block1(x)
        # Apply exactly the stages __init__ created. Testing `num_block == 4`
        # for the last stage skipped block4 whenever num_blocks exceeded 4,
        # although the stage (and its parameters) had been constructed.
        for stage in (self.block2, self.block3, self.block4):
            if stage is not None:
                x = stage(x)
        return self.block5(x)
def MultipleBasicBlock_4(input_feature,intermediate_feature = 64):
    """Build a MultipleBasicBlock with num_blocks=4 using BasicBlock stages."""
    return MultipleBasicBlock(input_feature, BasicBlock, 4, intermediate_feature)
if __name__ == '__main__':
    # Construction-only smoke check: both module types must build without error.
    model = MultipleBasicBlock(input_feature=200, block=BasicBlock, num_blocks=4)
    model = BasicBlock(inplanes=64, planes=64, dilation=1)
    exit(0)
================================================
FILE: Resblock/__init__.py
================================================
from .BasicBlock import *
================================================
FILE: S2D_models/S2DF.py
================================================
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch
# __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
# 'resnet152','resnet18_conv1']
# Names picked up by star-imports of this module. Bottleneck, S2DFsim,
# conv3x3 and model_urls are defined below but not exported.
__all__ = ['S2DF','S2DF_3dense','S2DF_3dense_nodilation',
'S2DF_3last','S2DF_2dense', 'BasicBlock']
# Download URLs for ResNet checkpoints on download.pytorch.org, keyed by
# architecture name. Nothing in this module reads the table.
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, dilation = 1, stride=1):
    "3x3 convolution, no bias, padded by int(dilation * (3 - 1) / 2)"
    options = dict(
        kernel_size=3,
        stride=stride,
        padding=int(dilation * (3 - 1) / 2),
        dilation=dilation,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **options)
class BasicBlock(nn.Module):
    """Residual unit made of two 3x3 convolutions; no normalisation layers.

    The dilation and stride arguments affect the first convolution only.
    """
    expansion = 1

    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, dilation, stride)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return relu(conv2(relu(conv1(x))) + shortcut(x))."""
        out = self.conv2(self.relu(self.conv1(x)))
        # identity shortcut unless a downsample module was supplied
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual unit without normalisation layers.

    The last convolution outputs planes * 4 channels; stride and dilation are
    applied by the middle 3x3 convolution.
    """
    expansion = 4

    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        pad = int(dilation * (3 - 1) / 2)
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=pad, dilation=dilation, bias=False)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return relu(conv3(relu(conv2(relu(conv1(x))))) + shortcut(x))."""
        out = self.relu(self.conv1(x))
        out = self.relu(self.conv2(out))
        out = self.conv3(out)
        # identity shortcut unless a downsample module was supplied
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class S2DF(nn.Module):
    """Feature extractor that returns the input concatenated with stage outputs.

    block: callable `block(in_channels, out_channels, dilation=...)` used for
           stages 2-4
    num_blocks: number of stages, must lie in 1..4 (asserted)
    dense: when true every stage output is kept, otherwise only the last one
    dilation: when true stages 2/3/4 are built with dilation 4/8/16, else 1
    """
    def __init__(self, block, num_blocks,dense = True,dilation=True):
        super(S2DF, self).__init__()
        self.inplanes = 64
        self.dense = dense
        self.num_block = num_blocks
        assert(num_blocks>=1 and num_blocks<=4)

        self.block1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),
            nn.ReLU(inplace=True),
        )
        self.dilation = dilation

        # stages 2..4 are created in order; missing ones are stored as None
        for stage_no, rate in ((2, 4), (3, 8), (4, 16)):
            stage = None
            if num_blocks >= stage_no:
                stage = block(self.inplanes, 64, dilation=rate if dilation else 1)
            setattr(self, 'block%d' % stage_no, stage)

        # weights ~ N(0, sqrt(2 / (kh * kw * out_channels))) for every convolution
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                kernel_h, kernel_w = layer.kernel_size
                fan = kernel_h * kernel_w * layer.out_channels
                layer.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()

    def forward(self, x):
        """Concatenate, along dim 1, the raw input and the kept stage outputs."""
        collected = [x]  # raw feature
        stages = (self.block1, self.block2, self.block3, self.block4)
        # num_block is within 1..4 (checked in __init__), so the slice covers
        # exactly the stages that were built
        for depth, stage in enumerate(stages[:self.num_block], start=1):
            x = stage(x)
            if self.dense or depth == self.num_block:
                collected.append(x)
        return torch.cat(collected, dim=1)
class S2DFsim(nn.Module):
    """S2DF variant whose stages are plain conv-ReLU-conv stacks.

    block: accepted for signature parity; not used by this class
    num_blocks: number of stages, must lie in 1..4 (asserted)
    dense: when true every stage output is kept, otherwise only the last one
    dilation: stored on the instance; all convolutions here are undilated
    """
    def __init__(self, block, num_blocks,dense = True,dilation=True):
        super(S2DFsim, self).__init__()
        self.inplanes = 64
        self.dense = dense
        self.num_block = num_blocks
        assert(num_blocks>=1 and num_blocks<=4)

        def conv_pair():
            # 3x3 conv -> ReLU -> 3x3 conv, 64 channels throughout, no bias
            return nn.Sequential(
                nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
                nn.ReLU(inplace=True),
                nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
            )

        self.block1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
        )
        self.dilation = dilation

        # layers keep the framework's default initialisation
        self.block2 = conv_pair() if num_blocks >= 2 else None
        self.block3 = conv_pair() if num_blocks >= 3 else None
        self.block4 = conv_pair() if num_blocks >= 4 else None

    def forward(self, x):
        """Concatenate, along dim 1, the raw input and the kept stage outputs."""
        collected = [x]  # raw feature
        stages = (self.block1, self.block2, self.block3, self.block4)
        # num_block is within 1..4 (checked in __init__), so the slice covers
        # exactly the stages that were built
        for depth, stage in enumerate(stages[:self.num_block], start=1):
            x = stage(x)
            if self.dense or depth == self.num_block:
                collected.append(x)
        return torch.cat(collected, dim=1)
def S2DF_3dense_nodilation():
    """Build a three-stage, dense S2DFsim with dilation switched off."""
    return S2DFsim(None, 3, dense=True, dilation=False)
def S2DF_3dense():
    """Build a three-stage S2DF of BasicBlocks that keeps every stage output."""
    return S2DF(BasicBlock, 3, dense=True)
def S2DF_3last():
    """Build a three-stage S2DF of BasicBlocks that keeps only the last stage output."""
    return S2DF(BasicBlock, 3, dense=False)
def S2DF_2dense():
    """Build a two-stage S2DF of BasicBlocks that keeps every stage output."""
    return S2DF(BasicBlock, 2, dense=True)
from torch.autograd import Variable
if __name__ == '__main__':
    # Smoke run: push one random batch through a four-stage, non-dense S2DF.
    x = Variable(torch.randn(2, 3, 224, 448))
    model = S2DF(block=BasicBlock, num_blocks=4, dense=False)
    y = model(x)
    exit(0)
================================================
FILE: S2D_models/__init__.py
================================================
from .S2DF import *
================================================
FILE: Stack.py
================================================
class Stack:
    """Last-in, first-out container backed by the list `self.stack`.

    Reading from an empty stack yields None instead of raising.
    """

    def __init__(self):
        self.stack = []

    def pop(self):
        """Remove and return the top item, or None when empty."""
        if self.is_empty():
            return None
        return self.stack.pop()

    def push(self,val):
        """Put val on top; returns None."""
        self.stack.append(val)

    def peak(self):
        """Return the top item without removing it, or None when empty."""
        return None if self.is_empty() else self.stack[-1]

    def size(self):
        """Number of stored items."""
        return len(self.stack)

    def is_empty(self):
        """True when nothing is stored."""
        return not self.size()
================================================
FILE: balancedsampler.py
================================================
from torch.utils.data.sampler import Sampler
import torch
class RandomBalancedSampler(Sampler):
    """Endless random sampler whose reported length is decoupled from the dataset.

    Indices come from a random permutation of the whole dataset; a new
    permutation is drawn each time the internal cursor is back at 0, so every
    element is visited once per cycle. The iterator never stops by itself.

    Arguments:
        data_source (Dataset): dataset to sample from
        epoch_size (int): reported length is min(dataset size, epoch_size);
            a value <= 0 means "use the dataset size"
    """

    def __init__(self, data_source, epoch_size):
        self.data_size = len(data_source)
        self.epoch_size = epoch_size
        self.index = 0

    def __next__(self):
        if not self.index:
            # cursor at the start of a cycle: re-shuffle the sampler
            self.indices = torch.randperm(self.data_size)
        # the cursor advances before the read, so position 0 is served last
        position = (self.index + 1) % self.data_size
        self.index = position
        return self.indices[position]

    def next(self):
        return self.__next__()

    def __iter__(self):
        return self

    def __len__(self):
        if self.epoch_size > 0:
            return min(self.data_size, self.epoch_size)
        return self.data_size
class SequentialBalancedSampler(Sampler):
    """Endless sequential sampler whose reported length is decoupled from the dataset.

    Indices are produced in increasing order and wrap around at the dataset
    size. The cursor advances before it is returned, so the sequence starts at
    1 and index 0 closes each cycle. The iterator never stops by itself.

    Arguments:
        data_source (Dataset): dataset to sample from
        epoch_size (int): reported length is min(dataset size, epoch_size);
            a value <= 0 means "use the dataset size"
    """

    def __init__(self, data_source, epoch_size):
        self.data_size = len(data_source)
        self.epoch_size = epoch_size
        self.index = 0

    def __next__(self):
        advanced = (self.index + 1) % self.data_size
        self.index = advanced
        return advanced

    def next(self):
        return self.__next__()

    def __iter__(self):
        return self

    def __len__(self):
        if self.epoch_size > 0:
            return min(self.data_size, self.epoch_size)
        return self.data_size
================================================
FILE: colab_interpolate.py
================================================
import time
import os
from torch.autograd import Variable
import torch
import numpy as np
import numpy
import networks
from my_args import args
from imageio import imread, imsave
from AverageMeter import *
import shutil
import datetime
# Enable the cuDNN auto-tuner.
torch.backends.cudnn.benchmark = True


def _padding_for(size):
    """Return the (before, after) replication padding for one image dimension.

    A size that is not a multiple of 128 is padded up to the next multiple of
    128, split as evenly as possible; a size that already is a multiple of 128
    gets a fixed 32 pixels on each side.
    """
    if size != ((size >> 7) << 7):
        padded = ((size >> 7) + 1) << 7  # more than necessary
        before = (padded - size) // 2
        return before, padded - size - before
    return 32, 32


model = networks.__dict__[args.netName](
    channel=args.channels,
    filter_size=args.filter_size,
    timestep=args.time_step,
    training=False)

if args.use_cuda:
    model = model.cuda()

model_path = './model_weights/best.pth'
if not os.path.exists(model_path):
    print("*****************************************************************")
    print("**** We couldn't load any trained weights ***********************")
    print("*****************************************************************")
    # SystemExit works even when the interactive `exit` helper is not installed.
    raise SystemExit(1)

if args.use_cuda:
    pretrained_dict = torch.load(model_path)
else:
    # Remap storages to the CPU when CUDA is not used.
    pretrained_dict = torch.load(model_path, map_location=lambda storage, loc: storage)

model_dict = model.state_dict()
# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
# 3. load the new state dict
model.load_state_dict(model_dict)
# 4. release the pretrained dict for saving memory
pretrained_dict = []

model = model.eval()  # deploy mode

frames_dir = args.frame_input_dir
output_dir = args.frame_output_dir
# Make sure the destination exists before the first frame is written.
os.makedirs(output_dir, exist_ok=True)

timestep = args.time_step
time_offsets = [kk * timestep for kk in range(1, int(1.0 / timestep))]

input_frame = args.start_frame - 1
loop_timer = AverageMeter()

final_frame = args.end_frame

torch.set_grad_enabled(False)

# we want to have input_frame between (start_frame-1) and (end_frame-2)
# this is because at each step we read (frame) and (frame+1)
# so the last iteration will actually be (end_frame-1) and (end_frame)
while input_frame < final_frame - 1:
    input_frame += 1

    start_time = time.time()

    filename_frame_1 = os.path.join(frames_dir, f'{input_frame:0>5d}.png')
    filename_frame_2 = os.path.join(frames_dir, f'{input_frame+1:0>5d}.png')

    # HWC image -> CHW float tensor scaled by 1/255.
    X0 = torch.from_numpy(np.transpose(imread(filename_frame_1), (2, 0, 1)).astype("float32") / 255.0).type(args.dtype)
    X1 = torch.from_numpy(np.transpose(imread(filename_frame_2), (2, 0, 1)).astype("float32") / 255.0).type(args.dtype)

    assert (X0.size(1) == X1.size(1))
    assert (X0.size(2) == X1.size(2))

    intWidth = X0.size(2)
    intHeight = X0.size(1)
    channels = X0.size(0)
    if not channels == 3:
        print(f"Skipping {filename_frame_1}-{filename_frame_2} -- expected 3 color channels but found {channels}.")
        continue

    intPaddingLeft, intPaddingRight = _padding_for(intWidth)
    intPaddingTop, intPaddingBottom = _padding_for(intHeight)
    pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

    X0 = pader(Variable(torch.unsqueeze(X0, 0)))
    X1 = pader(Variable(torch.unsqueeze(X1, 0)))

    if args.use_cuda:
        X0 = X0.cuda()
        X1 = X1.cuda()

    # Only the interpolated frames are needed here; the offset and filter
    # outputs of the model are not used by this script.
    y_s, _, _ = model(torch.stack((X0, X1), dim=0))
    y_ = y_s[args.save_which]

    # Normalise to a list so a single-tensor output is handled like a
    # one-element list of interpolated frames.
    if not isinstance(y_, (list, tuple)):
        y_ = [y_]

    # .cpu() is a no-op for CPU tensors, so one code path serves both devices.
    # Each frame is clipped to [0, 1], un-padded and converted back to HWC.
    frames = [
        np.transpose(
            255.0 * item.data.cpu().numpy().clip(0, 1.0)[
                0, :,
                intPaddingTop:intPaddingTop + intHeight,
                intPaddingLeft:intPaddingLeft + intWidth],
            (1, 2, 0))
        for item in y_
    ]

    # Output names are <input frame, 5 digits><sub-frame, 3 digits>.png;
    # sub-frame 000 is the untouched input frame.
    shutil.copy(filename_frame_1, os.path.join(output_dir, f"{input_frame:0>5d}{0:0>3d}.png"))
    for interpolated_frame_number, (item, _) in enumerate(zip(frames, time_offsets), start=1):
        output_frame_file_path = os.path.join(output_dir, f"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png")
        imsave(output_frame_file_path, np.round(item).astype(numpy.uint8))

    end_time = time.time()
    loop_timer.update(end_time - start_time)

    # Pairs still to process: the loop runs while input_frame < final_frame - 1.
    frames_left = final_frame - 1 - input_frame
    estimated_seconds_left = frames_left * loop_timer.avg
    estimated_time_left = datetime.timedelta(seconds=estimated_seconds_left)
    print(f"****** Processed frame {input_frame} | Time per frame (avg): {loop_timer.avg:2.2f}s | Time left: {estimated_time_left} ******************" )

# Copying last frame
last_frame_filename = os.path.join(frames_dir, str(final_frame).zfill(5) + '.png')
shutil.copy(last_frame_filename, os.path.join(output_dir, f"{final_frame:0>5d}{0:0>3d}.png"))

print("Finished processing images.")
================================================
FILE: datasets/Vimeo_90K_interp.py
================================================
import os.path
import random
# import glob
import math
from .listdatasets import ListDataset,Vimeo_90K_loader
def make_dataset(root, list_file):
    """Read a sample list file and return its entries in random order.

    Args:
        root: directory containing the list file.
        list_file: file name of the list, relative to ``root``; one entry per line.

    Returns:
        A shuffled list of the lines of the file, with the final line
        always dropped.

    Raises:
        AssertionError: if no entry is left after dropping the final line.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(root, list_file)) as list_handle:
        raw_im_list = list_handle.read().splitlines()
    # the last line is invalid in test set.
    raw_im_list = raw_im_list[:-1]
    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if not raw_im_list:
        raise AssertionError("no samples found in " + os.path.join(root, list_file))
    random.shuffle(raw_im_list)
    return raw_im_list
def Vimeo_90K_interp(root, split=1.0, single=False, task = 'interp' ):
    """Build the (train, test) datasets from the triplet list files under ``root``.

    ``tri_trainlist.txt`` and ``tri_testlist.txt`` are read (in that order)
    through ``make_dataset`` and each list is wrapped in a ``ListDataset``
    using ``Vimeo_90K_loader``. ``split``, ``single`` and ``task`` are
    accepted but not used by this function.
    """
    train_entries, test_entries = (
        make_dataset(root, list_name)
        for list_name in ("tri_trainlist.txt", "tri_testlist.txt")
    )
    train_dataset, test_dataset = (
        ListDataset(root, entries, loader=Vimeo_90K_loader)
        for entries in (train_entries, test_entries)
    )
    return train_dataset, test_dataset
================================================
FILE: datasets/__init__.py
================================================
from .Vimeo_90K_interp import Vimeo_90K_interp
# Names exported by ``from datasets import *``.
__all__ = (
    'Vimeo_90K_interp',
)
# Vimeo_90K = "/tmp4/wenbobao_data/vimeo_triplet"
================================================
FILE: datasets/listdatasets.py
================================================
import torch.utils.data as data
import os
import os.path
from scipy.ndimage import imread
import numpy as np
import random
def Vimeo_90K_loader(root, im_path, input_frame_size = (3, 256, 448), output_frame_size = (3, 256, 448), data_aug = True):
    """Load one im1/im2/im3 triplet and return it as three float32 CHW arrays.

    Args:
        root: dataset root; images are read from ``root/sequences/im_path``.
        im_path: relative path of the triplet directory.
        input_frame_size: (C, H, W); H and W give the size of the random crop.
            Crop offsets are drawn assuming 256x448 source frames.
        output_frame_size: accepted but not used by this function.
        data_aug: when true, randomly reverses the temporal order and randomly
            flips the triplet left-right and up-down.

    Returns:
        ``(X0, X2, y)``: the two outer frames and the middle frame, each
        transposed to channel-first, cast to float32 and divided by 255.
    """
    root = os.path.join(root,'sequences',im_path)

    # Temporal augmentation: swap which outer frame plays the "first" role.
    first_name, last_name = "im1.png", "im3.png"
    if data_aug and random.randint(0, 1):
        first_name, last_name = last_name, first_name
    path_pre1 = os.path.join(root, first_name)
    path_mid = os.path.join(root, "im2.png")
    path_pre2 = os.path.join(root, last_name)

    im_pre2 = imread(path_pre2)
    im_pre1 = imread(path_pre1)
    im_mid = imread(path_mid)

    # The same random crop window is applied to all three frames.
    crop_h, crop_w = input_frame_size[1], input_frame_size[2]
    h_offset = random.choice(range(256 - crop_h + 1))
    w_offset = random.choice(range(448 - crop_w + 1))
    rows = slice(h_offset, h_offset + crop_h)
    cols = slice(w_offset, w_offset + crop_w)
    im_pre2, im_pre1, im_mid = (
        frame[rows, cols, :] for frame in (im_pre2, im_pre1, im_mid)
    )

    if data_aug:
        # Horizontal, then vertical flip; each is decided by its own coin toss.
        for flip in (np.fliplr, np.flipud):
            if random.randint(0, 1):
                im_pre2, im_mid, im_pre1 = (
                    flip(frame) for frame in (im_pre2, im_mid, im_pre1)
                )

    def _to_chw_float(frame):
        # HWC -> CHW, float32, scaled by 1/255.
        return np.transpose(frame, (2, 0, 1)).astype("float32") / 255.0

    return _to_chw_float(im_pre1), _to_chw_float(im_pre2), _to_chw_float(im_mid)
class ListDataset(data.Dataset):
    """Dataset over a list of sample paths, each loaded through a loader callable.

    ``loader`` is called as ``loader(root, path)`` and must return exactly
    three items, which ``__getitem__`` hands back as a tuple in the same order.
    """

    def __init__(self, root, path_list, loader=Vimeo_90K_loader):
        self.loader = loader
        self.path_list = path_list
        self.root = root

    def __len__(self):
        return len(self.path_list)

    def __getitem__(self, index):
        sample_path = self.path_list[index]
        # Unpacking enforces the three-item contract of the loader.
        first, second, third = self.loader(self.root, sample_path)
        return first, second, third
================================================
FILE: demo_MiddleBury.py
================================================
import time
import os
from torch.autograd import Variable
import math
import torch
import random
import numpy as np
import numpy
import networks
from my_args import args
from scipy.misc import imread, imsave
from AverageMeter import *
# Enable the cuDNN auto-tuner to speed up convolutions.
torch.backends.cudnn.benchmark = True

DO_MiddleBurryOther = True
MB_Other_DATA = "./MiddleBurySet/other-data/"
MB_Other_RESULT = "./MiddleBurySet/other-result-author/"
MB_Other_GT = "./MiddleBurySet/other-gt-interp/"
# makedirs also creates missing parents and tolerates an existing directory.
os.makedirs(MB_Other_RESULT, exist_ok=True)


def _padding_for(size):
    """Return the (before, after) replication padding for one image dimension.

    A size that is not a multiple of 128 is padded up to the next multiple of
    128, split as evenly as possible; a size that already is a multiple of 128
    gets a fixed 32 pixels on each side.
    """
    if size != ((size >> 7) << 7):
        padded = ((size >> 7) + 1) << 7  # more than necessary
        before = (padded - size) // 2
        return before, padded - size - before
    return 32, 32


model = networks.__dict__[args.netName](channel=args.channels,
                                        filter_size=args.filter_size,
                                        timestep=args.time_step,
                                        training=False)

if args.use_cuda:
    model = model.cuda()

args.SAVED_MODEL = './model_weights/best.pth'
if os.path.exists(args.SAVED_MODEL):
    print("The testing model weight is: " + args.SAVED_MODEL)
    if not args.use_cuda:
        # Remap storages to the CPU when CUDA is not used.
        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
    else:
        pretrained_dict = torch.load(args.SAVED_MODEL)

    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)
    # 4. release the pretrained dict for saving memory
    pretrained_dict = []
else:
    print("*****************************************************************")
    print("**** We don't load any trained weights **************************")
    print("*****************************************************************")

model = model.eval()  # deploy mode
# Inference only: disable autograd once instead of on every loop iteration.
torch.set_grad_enabled(False)

use_cuda = args.use_cuda
save_which = args.save_which
dtype = args.dtype
unique_id = str(random.randint(0, 100000))
print("The unique id for current testing is: " + str(unique_id))

interp_error = AverageMeter()
interp_psnr = AverageMeter()
if DO_MiddleBurryOther:
    subdir = os.listdir(MB_Other_DATA)
    gen_dir = os.path.join(MB_Other_RESULT, unique_id)
    os.mkdir(gen_dir)

    tot_timer = AverageMeter()
    proc_timer = AverageMeter()
    end = time.time()
    for seq_name in subdir:
        print(seq_name)
        os.mkdir(os.path.join(gen_dir, seq_name))
        arguments_strFirst = os.path.join(MB_Other_DATA, seq_name, "frame10.png")
        arguments_strSecond = os.path.join(MB_Other_DATA, seq_name, "frame11.png")
        arguments_strOut = os.path.join(gen_dir, seq_name, "frame10i11.png")
        gt_path = os.path.join(MB_Other_GT, seq_name, "frame10i11.png")

        # HWC image -> CHW float tensor scaled by 1/255.
        X0 = torch.from_numpy(np.transpose(imread(arguments_strFirst), (2, 0, 1)).astype("float32") / 255.0).type(dtype)
        X1 = torch.from_numpy(np.transpose(imread(arguments_strSecond), (2, 0, 1)).astype("float32") / 255.0).type(dtype)

        assert (X0.size(1) == X1.size(1))
        assert (X0.size(2) == X1.size(2))

        intWidth = X0.size(2)
        intHeight = X0.size(1)
        channel = X0.size(0)
        if not channel == 3:
            continue

        intPaddingLeft, intPaddingRight = _padding_for(intWidth)
        intPaddingTop, intPaddingBottom = _padding_for(intHeight)
        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

        X0 = pader(Variable(torch.unsqueeze(X0, 0)))
        X1 = pader(Variable(torch.unsqueeze(X1, 0)))

        if use_cuda:
            X0 = X0.cuda()
            X1 = X1.cuda()
        proc_end = time.time()
        # Only the interpolated frame is needed; the offset and filter
        # outputs of the model are not used by this script.
        y_s, _, _ = model(torch.stack((X0, X1), dim=0))
        y_ = y_s[save_which]

        proc_timer.update(time.time() - proc_end)
        tot_timer.update(time.time() - end)
        end = time.time()
        print("*****************current image process time \t " + str(time.time()-proc_end )+"s ******************" )

        # .cpu() is a no-op for CPU tensors, so one code path serves both devices.
        y_ = y_.data.cpu().numpy()
        # Clip to [0, 1], strip the padding and convert back to HWC.
        y_ = np.transpose(
            255.0 * y_.clip(0, 1.0)[0, :,
                                    intPaddingTop:intPaddingTop + intHeight,
                                    intPaddingLeft:intPaddingLeft + intWidth],
            (1, 2, 0))

        imsave(arguments_strOut, np.round(y_).astype(numpy.uint8))

        # Metrics are computed on the image as written to disk.
        rec_rgb = imread(arguments_strOut)
        gt_rgb = imread(gt_path)

        # Promote to float before subtracting so the difference can be negative.
        diff_rgb = rec_rgb.astype("float64") - gt_rgb
        avg_interp_error_abs = np.mean(np.abs(diff_rgb))

        interp_error.update(avg_interp_error_abs, 1)

        mse = numpy.mean(diff_rgb ** 2)

        PIXEL_MAX = 255.0
        # A perfect reconstruction has zero MSE; report infinite PSNR instead
        # of dividing by zero.
        psnr = 20 * math.log10(PIXEL_MAX / math.sqrt(mse)) if mse > 0 else float("inf")
        interp_psnr.update(psnr, 1)

        print("interpolation error / PSNR : " + str(round(avg_interp_error_abs,4)) + " / " + str(round(psnr,4)))
    # Report both averages, as the label announces.
    metrics = ("The average interpolation error / PSNR for all images are : "
               + str(round(interp_error.avg, 4)) + " / " + str(round(interp_psnr.avg, 4)))
    print(metrics)
================================================
FILE: demo_MiddleBury_slowmotion.py
================================================
import time
import os
from torch.autograd import Variable
import torch
import random
import numpy as np
import numpy
import networks
from my_args import args
from scipy.misc import imread, imsave
from AverageMeter import *
import shutil
# Enable the cuDNN auto-tuner to speed up convolutions.
torch.backends.cudnn.benchmark = True

DO_MiddleBurryOther = True
MB_Other_DATA = "./MiddleBurySet/other-data/"
MB_Other_RESULT = "./MiddleBurySet/other-result-author/"
MB_Other_GT = "./MiddleBurySet/other-gt-interp/"
# makedirs also creates missing parents and tolerates an existing directory.
os.makedirs(MB_Other_RESULT, exist_ok=True)


def _padding_for(size):
    """Return the (before, after) replication padding for one image dimension.

    A size that is not a multiple of 128 is padded up to the next multiple of
    128, split as evenly as possible; a size that already is a multiple of 128
    gets a fixed 32 pixels on each side.
    """
    if size != ((size >> 7) << 7):
        padded = ((size >> 7) + 1) << 7  # more than necessary
        before = (padded - size) // 2
        return before, padded - size - before
    return 32, 32


model = networks.__dict__[args.netName](channel=args.channels,
                                        filter_size=args.filter_size,
                                        timestep=args.time_step,
                                        training=False)

if args.use_cuda:
    model = model.cuda()

args.SAVED_MODEL = './model_weights/best.pth'
if os.path.exists(args.SAVED_MODEL):
    print("The testing model weight is: " + args.SAVED_MODEL)
    if not args.use_cuda:
        # Remap storages to the CPU when CUDA is not used.
        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
    else:
        pretrained_dict = torch.load(args.SAVED_MODEL)

    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)
    # 4. release the pretrained dict for saving memory
    pretrained_dict = []
else:
    print("*****************************************************************")
    print("**** We don't load any trained weights **************************")
    print("*****************************************************************")

model = model.eval()  # deploy mode
# Inference only: disable autograd once instead of on every loop iteration.
torch.set_grad_enabled(False)

use_cuda = args.use_cuda
save_which = args.save_which
dtype = args.dtype
unique_id = str(random.randint(0, 100000))
print("The unique id for current testing is: " + str(unique_id))

if DO_MiddleBurryOther:
    subdir = os.listdir(MB_Other_DATA)
    gen_dir = os.path.join(MB_Other_RESULT, unique_id)
    os.mkdir(gen_dir)

    tot_timer = AverageMeter()
    proc_timer = AverageMeter()
    end = time.time()
    for seq_name in subdir:
        print(seq_name)
        os.mkdir(os.path.join(gen_dir, seq_name))
        arguments_strFirst = os.path.join(MB_Other_DATA, seq_name, "frame10.png")
        arguments_strSecond = os.path.join(MB_Other_DATA, seq_name, "frame11.png")

        # HWC image -> CHW float tensor scaled by 1/255.
        X0 = torch.from_numpy(np.transpose(imread(arguments_strFirst), (2, 0, 1)).astype("float32") / 255.0).type(dtype)
        X1 = torch.from_numpy(np.transpose(imread(arguments_strSecond), (2, 0, 1)).astype("float32") / 255.0).type(dtype)

        assert (X0.size(1) == X1.size(1))
        assert (X0.size(2) == X1.size(2))

        intWidth = X0.size(2)
        intHeight = X0.size(1)
        channel = X0.size(0)
        if not channel == 3:
            continue

        intPaddingLeft, intPaddingRight = _padding_for(intWidth)
        intPaddingTop, intPaddingBottom = _padding_for(intHeight)
        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

        X0 = pader(Variable(torch.unsqueeze(X0, 0)))
        X1 = pader(Variable(torch.unsqueeze(X1, 0)))

        if use_cuda:
            X0 = X0.cuda()
            X1 = X1.cuda()
        proc_end = time.time()
        # Only the interpolated frames are needed; the offset and filter
        # outputs of the model are not used by this script.
        y_s, _, _ = model(torch.stack((X0, X1), dim=0))
        y_ = y_s[save_which]

        proc_timer.update(time.time() - proc_end)
        tot_timer.update(time.time() - end)
        end = time.time()
        print("*****************current image process time \t " + str(time.time()-proc_end )+"s ******************" )

        # Normalise to a list so a single-tensor output is handled like a
        # one-element list of interpolated frames.
        if not isinstance(y_, (list, tuple)):
            y_ = [y_]

        # .cpu() is a no-op for CPU tensors, so one code path serves both devices.
        # Each frame is clipped to [0, 1], un-padded and converted back to HWC.
        frames = [
            np.transpose(
                255.0 * item.data.cpu().numpy().clip(0, 1.0)[
                    0, :,
                    intPaddingTop:intPaddingTop + intHeight,
                    intPaddingLeft:intPaddingLeft + intWidth],
                (1, 2, 0))
            for item in y_
        ]

        timestep = args.time_step
        numFrames = int(1.0 / timestep) - 1
        time_offsets = [kk * timestep for kk in range(1, 1 + numFrames, 1)]

        # Frames are written as a zero-padded running sequence:
        # first reference frame, interpolated frames, second reference frame.
        count = 0
        shutil.copy(arguments_strFirst, os.path.join(gen_dir, seq_name, "{:0>4d}.png".format(count)))
        count = count + 1
        for item, _ in zip(frames, time_offsets):
            arguments_strOut = os.path.join(gen_dir, seq_name, "{:0>4d}.png".format(count))
            count = count + 1
            imsave(arguments_strOut, np.round(item).astype(numpy.uint8))
        shutil.copy(arguments_strSecond, os.path.join(gen_dir, seq_name, "{:0>4d}.png".format(count)))
        count = count + 1
================================================
FILE: environment.yaml
================================================
name: pytorch1.0.0
channels:
- pytorch
- serge-sans-paille
- anaconda
- conda-forge
- defaults
dependencies:
- ca-certificates=2019.1.23=0
- certifi=2018.11.29=py36_0
- cloudpickle=0.7.0=py_0
- cytoolz=0.9.0.1=py36h14c3975_1
- dask-core=1.1.1=py_0
- decorator=4.3.2=py36_0
- imageio=2.4.1=py36_0
- networkx=2.2=py36_1
- openssl=1.1.1=h7b6447c_0
- pywavelets=1.0.1=py36hdd07704_0
- scikit-image=0.14.1=py36he6710b0_0
- scipy=1.1.0=py36h7c811a0_0
- toolz=0.9.0=py36_0
- cycler=0.10.0=py_1
- expat=2.2.5=hf484d3e_1002
- fontconfig=2.13.1=h2176d3f_1000
- gettext=0.19.8.1=h9745a5d_1001
- glib=2.56.2=had28632_1001
- icu=58.2=hf484d3e_1000
- kiwisolver=1.0.1=py36h6bb024c_1002
- libiconv=1.15=h14c3975_1004
- libprotobuf=3.6.1=hdbcaa40_1000
- libuuid=2.32.1=h14c3975_1000
- libxcb=1.13=h14c3975_1002
- libxml2=2.9.8=h143f9aa_1005
- matplotlib=3.0.2=py36_1002
- matplotlib-base=3.0.2=py36h167e16e_1002
- protobuf=3.6.1=py36hf484d3e_1001
- pthread-stubs=0.4=h14c3975_1001
- pyparsing=2.3.1=py_0
- pyqt=5.6.0=py36h13b7fb3_1008
- python-dateutil=2.8.0=py_0
- sip=4.18.1=py36hf484d3e_1000
- tensorboardx=1.6=py_0
- tk=8.6.9=h84994c4_1000
- tornado=5.1.1=py36h14c3975_1000
- xorg-libxau=1.0.9=h14c3975_0
- xorg-libxdmcp=1.1.2=h14c3975_1007
- blas=1.0=mkl
- cffi=1.11.5=py36he75722e_1
- cudatoolkit=9.0=h13b8566_0
- dbus=1.13.2=h714fa37_1
- freetype=2.9.1=h8a8886c_1
- gst-plugins-base=1.14.0=hbbd80ab_1
- gstreamer=1.14.0=hb453b48_1
- intel-openmp=2019.1=144
- isl=0.12.2=0
- jpeg=9b=h024ee3a_2
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=8.2.0=hdf63c60_1
- libgfortran-ng=7.3.0=hdf63c60_0
- libpng=1.6.36=hbc83047_0
- libstdcxx-ng=8.2.0=hdf63c60_1
- libtiff=4.0.10=h2733197_2
- mkl=2019.1=144
- mkl_fft=1.0.10=py36ha843d7b_0
- mkl_random=1.0.2=py36hd81dba3_0
- mpc=1.0.3=hf803216_4
- mpfr=3.1.5=h12ff648_1
- ncurses=6.1=he6710b0_1
- ninja=1.8.2=py36h6bb024c_1
- numpy=1.15.4=py36h7e9f1db_0
- numpy-base=1.15.4=py36hde5b4d6_0
- olefile=0.46=py36_0
- pcre=8.42=h439df22_0
- pillow=5.4.1=py36h34e0f95_0
- pip=19.0.1=py36_0
- pycparser=2.19=py36_0
- python=3.6.8=h0371630_0
- qt=5.6.3=h8bf5577_3
- readline=7.0=h7b6447c_5
- setuptools=40.8.0=py36_0
- six=1.12.0=py36_0
- sqlite=3.26.0=h7b6447c_0
- wheel=0.32.3=py36_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
- zstd=1.3.7=h0b5b093_0
- pytorch=1.0.1=py3.6_cuda9.0.176_cudnn7.4.2_2
- torchvision=0.2.1=py_2
- cloog=0.18.1=1
- gcc_49=4.9.1=6
- gmp=5.1.3=0
- pip:
- correlation-cuda==0.0.0
- dask==1.1.1
- depthflowprojection-cuda==0.0.0
- filterinterpolation-cuda==0.0.0
- flowprojection-cuda==0.0.0
- interpolation-cuda==0.0.0
- interpolationch-cuda==0.0.0
- mindepthflowprojection-cuda==0.0.0
- separableconv-cuda==0.0.0
- separableconvflow-cuda==0.0.0
- torch==1.0.1.post2
prefix: /home/wenbobao/anaconda3_new/envs/pytorch1.0.0
================================================
FILE: loss_function.py
================================================
import sys
import os
import sys
import threading
import torch
from torch.autograd import Variable
from lr_scheduler import *
from torch.autograd import gradcheck
import numpy
def charbonier_loss(x,epsilon):
    """Return the mean of sqrt(x^2 + epsilon^2) over all elements of ``x``."""
    penalty = (x * x + epsilon * epsilon).sqrt()
    return penalty.mean()
def negPSNR_loss(x,epsilon):
    """Return the batch mean of log(per-sample mean penalty) / 100.

    The element-wise penalty sqrt(x^2 + epsilon^2) is averaged over dim 1
    three times in a row (so ``x`` needs four dimensions), leaving one value
    per entry of dim 0; ``-log(1 / value) / 100`` is then averaged over them.
    """
    per_sample = torch.sqrt(x * x + epsilon * epsilon)
    # Each reduction removes the current dim 1, collapsing everything but dim 0.
    for _ in range(3):
        per_sample = torch.mean(per_sample, dim=1)
    scaled_log = -torch.log(1.0 / per_sample) / 100.0
    return torch.mean(scaled_log)
def tv_loss(x,epsilon):
    """Return the mean smoothed total-variation magnitude of a 4-D tensor.

    Forward differences along the last two dimensions are combined as
    sqrt(d_rows^2 + d_cols^2 + epsilon^2) and averaged over all positions.
    """
    anchor = x[:, :, :-1, :-1]
    d_rows = anchor - x[:, :, 1:, :-1]
    d_cols = anchor - x[:, :, :-1, 1:]
    magnitude = torch.sqrt(d_rows ** 2 + d_cols ** 2 + epsilon * epsilon)
    return torch.mean(magnitude)
def gra_adap_tv_loss(flow, image, epsilon):
    """Return the image-gradient-weighted total variation of ``flow``.

    The weight at each position is exp(-sum over dim 1 of the absolute forward
    differences of ``image``), so positions with strong image gradients
    contribute less. The smoothed TV magnitude of ``flow`` is summed over
    dim 1, multiplied by the weight and averaged.
    """
    image_anchor = image[:, :, :-1, :-1]
    image_variation = (torch.abs(image_anchor - image[:, :, 1:, :-1]) +
                       torch.abs(image_anchor - image[:, :, :-1, 1:]))
    w = torch.exp(-torch.sum(image_variation, dim=1))

    flow_anchor = flow[:, :, :-1, :-1]
    d_rows = flow_anchor - flow[:, :, 1:, :-1]
    d_cols = flow_anchor - flow[:, :, :-1, 1:]
    tv = torch.sum(torch.sqrt(d_rows ** 2 + d_cols ** 2 + epsilon * epsilon), dim=1)

    return torch.mean(w * tv)
def smooth_loss(x,epsilon):
    """Return the mean of sqrt(d_rows^2 + d_cols^2 + epsilon^2) for a 4-D tensor.

    ``d_rows`` and ``d_cols`` are forward differences along the last two
    dimensions, taken from the same top-left anchor position.
    """
    top_left = x[:,:,:-1,:-1]
    below = x[:,:,1:,:-1]
    right = x[:,:,:-1,1:]
    squared = (top_left - below) **2 + (top_left - right) **2 + epsilon**2
    return torch.mean(torch.sqrt(squared))
def motion_sym_loss(offset, epsilon, occlusion = None):
    """Return the mean of sqrt((offset[0] + offset[1])^2 + epsilon^2).

    Args:
        offset: indexable pair of tensors; the loss is zero-biased towards
            ``offset[0] == -offset[1]``.
        epsilon: smoothing constant added under the square root.
        occlusion: accepted for interface compatibility; currently ignored.

    Returns:
        A scalar tensor.
    """
    # TODO: how to design the occlusion aware offset symmetric loss?
    # The occlusion argument is deliberately not inspected: the previous
    # `occlusion == None` test is not a reliable None check for tensors, and
    # both of its branches computed exactly this value.
    # return torch.mean(torch.sqrt((offset[:,:2,...] + offset[:,2:,...])**2 + epsilon **2))
    return torch.mean(torch.sqrt((offset[0] + offset[1])**2 + epsilon **2))
def part_loss(diffs, offsets, occlusions, images, epsilon, use_negPSNR=False):
    """Compute the pixel, offset-smoothness and offset-symmetry loss terms.

    Args:
        diffs: iterable of difference tensors, one pixel loss is computed per entry.
        offsets: sequence of offset pairs; ``offsets[0][0] is None`` signals
            that no offsets are available.
        occlusions: accepted but not used by this function.
        images: the two images (``images[0]``, ``images[1]``) that weight the
            smoothness term of ``offset[0]`` and ``offset[1]`` respectively.
        epsilon: smoothing constant forwarded to every loss term.
        use_negPSNR: use ``negPSNR_loss`` instead of ``charbonier_loss`` for
            the pixel term.

    Returns:
        ``(pixel_loss, offset_loss, sym_loss)``, three lists of tensors.
    """
    pixel_fn = negPSNR_loss if use_negPSNR else charbonier_loss
    pixel_loss = [pixel_fn(diff, epsilon) for diff in diffs]

    if offsets[0][0] is not None:
        offset_loss = [gra_adap_tv_loss(offset[0], images[0], epsilon) +
                       gra_adap_tv_loss(offset[1], images[1], epsilon)
                       for offset in offsets]
        sym_loss = [motion_sym_loss(offset, epsilon=epsilon) for offset in offsets]
    else:
        # No offsets: both offset-based terms are a zero placeholder. It is
        # created next to the pixel differences (same device/dtype) instead of
        # unconditionally on CUDA, so CPU-only runs work as well. The symmetry
        # term cannot be evaluated on None offsets, so it is zero too.
        if len(diffs) > 0:
            zero = diffs[0].new_zeros(1)
        else:
            zero = torch.zeros(1)
        offset_loss = [zero]
        sym_loss = [zero]

    return pixel_loss, offset_loss, sym_loss
================================================
FILE: lr_scheduler.py
================================================
from bisect import bisect_right
from torch.optim.optimizer import Optimizer
class _LRScheduler(object):
    """Base class for epoch-based learning-rate schedulers.

    Subclasses implement ``get_lr`` to return one learning rate per parameter
    group, computed from ``self.base_lrs`` and ``self.last_epoch``.
    """

    def __init__(self, optimizer, last_epoch=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        groups = optimizer.param_groups
        if last_epoch == -1:
            # Fresh run: remember the starting lr of every group.
            for group in groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            # Resuming: the starting lr must already be recorded.
            for i, group in enumerate(groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an optimizer".format(i))
        self.base_lrs = [group['initial_lr'] for group in groups]
        # Apply the lr of the next epoch once, then restore the epoch counter.
        self.step(last_epoch + 1)
        self.last_epoch = last_epoch

    def get_lr(self):
        raise NotImplementedError

    def step(self, epoch=None):
        """Move to ``epoch`` (default: the next epoch) and update every group's lr."""
        self.last_epoch = self.last_epoch + 1 if epoch is None else epoch
        new_lrs = self.get_lr()
        for param_group, lr in zip(self.optimizer.param_groups, new_lrs):
            param_group['lr'] = lr
class LambdaLR(_LRScheduler):
    """Scales the initial learning rate of each parameter group by a
    user-supplied function of the epoch. When last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        lr_lambda (function or list): A function which computes a multiplicative
            factor given an integer parameter epoch, or a list of such
            functions, one for each group in optimizer.param_groups.
        last_epoch (int): The index of last epoch. Default: -1.

    Example:
        >>> # Assuming optimizer has two groups.
        >>> lambda1 = lambda epoch: epoch // 30
        >>> lambda2 = lambda epoch: 0.95 ** epoch
        >>> scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, lr_lambda, last_epoch=-1):
        self.optimizer = optimizer
        group_count = len(optimizer.param_groups)
        if isinstance(lr_lambda, (list, tuple)):
            # One function per parameter group is required.
            if len(lr_lambda) != group_count:
                raise ValueError("Expected {} lr_lambdas, but got {}".format(
                    group_count, len(lr_lambda)))
            self.lr_lambdas = list(lr_lambda)
        else:
            # A single function is shared by all groups.
            self.lr_lambdas = [lr_lambda] * group_count
        self.last_epoch = last_epoch
        super(LambdaLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        epoch = self.last_epoch
        return [base_lr * lmbda(epoch)
                for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)]
class StepLR(_LRScheduler):
    """Decays the initial learning rate of each parameter group by gamma
    every step_size epochs. When last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        step_size (int): Period of learning rate decay.
        gamma (float): Multiplicative factor of learning rate decay.
            Default: 0.1.
        last_epoch (int): The index of last epoch. Default: -1.

    Example:
        >>> # Assuming optimizer uses lr = 0.05 for all groups
        >>> # lr = 0.05     if epoch < 30
        >>> # lr = 0.005    if 30 <= epoch < 60
        >>> # lr = 0.0005   if 60 <= epoch < 90
        >>> # ...
        >>> scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):
        self.gamma = gamma
        self.step_size = step_size
        super(StepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        # Number of completed decay periods at the current epoch.
        completed_steps = self.last_epoch // self.step_size
        decay = self.gamma ** completed_steps
        return [base_lr * decay for base_lr in self.base_lrs]
class MultiStepLR(_LRScheduler):
    """Set the learning rate of each parameter group to the initial lr decayed
    by gamma once the number of epoch reaches one of the milestones. When
    last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        milestones (list): List of epoch indices. Must be increasing.
        gamma (float): Multiplicative factor of learning rate decay.
            Default: 0.1.
        last_epoch (int): The index of last epoch. Default: -1.

    Raises:
        ValueError: if ``milestones`` is not sorted in increasing order.

    Example:
        >>> # Assuming optimizer uses lr = 0.05 for all groups
        >>> # lr = 0.05     if epoch < 30
        >>> # lr = 0.005    if 30 <= epoch < 80
        >>> # lr = 0.0005   if epoch >= 80
        >>> scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):
        if list(milestones) != sorted(milestones):
            # Format the offending value into the message; it used to be
            # passed as a second exception argument, leaving '{}' unfilled.
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.milestones = milestones
        self.gamma = gamma
        super(MultiStepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        # bisect_right counts the milestones that have been reached so far.
        return [base_lr * self.gamma ** bisect_right(self.milestones, self.last_epoch)
                for base_lr in self.base_lrs]
class ExponentialLR(_LRScheduler):
    """Decay every parameter group's learning rate by ``gamma`` each epoch.

    The rate returned for a group is its initial lr multiplied by
    ``gamma ** last_epoch``. When last_epoch=-1, sets initial lr as lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        gamma (float): Multiplicative factor of learning rate decay.
        last_epoch (int): The index of last epoch. Default: -1.
    """

    def __init__(self, optimizer, gamma, last_epoch=-1):
        # Keep gamma available before the base initialiser runs, since
        # get_lr() depends on it.
        self.gamma = gamma
        super(ExponentialLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per entry of ``self.base_lrs``."""
        rates = []
        for initial_lr in self.base_lrs:
            rates.append(initial_lr * self.gamma ** self.last_epoch)
        return rates
class ReduceLROnPlateau(object):
    """Reduce learning rate when a metric has stopped improving.
    Models often benefit from reducing the learning rate by a factor
    of 2-10 once learning stagnates. This scheduler reads a metrics
    quantity and if no improvement is seen for a 'patience' number
    of epochs, the learning rate is reduced.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        mode (str): One of `min`, `max`. In `min` mode, lr will
            be reduced when the quantity monitored has stopped
            decreasing; in `max` mode it will be reduced when the
            quantity monitored has stopped increasing. Default: 'min'.
        factor (float): Factor by which the learning rate will be
            reduced. new_lr = lr * factor. Default: 0.1.
        patience (int): Number of epochs with no improvement after
            which learning rate will be reduced. Default: 10.
        verbose (bool): If True, prints a message to stdout for
            each update. Default: False.
        threshold (float): Threshold for measuring the new optimum,
            to only focus on significant changes. Default: 1e-4.
        threshold_mode (str): One of `rel`, `abs`. In `rel` mode,
            dynamic_threshold = best * ( 1 + threshold ) in 'max'
            mode or best * ( 1 - threshold ) in `min` mode.
            In `abs` mode, dynamic_threshold = best + threshold in
            `max` mode or best - threshold in `min` mode. Default: 'rel'.
        cooldown (int): Number of epochs to wait before resuming
            normal operation after lr has been reduced. Default: 0.
        min_lr (float or list): A scalar or a list of scalars. A
            lower bound on the learning rate of all param groups
            or each group respectively. Default: 0.
        eps (float): Minimal decay applied to lr. If the difference
            between new and old lr is smaller than eps, the update is
            ignored. Default: 1e-8.

    Raises:
        ValueError: If ``factor >= 1.0``, if ``min_lr`` is a sequence whose
            length differs from the number of param groups, or if ``mode`` /
            ``threshold_mode`` is not one of the accepted values.
        TypeError: If ``optimizer`` is not an ``Optimizer``.

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> scheduler = ReduceLROnPlateau(optimizer, 'min')
        >>> for epoch in range(10):
        >>>     train(...)
        >>>     val_loss = validate(...)
        >>>     # Note that step should be called after validate()
        >>>     scheduler.step(val_loss)
    """

    def __init__(self, optimizer, mode='min', factor=0.1, patience=10,
                 verbose=False, threshold=1e-4, threshold_mode='rel',
                 cooldown=0, min_lr=0, eps=1e-8):

        if factor >= 1.0:
            raise ValueError('Factor should be < 1.0.')
        self.factor = factor

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        # Normalise min_lr to one lower bound per parameter group.
        if isinstance(min_lr, (list, tuple)):
            if len(min_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} min_lrs, got {}".format(
                    len(optimizer.param_groups), len(min_lr)))
            self.min_lrs = list(min_lr)
        else:
            self.min_lrs = [min_lr] * len(optimizer.param_groups)

        self.patience = patience
        self.verbose = verbose
        self.cooldown = cooldown
        self.cooldown_counter = 0
        self.mode = mode
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.best = None
        self.num_bad_epochs = None
        self.mode_worse = None  # the worse value for the chosen mode
        self.is_better = None
        self.eps = eps
        self.last_epoch = -1
        self._init_is_better(mode=mode, threshold=threshold,
                             threshold_mode=threshold_mode)
        self._reset()

    def _reset(self):
        """Resets num_bad_epochs counter and cooldown counter."""
        self.best = self.mode_worse
        self.cooldown_counter = 0
        self.num_bad_epochs = 0

    def step(self, metrics, epoch=None):
        """Record one epoch's metric and reduce the lr if it has plateaued.

        Args:
            metrics: Value of the monitored quantity for this epoch.
            epoch (int, optional): Epoch index; defaults to ``last_epoch + 1``.
        """
        current = metrics
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch

        if self.is_better(current, self.best):
            self.best = current
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1

        if self.in_cooldown:
            self.cooldown_counter -= 1
            self.num_bad_epochs = 0  # ignore any bad epochs in cooldown

        if self.num_bad_epochs > self.patience:
            self._reduce_lr(epoch)
            self.cooldown_counter = self.cooldown
            self.num_bad_epochs = 0

    def _reduce_lr(self, epoch):
        """Scale every group's lr by ``factor``, clamped to its ``min_lrs`` entry."""
        for i, param_group in enumerate(self.optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * self.factor, self.min_lrs[i])
            # Skip updates smaller than eps.
            if old_lr - new_lr > self.eps:
                param_group['lr'] = new_lr
                if self.verbose:
                    print('Epoch {:5d}: reducing learning rate'
                          ' of group {} to {:.4e}.'.format(epoch, i, new_lr))

    @property
    def in_cooldown(self):
        """True while the post-reduction cooldown counter is still positive."""
        return self.cooldown_counter > 0

    def _init_is_better(self, mode, threshold, threshold_mode):
        """Install ``self.is_better`` and ``self.mode_worse`` for the given modes.

        Raises:
            ValueError: If ``mode`` is not 'min'/'max' or ``threshold_mode``
                is not 'rel'/'abs'.
        """
        if mode not in {'min', 'max'}:
            raise ValueError('mode {} is unknown!'.format(mode))
        if threshold_mode not in {'rel', 'abs'}:
            # Report the offending threshold_mode (the message used to echo
            # `mode`, which is valid at this point).
            raise ValueError(
                'threshold mode {} is unknown!'.format(threshold_mode))

        if mode == 'min' and threshold_mode == 'rel':
            rel_epsilon = 1. - threshold
            self.is_better = lambda a, best: a < best * rel_epsilon
            self.mode_worse = float('Inf')
        elif mode == 'min' and threshold_mode == 'abs':
            self.is_better = lambda a, best: a < best - threshold
            self.mode_worse = float('Inf')
        elif mode == 'max' and threshold_mode == 'rel':
            rel_epsilon = threshold + 1.
            self.is_better = lambda a, best: a > best * rel_epsilon
            self.mode_worse = -float('Inf')
        else:  # mode == 'max' and threshold_mode == 'abs'
            self.is_better = lambda a, best: a > best + threshold
            self.mode_worse = -float('Inf')
================================================
FILE: my_args.py
================================================
import os
import datetime
import argparse
import numpy
import networks
import torch
modelnames = networks.__all__
# import datasets
# One-element tuple (note the trailing comma). Without the comma this is a
# plain string, so argparse `choices` would do substring matching and the help
# text below would join individual characters.
datasetNames = ('Vimeo_90K_interp',)  # datasets.__all__


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` treats every non-empty string (including "False" and "0")
    as True. This helper maps the usual false spellings to False and keeps
    the previous truthiness rule for everything else.
    """
    if isinstance(value, bool):
        return value
    if value.strip().lower() in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    return bool(value)


parser = argparse.ArgumentParser(description='DAIN')

parser.add_argument('--debug', action='store_true', help='Enable debug mode')
parser.add_argument('--netName', type=str, default='DAIN',
                    choices=modelnames, help='model architecture: ' +
                    ' | '.join(modelnames) +
                    ' (default: DAIN)')

parser.add_argument('--datasetName', default='Vimeo_90K_interp',
                    choices=datasetNames, nargs='+',
                    help='dataset type : ' +
                    ' | '.join(datasetNames) +
                    ' (default: Vimeo_90K_interp)')
parser.add_argument('--datasetPath', default='', help='the path of selected datasets')
parser.add_argument('--dataset_split', type=int, default=97, help='Split a dataset into trainining and validation by percentage (default: 97)')

parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')

parser.add_argument('--numEpoch', '-e', type=int, default=100, help='Number of epochs to train (default: 100)')

parser.add_argument('--batch_size', '-b', type=int, default=1, help='batch size (default:1)')
parser.add_argument('--workers', '-w', type=int, default=8, help='parallel workers for loading training samples (default: 8)')
parser.add_argument('--channels', '-c', type=int, default=3, choices=[1, 3], help='channels of images (default:3)')
parser.add_argument('--filter_size', '-f', type=int, default=4, help='the size of filters used (default: 4)',
                    choices=[2, 4, 6, 5, 51]
                    )

parser.add_argument('--lr', type=float, default=0.002, help='the basic learning rate for three subnetworks (default: 0.002)')
parser.add_argument('--rectify_lr', type=float, default=0.001, help='the learning rate for rectify/refine subnetworks (default: 0.001)')

parser.add_argument('--save_which', '-s', type=int, default=1, choices=[0, 1], help='choose which result to save: 0 ==> interpolated, 1==> rectified')
parser.add_argument('--time_step', type=float, default=0.5, help='choose the time steps')
parser.add_argument('--flow_lr_coe', type=float, default=0.01, help='relative learning rate w.r.t basic learning rate (default: 0.01)')
parser.add_argument('--occ_lr_coe', type=float, default=1.0, help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--filter_lr_coe', type=float, default=1.0, help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--ctx_lr_coe', type=float, default=1.0, help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--depth_lr_coe', type=float, default=0.001, help='relative learning rate w.r.t basic learning rate (default: 0.001)')
# parser.add_argument('--deblur_lr_coe', type = float, default=0.01, help = 'relative learning rate w.r.t basic learning rate (default: 0.01)')

parser.add_argument('--alpha', type=float, nargs='+', default=[0.0, 1.0], help='the ration of loss for interpolated and rectified result (default: [0.0, 1.0])')

parser.add_argument('--epsilon', type=float, default=1e-6, help='the epsilon for charbonier loss,etc (default: 1e-6)')
parser.add_argument('--weight_decay', type=float, default=0, help='the weight decay for whole network ')
parser.add_argument('--patience', type=int, default=5, help='the patience of reduce on plateou')
parser.add_argument('--factor', type=float, default=0.2, help='the factor of reduce on plateou')
#
parser.add_argument('--pretrained', dest='SAVED_MODEL', default=None, help='path to the pretrained model weights')
parser.add_argument('--no-date', action='store_true', help='don\'t append date timestamp to folder')
# _str2bool instead of bool so that "--use_cuda False" really disables CUDA.
parser.add_argument('--use_cuda', default=True, type=_str2bool, help='use cuda or not')
parser.add_argument('--use_cudnn', default=1, type=int, help='use cudnn or not')
# NOTE(review): the choices are tensor classes, which a command-line string can
# never equal, so in practice only the default is usable for this option.
parser.add_argument('--dtype', default=torch.cuda.FloatTensor, choices=[torch.cuda.FloatTensor, torch.FloatTensor], help='tensor data type ')
# parser.add_argument('--resume', default='', type=str, help='path to latest checkpoint (default: none)')

parser.add_argument('--uid', type=str, default=None, help='unique id for the training')
parser.add_argument('--force', action='store_true', help='force to override the given uid')

# Colab version
parser.add_argument('--start_frame', type=int, default=1, help='first frame number to process')
parser.add_argument('--end_frame', type=int, default=100, help='last frame number to process')
parser.add_argument('--frame_input_dir', type=str, default='/content/DAIN/input_frames', help='frame input directory')
parser.add_argument('--frame_output_dir', type=str, default='/content/DAIN/output_frames', help='frame output directory')

# First parse: its result (--uid / --force) is used below to derive the output
# directory, after which path-dependent options are added and parsed again.
args = parser.parse_args()
import shutil
# Derive the output directory from --uid. Without a uid a random id plus a
# timestamp is used, so repeated runs do not collide.
if args.uid is None:
    unique_id = str(numpy.random.randint(0, 100000))
    print("revise the unique id to a random numer " + str(unique_id))
    args.uid = unique_id
    timestamp = datetime.datetime.now().strftime("%a-%b-%d-%H-%M")
    save_path = './model_weights/' + args.uid + '-' + timestamp
else:
    save_path = './model_weights/' + str(args.uid)

# A directory that already holds best.pth belongs to an earlier run: refuse to
# reuse it unless --force is given, in which case the old logs are backed up.
if not os.path.exists(save_path + "/best" + ".pth"):
    os.makedirs(save_path, exist_ok=True)
else:
    if not args.force:
        # A bare string cannot be raised (that is a TypeError); raise a real
        # exception carrying the intended message instead.
        raise FileExistsError("please use another uid ")
    else:
        print("override this uid" + args.uid)
        # Keep at most nine numbered backups; use the first free slot.
        for m in range(1, 10):
            if not os.path.exists(save_path + "/log.txt.bk" + str(m)):
                shutil.copy(save_path + "/log.txt", save_path + "/log.txt.bk" + str(m))
                shutil.copy(save_path + "/args.txt", save_path + "/args.txt.bk" + str(m))
                break

# Options whose defaults depend on save_path can only be registered now; the
# command line is then parsed a second time to obtain the final namespace.
parser.add_argument('--save_path', default=save_path, help='the output dir of weights')
parser.add_argument('--log', default=save_path + '/log.txt', help='the log file in training')
parser.add_argument('--arg', default=save_path + '/args.txt', help='the args used')

args = parser.parse_args()

# Create/truncate the log file; the context manager closes it.
with open(args.log, 'w'):
    pass
# Record the effective arguments both on stdout and in the args file.
with open(args.arg, 'w') as f:
    print(args)
    print(args, file=f)

if args.use_cudnn:
    print("cudnn is used")
    torch.backends.cudnn.benchmark = True  # to speed up the
else:
    print("cudnn is not used")
    torch.backends.cudnn.benchmark = False  # to speed up the
================================================
FILE: my_package/DepthFlowProjection/DepthFlowProjectionLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
#import _ext.my_lib as my_lib
import depthflowprojection_cuda as my_lib
class DepthFlowProjectionLayer(Function):
    """Autograd function wrapping the compiled depth-aware flow projection.

    ``forward`` hands ``input1`` and ``input2`` to the extension, which fills a
    ``count`` accumulator and the projected ``output``. ``backward`` passes the
    saved tensors and the incoming gradient to the extension's backward
    routine and returns gradients for ``input1`` and ``input2``.
    """

    def __init__(self, requires_grad):
        # Kept for backward compatibility; the static forward/backward below
        # receive ``requires_grad`` as an argument and do not use instance state.
        super(DepthFlowProjectionLayer, self).__init__()

    @staticmethod
    def forward(ctx, input1, input2, requires_grad):
        """Run the projection.

        Args:
            ctx: Autograd context used to stash tensors for ``backward``.
            input1: Contiguous tensor; the extension requires 2 channels.
            input2: Contiguous tensor; the extension requires 1 channel.
            requires_grad: When False, the extension is asked to fill holes.

        Returns:
            Tensor with the same size as ``input1``.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())

        fillhole = 1 if requires_grad == False else 0

        # Allocate the work buffers on input1's device with input1's dtype.
        # The kernels use one element type for every buffer, and a tensor
        # created with torch.cuda.FloatTensor() lands on the *current* device,
        # which need not be the device holding the inputs.
        count = input1.new_zeros((input1.size(0), 1, input1.size(2), input1.size(3)))
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.DepthFlowProjectionLayer_gpu_forward(input1, input2, count, output, fillhole)
        else:
            # NOTE(review): the extension's binding table in this package only
            # registers the GPU entry points, so this branch presumably raises
            # AttributeError unless a CPU implementation is provided elsewhere.
            err = my_lib.DepthFlowProjectionLayer_cpu_forward(input1, input2, count, output, fillhole)
        if err != 0:
            print(err)

        # count and output are needed again by the backward routine.
        ctx.save_for_backward(input1, input2, count, output)
        ctx.fillhole = fillhole

        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Return gradients for ``(input1, input2, requires_grad)``.

        The third entry is always None because ``requires_grad`` is a flag,
        not a tensor.
        """
        input1, input2, count, output = ctx.saved_tensors

        # The backward kernel addresses gradoutput with input1's strides, so it
        # must have the same (contiguous) layout as input1.
        gradoutput = gradoutput.contiguous()

        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())

        if input1.is_cuda:
            err = my_lib.DepthFlowProjectionLayer_gpu_backward(input1, input2,
                                                               count, output,
                                                               gradoutput, gradinput1, gradinput2)
        else:
            # NOTE(review): see the remark in forward() about CPU entry points.
            err = my_lib.DepthFlowProjectionLayer_cpu_backward(input1, input2,
                                                               count, output,
                                                               gradoutput, gradinput1, gradinput2)
        if err != 0:
            print(err)

        return gradinput1, gradinput2, None
================================================
FILE: my_package/DepthFlowProjection/DepthFlowProjectionModule.py
================================================
# my_package/DepthFlowProjection/DepthFlowProjectionModule.py
from torch.nn.modules.module import Module
from .DepthFlowProjectionLayer import DepthFlowProjectionLayer #, FlowFillholeLayer
__all__ =['DepthFlowProjectionModule']
class DepthFlowProjectionModule(Module):
    """``nn.Module`` front-end for ``DepthFlowProjectionLayer``.

    Args:
        requires_grad (bool): Flag forwarded unchanged to the layer on every
            call. Default: True.
    """

    def __init__(self, requires_grad=True):
        super().__init__()
        self.requires_grad = requires_grad

    def forward(self, input1, input2):
        """Apply the projection layer to ``input1`` and ``input2``."""
        flag = self.requires_grad
        projected = DepthFlowProjectionLayer.apply(input1, input2, flag)
        return projected
# class FlowFillholeModule(Module):
# def __init__(self,hole_value = -10000.0):
# super(FlowFillholeModule, self).__init__()
# self.f = FlowFillholeLayer()
#
# def forward(self, input1):
# return self.f(input1)
# We do not need to write backward code for the module itself, since the backward pass is already defined in DepthFlowProjectionLayer.
================================================
FILE: my_package/DepthFlowProjection/__init__.py
================================================
from .DepthFlowProjectionModule import *
================================================
FILE: my_package/DepthFlowProjection/depthflowprojection_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "depthflowprojection_cuda_kernel.cuh"
// Host entry point for the forward pass.
//
// Validates the tensor shapes/strides, then hands everything to
// DepthFlowProjection_gpu_forward_kernel on the current CUDA stream.
//
//   input1   : must have 2 channels (dimension 1)
//   input2   : must have 1 channel  (dimension 1)
//   count    : accumulator, must be [batch, 1, h, w] of input1
//   output   : result buffer; its batch/channel strides must equal input1's
//   fillhole : forwarded unchanged to the kernel launcher
//
// Returns 1 when a validation check fails, 0 on success. A non-zero status
// from the launcher is reported through AT_ERROR.
int DepthFlowProjectionLayer_gpu_forward(
        at::Tensor& input1,
        at::Tensor& input2,
        at::Tensor& count,
        at::Tensor& output,
        int fillhole
        )
{
    int error = 1;

    int channel = input1.size(1);
    if (channel != 2) return error;
    int batch = input1.size(0);
    // Same shape checks as the backward entry point: the kernels index
    // `count` with input1's batch/h/w extents, so a mismatched accumulator
    // would be accessed out of bounds.
    if (count.size(0) != batch) return error;
    if (count.size(1) != 1) return error;

    int h = input1.size(2);
    int w = input1.size(3);

    if (input2.size(1) != 1) return error;
    if (count.size(2) != h) return error;
    if (count.size(3) != w) return error;

    int input1_b_stride = input1.stride(0);
    int input1_c_stride = input1.stride(1);
    int input1_h_stride = input1.stride(2);
    int input1_w_stride = input1.stride(3);

    int input2_b_stride = input2.stride(0);
    int input2_c_stride = input2.stride(1);
    int input2_h_stride = input2.stride(2);
    int input2_w_stride = input2.stride(3);

    int count_b_stride = count.stride(0);
    int count_c_stride = count.stride(1);
    int count_h_stride = count.stride(2);
    int count_w_stride = count.stride(3);
    //TODO: do we need to assert the w_stride to be 1
    //if(w_stride !=1) return error;
    // output is addressed with input1's strides inside the kernel.
    if (input1_b_stride != output.stride(0)) return error;
    if (input1_c_stride != output.stride(1)) return error;

    int nElement = 0; // UNUSED: THCudaTensor_nElement(state, output);

    error = DepthFlowProjection_gpu_forward_kernel(
//          at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
            at::cuda::getCurrentCUDAStream(), //works for 1.0.0
            nElement, w, h, channel, batch, fillhole,

            input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
            input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
            count_b_stride, count_c_stride, count_h_stride, count_w_stride,

            input1,
            input2,
            count,
            output);
    if (error) {AT_ERROR("CUDA call failed");}

    return error;
}
// Host entry point for the backward pass.
//
// Checks that input1 has 2 channels, input2 has 1 channel, count is
// [batch, 1, h, w] of input1 and gradinput1 shares input1's batch/channel
// strides, then forwards all tensors and strides to
// DepthFlowProjection_gpu_backward_kernel on the current CUDA stream.
//
// Returns 1 when a check fails, 0 on success. A non-zero launcher status is
// reported through AT_ERROR.
int DepthFlowProjectionLayer_gpu_backward(
        at::Tensor& input1,
        at::Tensor& input2,
        at::Tensor& count,
        at::Tensor& output,
        at::Tensor& gradoutput,
        at::Tensor& gradinput1,
        at::Tensor& gradinput2
        )
{
    const int kFailure = 1;

    int channel = input1.size(1);
    if (channel != 2) {
        return kFailure;
    }

    int batch = input1.size(0);
    if (count.size(0) != batch || count.size(1) != 1) {
        return kFailure;
    }

    int h = input1.size(2);
    int w = input1.size(3);
    if (input2.size(1) != 1 || count.size(2) != h || count.size(3) != w) {
        return kFailure;
    }

    // Strides are read once here and passed to the launcher by value.
    int input1_b_stride = input1.stride(0);
    int input1_c_stride = input1.stride(1);
    int input1_h_stride = input1.stride(2);
    int input1_w_stride = input1.stride(3);

    int input2_b_stride = input2.stride(0);
    int input2_c_stride = input2.stride(1);
    int input2_h_stride = input2.stride(2);
    int input2_w_stride = input2.stride(3);

    int count_b_stride = count.stride(0);
    int count_c_stride = count.stride(1);
    int count_h_stride = count.stride(2);
    int count_w_stride = count.stride(3);

    //TODO: do we need to assert the w_stride to be 1
    // gradinput1 is addressed with input1's strides inside the kernel.
    if (input1_b_stride != gradinput1.stride(0) ||
        input1_c_stride != gradinput1.stride(1)) {
        return kFailure;
    }

    int nElement = 0; // UNUSED: THCudaTensor_nElement(state, gradoutput);

    int status = DepthFlowProjection_gpu_backward_kernel(
//          at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
            at::cuda::getCurrentCUDAStream(), //works for 1.0.0
            nElement,
            w, h, channel, batch,
            input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
            input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
            count_b_stride, count_c_stride, count_h_stride, count_w_stride,

            input1,
            input2,
            count,
            output,
            gradoutput,
            gradinput1,
            gradinput2
            );
    if (status) {
        AT_ERROR("CUDA call failed");
    }

    return status;
}
// Python bindings for this extension. The module name comes from
// TORCH_EXTENSION_NAME; only the two GPU host entry points defined above are
// registered (no CPU variants are bound here).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// Forward pass: (input1, input2, count, output, fillhole) -> int status.
m.def("DepthFlowProjectionLayer_gpu_forward", &DepthFlowProjectionLayer_gpu_forward, "DepthFlowProjection forward (CUDA)");
// Backward pass: (input1, input2, count, output, gradoutput, gradinput1, gradinput2) -> int status.
m.def("DepthFlowProjectionLayer_gpu_backward", &DepthFlowProjectionLayer_gpu_backward, "DepthFlowProjection backward (CUDA)");
}
================================================
FILE: my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cu
================================================
#include
#include "depthflowprojection_cuda_kernel.cuh"
#include
#include
#include
#include
#define min(a,b) ((ab)?(a):(b))
#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;
//forward path of our layer
// Forward splatting kernel.
//
// One thread handles one source pixel (w_i, h_i) of one batch item
// (blockIdx.z). It reads the 2-channel vector (fx, fy) from input1 and the
// weight `temp` from input2 at that pixel, computes the target position
// (w_i + fx, h_i + fy) and, if it lies inside the image, atomically adds
//   -temp * fx / -temp * fy   to the two output channels, and
//    temp                     to count,
// at the four integer neighbours of the target (left/right x top/bottom,
// clamped to the image border). All four neighbours receive the same
// contribution.
//
// Width is addressed as `+ w_i`, i.e. unit stride along the last dimension is
// assumed. nElement, channel, the *_w_stride values, input2_c_stride and
// count_c_stride are accepted but not used in this kernel.
template <typename scalar_t>
__global__ void DepthFlowProjection_gpu_forward_kernelfunc(
        const int nElement,
        const int w,
        const int h,
        const int channel,

        const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

        const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
        scalar_t* count,
        scalar_t* output
        )
{
    //blockIdx.z : batch index from 0~B-1
    //blockIdx.y : height patch index from ceil(h/16)
    //blockIdx.x : width patch index from ceil(w/32)

    //threadidx.x: width index 0~31
    //threadIdx.y: height index 0~15
    //threadIdx.z: Not used

    const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
    const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads past the image border (partial blocks) do nothing.
    const bool withinXbounds = w_i < w;
    const bool withinYbounds = h_i < h;

    const int batch_i = blockIdx.z;
    const int off = batch_i * input1_b_stride;

    if (withinXbounds && withinYbounds) {
        float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i];
        float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i];

        float x2 = (float) (w_i) + fx;
        float y2 = (float) (h_i) + fy;
        // Vectors pointing outside the image are dropped.
        if (x2 >= 0.0f && y2 >= 0.0f && x2 <= (float) (w - 1) && y2 <= (float) (h - 1)) {
            int ix2_L = (int) (x2);
            int iy2_T = (int) (y2);
            int ix2_R = min(ix2_L + 1, w - 1);
            int iy2_B = min(iy2_T + 1, h - 1);

            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];

            // Several source pixels may land on the same target, hence atomics.
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L], -temp * fx);
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R], -temp * fx);
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L], -temp * fx);
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R], -temp * fx);

            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L], -temp * fy);
            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R], -temp * fy);
            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L], -temp * fy);
            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R], -temp * fy);

            atomicAdd(&count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L], temp * 1);
            atomicAdd(&count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R], temp * 1);
            atomicAdd(&count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L], temp * 1);
            atomicAdd(&count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R], temp * 1);
        }
    }
    return;
}
// Normalisation kernel.
//
// One thread per pixel (w_i, h_i) of batch item blockIdx.z: where the
// accumulated weight in `count` is positive, both output channels at that
// pixel are divided by it. Pixels with count <= 0 are left untouched.
//
// Width is addressed as `+ w_i` (unit stride assumed). input1, input2 and the
// remaining stride/size parameters are accepted but not read here.
template <typename scalar_t>
__global__ void DepthFlowProjectionAveraging_kernelfunc(
        const int nElement,
        const int w,
        const int h,
        const int channel,

        const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

        const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
        scalar_t* count,
        scalar_t* output
        )
{
    //blockIdx.z : batch index from 0~B-1
    //blockIdx.y : height patch index from ceil(h/16)
    //blockIdx.x : width patch index from ceil(w/32)

    //threadidx.x: width index 0~31
    //threadIdx.y: height index 0~15
    //threadIdx.z: Not used

    const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
    const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads past the image border (partial blocks) do nothing.
    const bool withinXbounds = w_i < w;
    const bool withinYbounds = h_i < h;

    const int batch_i = blockIdx.z;
    const int off = batch_i * input1_b_stride;

    if (withinXbounds && withinYbounds) {
        float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i];
        if (temp > 0.0f) {
            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] /= temp;
            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] /= temp;
        }
    }
    return;
}
// Hole-filling kernel.
//
// One thread per pixel (w_i, h_i) of batch item blockIdx.z. A pixel whose
// `count` is <= 0 is a hole. For each hole the kernel walks left, right, up
// and down along the row/column until it meets a pixel with non-zero count
// or reaches the image border. Every direction whose found count is positive
// contributes its output value with weight 1, and the hole is set to the
// average of those contributions, per channel. If the four found counts sum
// to <= 0 the hole is left unchanged.
//
// Width is addressed as `+ w_i` (unit stride assumed). input1, input2 and the
// remaining stride/size parameters are accepted but not read here.
template <typename scalar_t>
__global__ void DepthFlowFillhole_kernelfunc(
        const int nElement,
        const int w,
        const int h,
        const int channel,

        const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

        const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
        scalar_t* count,
        scalar_t* output
        )
{
    //blockIdx.z : batch index from 0~B-1
    //blockIdx.y : height patch index from ceil(h/16)
    //blockIdx.x : width patch index from ceil(w/32)

    //threadidx.x: width index 0~31
    //threadIdx.y: height index 0~15
    //threadIdx.z: Not used

    const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
    const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads past the image border (partial blocks) do nothing.
    const bool withinXbounds = w_i < w;
    const bool withinYbounds = h_i < h;

    const int batch_i = blockIdx.z;
    const int off = batch_i * input1_b_stride;

    if (withinXbounds && withinYbounds) {
        float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i];
        if (temp <= 0.0f) {
            //search along the four directions,0/90/180/270, until finding at least one
            int left_offset = w_i;
            float left_temp = 0.0f;
            while (left_temp == 0.0f && left_offset - 1 >= 0) {
                left_offset = left_offset - 1;
                left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset];
            }

            int right_offset = w_i;
            float right_temp = 0.0f;
            while (right_temp == 0.0f && right_offset + 1 <= w - 1) {
                right_offset = right_offset + 1;
                right_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset];
            }

            int up_offset = h_i;
            float up_temp = 0.0f;
            while (up_temp == 0.0f && up_offset - 1 >= 0) {
                up_offset = up_offset - 1;
                up_temp = count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i];
            }

            int down_offset = h_i;
            float down_temp = 0.0f;
            while (down_temp == 0.0f && down_offset + 1 <= h - 1) {
                down_offset = down_offset + 1;
                down_temp = count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i];
            }

            if (left_temp + right_temp + up_temp + down_temp <= 0.0f) {
                //printf("Can't fill hole, find no neighbor vectors availabel\n");
                return;
            }

            // Turn the found counts into 0/1 weights: only directions that
            // reached a pixel with positive count take part in the average.
            left_temp = (left_temp > 0.0f) ? 1 : 0;
            right_temp = (right_temp > 0.0f) ? 1 : 0;
            up_temp = (up_temp > 0.0f) ? 1 : 0;
            down_temp = (down_temp > 0.0f) ? 1 : 0;

            // Number of contributing directions (>= 1 at this point).
            const float norm = left_temp + right_temp + up_temp + down_temp;

            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] = (
                left_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +
                right_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset] +
                up_temp * output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +
                down_temp * output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]
                ) / norm;

            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] = (
                left_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +
                right_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset] +
                up_temp * output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] +
                down_temp * output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]
                ) / norm;
        }
    }
    return;
}
template
__global__ void DepthFlowProjection_gpu_backward_kernelfunc(
const int nElement, const int w, const int h, const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
scalar_t* count,
scalar_t* output,
const scalar_t* __restrict__ gradoutput,
scalar_t* gradinput1,
scalar_t* gradinput2
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
const int off = batch_i * input1_b_stride;
// __syncthreads();
if(withinXbounds && withinYbounds){
float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;
float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;
float x2 = (float) ( w_i ) + fx;
float y2 = (float) ( h_i ) + fy;
if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){
int ix2_L = (int)(x2);
int iy2_T = (int)(y2);
int ix2_R = min(ix2_L + 1, w-1);
int iy2_B = min(iy2_T + 1, h-1);
float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];
int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;
gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] * temp /
count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L] ;
gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] * temp /
count[batch_i * count_b_stride +0 + iy2_T * count_h_stride + ix2_R] ;
gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ;
gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp /
count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ;
int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] * temp /
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] ;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] * temp /
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] ;
int weight_offset = batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i;
gradinput2[weight_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *
(fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );
gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *
(fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );
gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *
(fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );
gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *
(fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );
gradinput2[weight_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *
(fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );
gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *
(fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );
gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *
(fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );
gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *
(fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );
}
}
return ;
}
// Host-side launcher for the forward pass of the depth-aware flow projection.
//
// Launches, in this order:
//   1. DepthFlowProjection_gpu_forward_kernelfunc
//   2. DepthFlowProjectionAveraging_kernelfunc
//   3. DepthFlowFillhole_kernelfunc (only when `fillhole` is non-zero)
// cudaGetLastError() is queried after every launch.
//
// Parameters:
//   stream            CUDA stream handed in by the caller.
//   nElement          forwarded unchanged to every kernel.
//   w, h, channel     forwarded to the kernels; w and h also size the launch grid.
//   batch             number of grid slices along z.
//   fillhole          non-zero enables the third (hole filling) kernel.
//   *_stride          element strides of input1, input2 and count, forwarded to the kernels.
//   input1, input2, count, output   tensors whose data pointers are given to the kernels.
//
// Returns 0 when every launch succeeded, or -1 as soon as one launch reports a
// CUDA error (the CUDA error string is printed first).
//
// NOTE(review): in this copy the launch configurations read `<<>>` and the tensor
// accessors read `.data()` without a template argument; `grid`, `block` and `stream`
// are otherwise unused here, so the angle-bracket contents were presumably lost
// when the file was extracted -- confirm against the repository before building.
int DepthFlowProjection_gpu_forward_kernel(
cudaStream_t stream, const int nElement,
const int w, const int h, const int channel, const int batch, const int fillhole,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1, at::Tensor& input2,
at::Tensor& count,
at::Tensor& output
)
{
// Pessimistic default: only switched to 0 after every launch has been checked.
int error = -1;
dim3 grid;
dim3 block;
// blockthread = 128;
//threadIdx.x is scheduled first, then threadIdx.y, then threadIdx.z
//the three channels are processed in one kernel
// One thread block is BLOCKDIMX x BLOCKDIMY threads; the grid is rounded up so that
// it covers w columns and h rows, with one grid slice along z per batch item.
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// Report whenever the block size differs from 32x16, or whenever DEBUG is set.
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
// printf("I am here\n");
//extract the data of CudaTensor and use kernel to calculate.
// Step 1: projection kernel. The dispatch macro selects the floating-point type from input1.
AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_forward", ([&] {
DepthFlowProjection_gpu_forward_kernelfunc<<>>(
nElement, // forwarded unchanged to the kernel
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),input2.data(),count.data(),output.data()
);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
// NOTE(review): the message names BilinearSampler.updateOutput although this is the
// DepthFlowProjection launcher -- looks like a copy-paste leftover.
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
// printf("I am there\n");
// Step 2: averaging kernel, launched with the same argument list as step 1.
AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjectionAveraging", ([&] {
DepthFlowProjectionAveraging_kernelfunc<<>>(
nElement, // forwarded unchanged to the kernel
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),input2.data(),count.data(),output.data()
);
}));
// printf("I am kao\n");
// THCudaCheck(cudaGetLastError());
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
// printf("I am dd\n");
// Step 3 (optional): hole filling kernel, again with the same argument list.
if(fillhole){
// printf("use flow fill hole\n");
AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowFillhole", ([&] {
DepthFlowFillhole_kernelfunc<<>>(
nElement, // forwarded unchanged to the kernel
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),input2.data(),count.data(),output.data()
);
}));
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
return error;
}
}
// Every launch was accepted: report success.
error = 0;
return error;
}
// Host-side launcher for the backward pass of the depth-aware flow projection.
//
// Launches DepthFlowProjection_gpu_backward_kernelfunc once and then queries
// cudaGetLastError().
//
// Parameters:
//   stream            CUDA stream handed in by the caller.
//   nElement          forwarded unchanged to the kernel.
//   w, h, channel     forwarded to the kernel; w and h also size the launch grid.
//   batch             number of grid slices along z.
//   *_stride          element strides of input1, input2 and count, forwarded to the kernel.
//   input1, input2, count, output   tensors from the forward pass, read by the kernel.
//   gradoutput        incoming gradient.
//   gradinput1/2      gradient buffers handed to the kernel; the kernel body above
//                     accumulates into them with `+=`, so callers presumably have to
//                     pass them zero-initialised -- confirm at the call site.
//
// Returns 0 when the launch succeeded, or -1 when it reports a CUDA error (the
// CUDA error string is printed first).
//
// NOTE(review): in this copy the launch configuration reads `<<>>` and the tensor
// accessors read `.data()` without a template argument; `grid`, `block` and `stream`
// are otherwise unused here, so the angle-bracket contents were presumably lost
// when the file was extracted -- confirm against the repository before building.
int DepthFlowProjection_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1, at::Tensor& input2,
at::Tensor& count, at::Tensor& output,
at::Tensor& gradoutput,
at::Tensor& gradinput1,
at::Tensor& gradinput2
)
{
// Pessimistic default: only switched to 0 after the launch has been checked.
int error = -1;
dim3 grid;
dim3 block;
//blockthread = 128;
//threadIdx.x is scheduled first, then threadIdx.y, then threadIdx.z
//the three channels are processed in one kernel
// One thread block is BLOCKDIMX x BLOCKDIMY threads; the grid is rounded up so that
// it covers w columns and h rows, with one grid slice along z per batch item.
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// Report whenever the block size differs from 32x16, or whenever DEBUG is set.
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
// The dispatch macro selects the floating-point type from input1.
AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_backward", ([&] {
DepthFlowProjection_gpu_backward_kernelfunc <<>>(
nElement, // forwarded unchanged to the kernel
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),input2.data(),count.data(),output.data(),
gradoutput.data(), gradinput1.data(), gradinput2.data()
);
}));
// printf("gpu I am there\n");
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
// NOTE(review): the message names BilinearSampler.updateGradInput although this is the
// DepthFlowProjection launcher -- looks like a copy-paste leftover.
printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
// printf("gpu I am here\n");
error = 0;
return error;
}
================================================
FILE: my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Forward-pass launcher, defined in depthflowprojection_cuda_kernel.cu.
// Takes the CUDA stream, the image geometry (w, h, channel, batch), the `fillhole`
// switch (non-zero enables the extra hole filling kernel), the four strides of
// input1, input2 and count, and the tensors themselves.
// Returns 0 on success and -1 when a kernel launch reports a CUDA error.
int DepthFlowProjection_gpu_forward_kernel(
cudaStream_t stream, const int nElement,
const int w, const int h, const int channel, const int batch, const int fillhole,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1, at::Tensor& input2,
at::Tensor& count,
at::Tensor& output
);
// Backward-pass launcher, defined in depthflowprojection_cuda_kernel.cu.
// Takes the CUDA stream, the image geometry (w, h, channel, batch), the four strides
// of input1, input2 and count, the forward tensors (input1, input2, count, output),
// the incoming gradient `gradoutput`, and the gradient buffers gradinput1/gradinput2
// that are handed to the kernel.
// Returns 0 on success and -1 when the kernel launch reports a CUDA error.
int DepthFlowProjection_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& count,
at::Tensor& output,
at::Tensor& gradoutput,
at::Tensor& gradinput1,
at::Tensor& gradinput2
);
================================================
FILE: my_package/DepthFlowProjection/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# Translation units of the extension: the C++ binding and the CUDA kernels.
extension_sources = [
    'depthflowprojection_cuda.cc',
    'depthflowprojection_cuda_kernel.cu',
]

# Host (cxx) and device (nvcc) compiler flags come from the shared compiler_args module.
depthflowprojection_extension = CUDAExtension(
    'depthflowprojection_cuda',
    extension_sources,
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

# Build and install the `depthflowprojection_cuda` module with PyTorch's build_ext command.
setup(
    name='depthflowprojection_cuda',
    ext_modules=[depthflowprojection_extension],
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: my_package/FilterInterpolation/FilterInterpolationLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import filterinterpolation_cuda as my_lib
#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py
class FilterInterpolationLayer(Function):
    """Autograd wrapper around the ``filterinterpolation_cuda`` extension.

    Use it through ``FilterInterpolationLayer.apply(input1, input2, input3)``.
    All three inputs must be contiguous. The binding shown for the GPU path
    expects ``input2`` to have two channels and the batch, height and width
    of ``input1``; the filter size is derived from the channel count of
    ``input3``.
    """

    def __init__(self):
        super(FilterInterpolationLayer, self).__init__()

    @staticmethod
    def forward(ctx, input1, input2, input3):
        """Run the extension's forward kernel and return its output tensor.

        The output has the shape, dtype and device of ``input1``.

        Raises:
            AssertionError: if any input is not contiguous.
            RuntimeError: if the extension reports a non-zero status, which
                the binding does for shape or stride mismatches.
        """
        assert input1.is_contiguous()
        assert input2.is_contiguous()
        assert input3.is_contiguous()

        # Allocate next to input1 (same device, same dtype) and zero-filled on
        # both paths, so a multi-GPU run does not end up with a buffer on the
        # current default device and the CPU path never sees stale memory.
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.FilterInterpolationLayer_gpu_forward(
                input1, input2, input3, output)
        else:
            err = my_lib.FilterInterpolationLayer_cpu_forward(
                input1, input2, input3, output)
        # The binding returns a non-zero status without raising when its shape
        # checks fail; surfacing it avoids silently returning an all-zero result.
        if err:
            raise RuntimeError(
                "FilterInterpolationLayer forward failed with status {}".format(err))

        # Needed again by backward().
        ctx.save_for_backward(input1, input2, input3)
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Return the gradients with respect to ``input1``, ``input2``, ``input3``.

        Raises:
            RuntimeError: if the extension reports a non-zero status.
        """
        input1, input2, input3 = ctx.saved_tensors

        # Presumably the kernels index gradoutput with input1's strides (TODO
        # confirm); making it contiguous is a no-op for the usual case and
        # guards against expanded / strided incoming gradients.
        gradoutput = gradoutput.contiguous()

        # Gradient buffers live wherever the matching input lives. Previously
        # they were always CUDA tensors on the current device, which broke the
        # CPU branch and inputs that sit on a non-default GPU.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())
        gradinput3 = input3.new_zeros(input3.size())

        if input1.is_cuda:
            err = my_lib.FilterInterpolationLayer_gpu_backward(
                input1, input2, input3, gradoutput,
                gradinput1, gradinput2, gradinput3)
        else:
            err = my_lib.FilterInterpolationLayer_cpu_backward(
                input1, input2, input3, gradoutput,
                gradinput1, gradinput2, gradinput3)
        # A failed kernel leaves the buffers at zero; training on those would
        # be silent corruption, so fail loudly instead of only printing.
        if err:
            raise RuntimeError(
                "FilterInterpolationLayer backward failed with status {}".format(err))

        return gradinput1, gradinput2, gradinput3
# calculate the weights of flow
class WeightLayer(Function):
    """Legacy (instance-style) autograd function computing per-pixel flow weights.

    The work is done by ``my_lib.WeightLayer_{gpu,cpu}_{forward,backward}``;
    ``lambda_e``, ``lambda_v`` and ``Nw`` are passed through to those calls.
    State needed by ``backward`` is kept on the instance.
    """

    def __init__(self, lambda_e=10.0 / 255.0, lambda_v=1.0, Nw=3):
        super().__init__()
        self.lambda_e = lambda_e
        self.lambda_v = lambda_v
        self.Nw = Nw

    def forward(self, input1, input2, input3):
        """Return a single-channel weight map with input1's batch, height and width.

        ``input1`` is the ref1 image and ``input2`` the ref2 image; ``input3``
        is passed to the kernel alongside them. A non-zero status from the
        extension is printed, not raised.
        """
        # Contiguous copies are remembered for the backward pass.
        self.input1 = input1.contiguous()
        self.input2 = input2.contiguous()
        self.input3 = input3.contiguous()

        on_gpu = input1.is_cuda
        self.device = torch.cuda.current_device() if on_gpu else -1

        output = torch.zeros(input1.size(0), 1, input1.size(2), input1.size(3))
        if on_gpu:
            output = output.cuda()
            kernel = my_lib.WeightLayer_gpu_forward
        else:
            kernel = my_lib.WeightLayer_cpu_forward

        status = kernel(input1, input2, input3, output,
                        self.lambda_e, self.lambda_v, self.Nw)
        if status != 0:
            print(status)

        # Kept so that the backward kernel can reuse the forward result.
        self.output = output
        return output

    def backward(self, gradoutput):
        """Return gradients for ``input1``, ``input2`` and ``input3`` (in that order)."""
        saved = (self.input1, self.input2, self.input3)
        grads = [torch.zeros(tensor.size()) for tensor in saved]

        if self.input1.is_cuda:
            # Move the buffers to the device recorded during forward().
            grads = [grad.cuda(self.device) for grad in grads]
            kernel = my_lib.WeightLayer_gpu_backward
        else:
            kernel = my_lib.WeightLayer_cpu_backward

        gradinput1, gradinput2, gradinput3 = grads
        status = kernel(self.input1, self.input2, self.input3, self.output,
                        gradoutput,
                        gradinput1, gradinput2, gradinput3,
                        self.lambda_e, self.lambda_v, self.Nw)
        if status != 0:
            print(status)

        return gradinput1, gradinput2, gradinput3
class PixelValueLayer(Function):
    """Legacy (instance-style) autograd function around ``my_lib.PixelValueLayer_*``.

    ``sigma_d``, ``tao_r`` and ``Prowindow`` are passed through to the
    extension calls. State needed by ``backward`` is kept on the instance.
    """

    def __init__(self, sigma_d=3, tao_r=0.05, Prowindow=2):
        super().__init__()
        self.sigma_d = sigma_d
        self.tao_r = tao_r  # maybe not usable
        self.Prowindow = Prowindow

    def forward(self, input1, input3, flow_weights):
        """Return a tensor shaped like ``input1``, filled in by the extension.

        ``input1`` is the ref1 image, ``input3`` the ref1 flow and
        ``flow_weights`` the ref1 flow weights. A non-zero status from the
        extension is printed, not raised.
        """
        # Contiguous copies are remembered for the backward pass.
        self.input1 = input1.contiguous()
        self.input3 = input3.contiguous()
        self.flow_weights = flow_weights.contiguous()

        on_gpu = input1.is_cuda
        self.device = torch.cuda.current_device() if on_gpu else -1

        output = torch.zeros(input1.size())
        if on_gpu:
            output = output.cuda()
            kernel = my_lib.PixelValueLayer_gpu_forward
        else:
            kernel = my_lib.PixelValueLayer_cpu_forward

        status = kernel(input1, input3, flow_weights, output,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return output

    def backward(self, gradoutput):
        """Return gradients for ``input1``, ``input3`` and ``flow_weights`` (in that order)."""
        saved = (self.input1, self.input3, self.flow_weights)
        grads = [torch.zeros(tensor.size()) for tensor in saved]

        if self.input1.is_cuda:
            # Move the buffers to the device recorded during forward().
            grads = [grad.cuda(self.device) for grad in grads]
            kernel = my_lib.PixelValueLayer_gpu_backward
        else:
            kernel = my_lib.PixelValueLayer_cpu_backward

        gradinput1, gradinput3, gradflow_weights = grads
        status = kernel(self.input1, self.input3, self.flow_weights,
                        gradoutput,
                        gradinput1, gradinput3, gradflow_weights,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return gradinput1, gradinput3, gradflow_weights
class PixelWeightLayer(Function):
    """Legacy (instance-style) autograd function around ``my_lib.PixelWeightLayer_*``.

    ``sigma_d``, ``tao_r`` and ``Prowindow`` are passed to both the forward
    and backward extension calls; ``threshhold`` is only passed to the
    backward call. State needed by ``backward`` is kept on the instance.
    """

    def __init__(self, threshhold, sigma_d=3, tao_r=0.05, Prowindow=2):
        super().__init__()
        self.threshhold = threshhold
        self.sigma_d = sigma_d
        self.tao_r = tao_r  # maybe not usable
        self.Prowindow = Prowindow

    def forward(self, input3, flow_weights):
        """Return a single-channel map with input3's batch, height and width.

        ``input3`` is the ref1 flow and ``flow_weights`` the ref1 flow
        weights. A non-zero status from the extension is printed, not raised.
        """
        # Contiguous copies are remembered for the backward pass.
        self.input3 = input3.contiguous()
        self.flow_weights = flow_weights.contiguous()

        on_gpu = input3.is_cuda
        self.device = torch.cuda.current_device() if on_gpu else -1

        output = torch.zeros([input3.size(0), 1, input3.size(2), input3.size(3)])
        if on_gpu:
            output = output.cuda()
            kernel = my_lib.PixelWeightLayer_gpu_forward
        else:
            kernel = my_lib.PixelWeightLayer_cpu_forward

        status = kernel(input3, flow_weights, output,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        # Kept so that the backward kernel can reuse the forward result.
        self.output = output
        return output

    def backward(self, gradoutput):
        """Return gradients for ``input3`` and ``flow_weights`` (in that order)."""
        saved = (self.input3, self.flow_weights)
        grads = [torch.zeros(tensor.size()) for tensor in saved]

        if self.input3.is_cuda:
            # Move the buffers to the device recorded during forward().
            grads = [grad.cuda(self.device) for grad in grads]
            kernel = my_lib.PixelWeightLayer_gpu_backward
        else:
            kernel = my_lib.PixelWeightLayer_cpu_backward

        gradinput3, gradflow_weights = grads
        status = kernel(self.input3, self.flow_weights, self.output,
                        gradoutput,
                        gradinput3, gradflow_weights,
                        self.threshhold,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return gradinput3, gradflow_weights
#class ReliableValueLayer(Function):
# def __init__(self, Nw =3, tao_r =0.05, Prowindow = 2 ):
# super(ReliableValueLayer,self).__init__()
#
# self.Nw = Nw
# self.tao_r = tao_r #maybe not useable
# self.Prowindow = Prowindow
#
# def forward(self, input3, flow_weight1):
#
# # assert(input1.is_contiguous())
# # assert(input2.is_contiguous())
# #self.input1 = input1.contiguous() # ref1 image
# #self.input2 = input2.contiguous() # ref2 image
# self.input3 = input3.contiguous() # ref1 flow
# self.flow_weight1 = flow_weight1.contiguous() # ref1 flow weights
#
# if input3.is_cuda:
# self.device = torch.cuda.current_device()
# else:
# self.device = -1
#
# output = torch.zeros([intpu3.size(0), 1, input3.size(2), input3.size(3)])
# #output2 = torch.zeros(input1.size())
# #weight1 = torch.zeros(input1.size())
# #weight2 = torch.zeros(input1.size())
#
#
# if input1.is_cuda :
# output = output.cuda()
# my_lib.ReliableValueLayer_gpu_forward(
# input3, flow_weight1, output,
# self.sigma_d, self.tao_r , self.Prowindow )
# else:
# # output = torch.cuda.FloatTensor(input1.data.size())
# my_lib.ReliableValueLayer_cpu_forward(
# input3, flow_weight1, output,
# self.sigma_d, self.tao_r , self.Prowindow )
#
# # the function returns the output to its caller
# return output
#
# #TODO: if there are multiple outputs of this function, then the order should be well considered?
# def backward(self, gradoutput):
# # print("Backward of Filter Interpolation Layer")
# # gradinput1 = input1.new().zero_()
# # gradinput2 = input2.new().zero_()
# #gradinput1 = torch.zeros(self.input1.size())
# #gradinput2 = torch.zeros(self.input2.size())
# gradinput3 = torch.zeros(self.input3.size())
# gradflow_weight1 = torch.zeros(self.flow_weight1.size())
#
# if self.input1.is_cuda:
# # print("CUDA backward")
# #gradinput1 = gradinput1.cuda(self.device)
# #gradinput2 = gradinput2.cuda(self.device)
# gradinput3 = gradinput3.cuda(self.device)
# gradflow_weight1 = gradflow_weight1.cuda(self.device)
#
# err = my_lib.ReliableValueLayer_gpu_backward(
# self.input3, self.flow_weight1, gradoutput,
# gradinput3, gradflow_weight1,
# self.sigma_d, self.tao_r , self.Prowindow )
# if err != 0 :
# print(err)
#
# else:
# # print("CPU backward")
# # print(gradoutput)
# err = my_lib.ReliableValueLayer_cpu_backward(
# self.input3,self.flow_weight1, gradoutput,
# gradinput3, gradflow_weight1,
# self.sigma_d, self.tao_r , self.Prowindow )
# # print(err)
# if err != 0 :
# print(err)
# # print(gradinput1)
# # print(gradinput2)
#
# # print(gradinput1)
#
# return gradinput3,gradflow_weight1
class ReliableWeightLayer(Function):
    """Legacy (instance-style) autograd function around ``my_lib.ReliableWeightLayer_*``.

    ``sigma_d``, ``tao_r`` and ``Prowindow`` are passed to both the forward
    and backward extension calls; ``threshhold`` is only passed to the
    backward call. State needed by ``backward`` is kept on the instance.
    """

    def __init__(self, threshhold, sigma_d=3, tao_r=0.05, Prowindow=2):
        super().__init__()
        self.threshhold = threshhold
        self.sigma_d = sigma_d
        self.tao_r = tao_r  # maybe not usable
        self.Prowindow = Prowindow

    def forward(self, input3):
        """Return a single-channel map with input3's batch, height and width.

        ``input3`` is the ref1 flow. A non-zero status from the extension is
        printed, not raised.
        """
        # Contiguous copy is remembered for the backward pass.
        self.input3 = input3.contiguous()

        on_gpu = input3.is_cuda
        self.device = torch.cuda.current_device() if on_gpu else -1

        output = torch.zeros([input3.size(0), 1, input3.size(2), input3.size(3)])
        if on_gpu:
            output = output.cuda()
            kernel = my_lib.ReliableWeightLayer_gpu_forward
        else:
            kernel = my_lib.ReliableWeightLayer_cpu_forward

        status = kernel(input3, output,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        # Used for inhibiting some unreliable gradients in backward().
        self.output = output
        return output

    def backward(self, gradoutput):
        """Return the gradient for ``input3``."""
        gradinput3 = torch.zeros(self.input3.size())

        if self.input3.is_cuda:
            # Move the buffer to the device recorded during forward().
            gradinput3 = gradinput3.cuda(self.device)
            kernel = my_lib.ReliableWeightLayer_gpu_backward
        else:
            kernel = my_lib.ReliableWeightLayer_cpu_backward

        status = kernel(self.input3, self.output,
                        gradoutput,
                        gradinput3,
                        self.threshhold,
                        self.sigma_d, self.tao_r, self.Prowindow)
        if status != 0:
            print(status)

        return gradinput3
================================================
FILE: my_package/FilterInterpolation/FilterInterpolationModule.py
================================================
# modules/AdaptiveInterpolationLayer.py
from torch.nn import Module
import torch
from torch.autograd import Variable
from torch.autograd import gradcheck
from .FilterInterpolationLayer import FilterInterpolationLayer,WeightLayer, PixelValueLayer,PixelWeightLayer,ReliableWeightLayer
class FilterInterpolationModule(Module):
    """Parameter-free module that applies ``FilterInterpolationLayer``."""

    def __init__(self):
        super().__init__()

    def forward(self, input1, input2, input3):
        """Return ``FilterInterpolationLayer.apply(input1, input2, input3)``."""
        interpolated = FilterInterpolationLayer.apply(input1, input2, input3)
        return interpolated
#we actually dont need to write the backward code for a module, since we have
#class WeightModule(Module):
# def __init__(self):
# super(WeightModule, self).__init__()
# self.f = WeightLayer()
#
# def forward(self, input1, input2, input3):
# return self.f(input1, input2, input3)
class AdaptiveWeightInterpolationModule(Module):
    """Blend two reference images, each processed with its own flow.

    Inputs of ``forward``:
        input1 -- ref1 image
        input2 -- ref2 image
        input3 -- ref1 flow
        input4 -- ref2 flow

    Each reference goes through its own WeightLayer, PixelValueLayer,
    PixelWeightLayer and ReliableWeightLayer instance. ``threshhold`` is
    added to every denominator; the pixel-weight and reliable-weight layers
    receive ``101 * threshhold``.
    """

    def __init__(self, training=False, threshhold=1e-6,
                 lambda_e=30.0 / 255.0, lambda_v=1.0, Nw=3.0,
                 sigma_d=1.5, tao_r=0.05, Prowindow=2):
        super().__init__()

        # Layers for the first reference (ref1 image + ref1 flow).
        self.calc_weight1 = WeightLayer(lambda_e, lambda_v, Nw)
        self.padder1 = torch.nn.ReplicationPad2d([0, 1, 0, 1])
        self.interpolate1 = PixelValueLayer(sigma_d, tao_r, Prowindow)
        self.interpolate1_1 = PixelWeightLayer(101 * threshhold, sigma_d, tao_r, Prowindow)
        self.interpolate_R1_1 = ReliableWeightLayer(101 * threshhold, sigma_d, tao_r, Prowindow)

        # Layers for the second reference (ref2 image + ref2 flow).
        self.calc_weight2 = WeightLayer(lambda_e, lambda_v, Nw)
        self.padder2 = torch.nn.ReplicationPad2d([0, 1, 0, 1])
        self.interpolate2 = PixelValueLayer(sigma_d, tao_r, Prowindow)
        self.interpolate2_1 = PixelWeightLayer(101 * threshhold, sigma_d, tao_r, Prowindow)
        self.interpolate_R2_1 = ReliableWeightLayer(101 * threshhold, sigma_d, tao_r, Prowindow)

        # Overrides the flag nn.Module sets in its own constructor.
        self.training = training
        self.threshold = threshhold
        return

    def _project_reference(self, image, other_image, flow,
                           calc_weight, pixel_value, pixel_weight, reliable_weight):
        """Run one reference through its layers.

        Returns ``((r, g, b), w)``: the three normalised colour planes and the
        blending weight of this reference.
        """
        flow_weight = calc_weight(image, other_image, flow)

        p_r, p_g, p_b = torch.split(pixel_value(image, flow, flow_weight), 1, dim=1)
        pw = pixel_weight(flow, flow_weight)
        planes = (p_r / (pw + self.threshold),
                  p_g / (pw + self.threshold),
                  p_b / (pw + self.threshold))

        rw = reliable_weight(flow)
        blend_weight = pw / (rw + self.threshold)
        return planes, blend_weight

    def forward(self, input1, input2, input3, input4):
        """Return the blended image as the concatenation of its three planes along dim 1."""
        (i1_r, i1_g, i1_b), w1 = self._project_reference(
            input1, input2, input3,
            self.calc_weight1, self.interpolate1,
            self.interpolate1_1, self.interpolate_R1_1)
        (i2_r, i2_g, i2_b), w2 = self._project_reference(
            input2, input1, input4,
            self.calc_weight2, self.interpolate2,
            self.interpolate2_1, self.interpolate_R2_1)

        w = w1 + w2
        i_r = (i1_r * w1 + i2_r * w2) / (w + self.threshold)
        i_g = (i1_g * w1 + i2_g * w2) / (w + self.threshold)
        i_b = (i1_b * w1 + i2_b * w2) / (w + self.threshold)

        if not self.training:
            # Outside training, pixels whose total weight is at most
            # 10 * threshold are zeroed in place.
            i_r[w <= 10 * self.threshold] = 0
            i_g[w <= 10 * self.threshold] = 0
            i_b[w <= 10 * self.threshold] = 0
            w[w <= 10 * self.threshold] = 0

        return torch.cat((i_r, i_g, i_b), dim=1)
================================================
FILE: my_package/FilterInterpolation/__init__.py
================================================
from .FilterInterpolationModule import *
================================================
FILE: my_package/FilterInterpolation/filterinterpolation_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "filterinterpolation_cuda_kernel.cuh"
// Python-facing entry point for the GPU forward pass of the filter interpolation layer.
//
// Validates the arguments, gathers sizes and strides and hands everything to
// FilterInterpolationLayer_gpu_forward_kernel on the current CUDA stream.
//
// input1 supplies batch, channel, height and width (dimensions 0..3).
// input2 must have the same batch, height and width, and exactly 2 channels.
// input3 supplies the filters: its channel count is filter_size * filter_size.
// output must share input1's batch and channel strides.
// All three inputs must have a width stride of 1.
//
// Returns 1 (without raising) when a validation check fails. Otherwise it
// returns the kernel launcher's status, after raising via AT_ERROR when that
// status is non-zero.
int FilterInterpolationLayer_gpu_forward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& input3,
		at::Tensor& output
		)
{
	const int kInvalidArguments = 1;

	const int channel = input1.size(1);
	const int batch = input1.size(0);
	if (input2.size(0) != batch || input2.size(1) != 2) {
		return kInvalidArguments;
	}

	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(2) != h || input2.size(3) != w) {
		return kInvalidArguments;
	}

	// The filter taps are stored along the channel axis of input3.
	const int filter_size2 = input3.size(1);
	const int filter_size = static_cast<int>(sqrt(static_cast<float>(filter_size2)));

	const int input1_b_stride = input1.stride(0);
	const int input1_c_stride = input1.stride(1);
	const int input1_h_stride = input1.stride(2);
	const int input1_w_stride = input1.stride(3);

	const int input2_b_stride = input2.stride(0);
	const int input2_c_stride = input2.stride(1);
	const int input2_h_stride = input2.stride(2);
	const int input2_w_stride = input2.stride(3);

	const int input3_b_stride = input3.stride(0);
	const int input3_c_stride = input3.stride(1);
	const int input3_h_stride = input3.stride(2);
	const int input3_w_stride = input3.stride(3);

	// Every input has to be dense along the width axis.
	if (input1_w_stride != 1 || input2_w_stride != 1 || input3_w_stride != 1) {
		return kInvalidArguments;
	}
	// The output is addressed with input1's batch and channel strides.
	if (input1_b_stride != output.stride(0) || input1_c_stride != output.stride(1)) {
		return kInvalidArguments;
	}

	const int nElement = 0; // unused placeholder, still part of the kernel launcher's signature

	const int status = FilterInterpolationLayer_gpu_forward_kernel(
			at::cuda::getCurrentCUDAStream(), // works for 1.0.0
			nElement, w, h, channel, batch, filter_size,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
			input3_b_stride, input3_c_stride, input3_h_stride, input3_w_stride,
			input1,
			input2,
			input3,
			output);

	if (status) { AT_ERROR("CUDA call failed"); }

	return status;
}
// Host-side entry point for the backward pass of the filter-interpolation layer.
//
// All tensors are indexed as 4-D (batch, channel, height, width).
//   input1, input2, input3 : the forward inputs (data, 2-channel offsets,
//                            per-pixel filter taps).
//   gradoutput             : gradient w.r.t. the forward output; the kernel
//                            reads it with input1's strides.
//   gradinput1/2/3         : gradient buffers for input1/2/3; the kernel
//                            addresses each with the strides of its input.
//                            gradinput1 and gradinput3 are accumulated with
//                            atomicAdd, so presumably the caller zeroes them
//                            first (not visible here - confirm at the caller).
//
// Returns 0 on success and 1 when any shape/stride validation fails (nothing
// is launched in that case). A failure reported by the kernel launcher is
// raised through AT_ERROR.
int FilterInterpolationLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2,
		at::Tensor&  gradinput3
		)
{
	int error = 1 ;

	int channel = input1.size(1);
	//if(channel!=3) return error;
	int batch = input1.size(0);
	if(input2.size(0) != batch) return error;
	if(input2.size(1) != 2) return error;

	int h = input1.size(2);
	int w = input1.size(3);
	if(input2.size(2) != h) return error;
	if(input2.size(3) != w) return error;

	// The kernel reads input3 (and writes gradinput3) at (batch_i, tap, h_i, w_i)
	// for every pixel; a smaller input3 would be accessed out of bounds.
	if(input3.size(0) != batch) return error;
	if(input3.size(2) != h) return error;
	if(input3.size(3) != w) return error;

	// gradoutput is read once per (batch, channel, h, w) position of input1.
	if(gradoutput.size(0) != batch) return error;
	if(gradoutput.size(1) != channel) return error;
	if(gradoutput.size(2) != h) return error;
	if(gradoutput.size(3) != w) return error;

	int filter_size2 = input3.size(1);
	int filter_size = (int) sqrt((float) filter_size2);
//	printf("filter size is: %d,or %f", filter_size, sqrt((float)filter_size2));

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);
//	printf("filter tensor shape: %d,%d,%d,%d\n", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);

	// The kernel adds the column index directly (no multiplication by a
	// width stride), so every tensor must have unit stride along width.
	if(input1_w_stride != 1) return error;
	if(input2_w_stride != 1) return error;
	if(input3_w_stride != 1) return error;

	// Each gradient buffer is addressed with the strides of its input.
	if(input1_b_stride != gradinput1.stride(0)) return error;
	if(input2_b_stride != gradinput2.stride(0)) return error;
	if(input1_c_stride != gradinput1.stride(1)) return error;
	if(input2_c_stride != gradinput2.stride(1)) return error;
	if(input3_c_stride != gradinput3.stride(1)) return error;

	// Checks below only fire when the dimension has more than one element:
	// the stride of a size-1 dimension is never multiplied by a non-zero
	// index, so a mismatch there is harmless.
	if(batch > 1 && input3_b_stride != gradinput3.stride(0)) return error;
	if(h > 1) {
		if(input1_h_stride != gradinput1.stride(2)) return error;
		if(input2_h_stride != gradinput2.stride(2)) return error;
		if(input3_h_stride != gradinput3.stride(2)) return error;
	}

	// gradoutput is read with input1's strides; a differently laid out
	// (e.g. non-contiguous) gradient would be read from the wrong addresses.
	if(batch   > 1 && gradoutput.stride(0) != input1_b_stride) return error;
	if(channel > 1 && gradoutput.stride(1) != input1_c_stride) return error;
	if(h       > 1 && gradoutput.stride(2) != input1_h_stride) return error;
	if(w       > 1 && gradoutput.stride(3) != 1) return error;

//	printf("GPU backward: %d,%d,%d,%d\n", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);

	int nElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);

	error = FilterInterpolationLayer_gpu_backward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
			at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement, //to let the nummous
			w,h,channel,batch, filter_size,

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,

			input1,
			input2,
			input3,
			gradoutput,
			gradinput1,
			gradinput2,
			gradinput3
			);
	if (error) {AT_ERROR("CUDA call failed");}

	return error;
}
// Python bindings: expose the two host entry points of this translation unit
// under their C++ names.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // Forward pass.
    m.def(
        "FilterInterpolationLayer_gpu_forward",
        &FilterInterpolationLayer_gpu_forward,
        "FilterInterpolation forward (CUDA)");

    // Backward pass.
    m.def(
        "FilterInterpolationLayer_gpu_backward",
        &FilterInterpolationLayer_gpu_backward,
        "FilterInterpolation backward (CUDA)");
}
================================================
FILE: my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cu
================================================
#include
#include "filterinterpolation_cuda_kernel.cuh"
#include
#include
#include
#include
#define min(a,b) ((ab)?(a):(b))
#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;
//forward path of our layer
// Forward kernel of the filter-interpolation layer.
//
// One thread handles one output pixel (w_i, h_i) of one batch item
// (blockIdx.z) and loops over all `channel` channels itself.
//
// For its pixel the thread:
//   1. reads the offset (fx, fy) from channels 0 and 1 of input2 and forms the
//      target position (x2, y2) = (w_i + fx, h_i + fy);
//   2. if the target lies inside the image and |fx| < w/2, |fy| < h/2, sums a
//      filter_size x filter_size window of input1 around (x2, y2), each sample
//      weighted by the matching tap of input3 taken at (h_i, w_i). The window
//      is split into four quadrants (TL, TR, BL, BR) around (int(x2), int(y2))
//      which are blended with the fractional parts alpha/beta;
//   3. otherwise copies the input1 value at (h_i, w_i) to the output unchanged.
//
// Parameters:
//   nElement          - not referenced in this kernel.
//   w, h, channel     - extents used for bounds checks and loops.
//   filter_size       - side length of the square filter window.
//   input*_?_stride   - element strides; the *_w_stride values are not used
//                       (the column index is added directly, i.e. a width
//                       stride of 1 is assumed).
//   input1/2/3        - read-only inputs (data, offsets, filter taps).
//   output            - written with input1's b/c/h strides.
//
// All intermediate arithmetic is done in `float`, whatever scalar_t is.
// NOTE(review): the bare `template` line below looks like it lost its
// parameter list in extraction (the body uses `scalar_t`) - confirm against
// the repository file.
template
__global__ void FilterInterpolationLayer_gpu_forward_kernelfunc(
const int nElement,
const int w, const int h, const int channel, const int filter_size,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3, scalar_t* output
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
// Pixel coordinates handled by this thread (x and y of grid/block are both used).
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// Threads of edge blocks that fall outside the image do nothing.
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// Offset of this batch item in input1 and output (both use input1's strides).
const int off = batch_i * input1_b_stride;
// __syncthreads();
// const float fillvalue =0.0f;
if( withinXbounds && withinYbounds) {
// Offset for this pixel: channel 0 is x, channel 1 is y.
float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i ];
// Target (sub-pixel) position in input1.
float x2 = (float)(w_i) + fx;
float y2 = (float)(h_i) + fy;
// Only filter when the target is inside the image and the offset is
// smaller than half the image extent in each direction.
if(x2 >= 0.0f && y2 >=0.0f && x2 <= (float)(w -1) && y2 <= (float)(h-1)
&& fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){
// Window columns are [ix2_L, ix2_R) and rows are [iy2_T, iy2_B).
int ix2_L = int(x2) + 1 - (int)(filter_size / 2);
int iy2_T = int(y2) + 1 - (int)(filter_size / 2);
int ix2_R = ix2_L + filter_size;
int iy2_B = iy2_T + filter_size;
// Fractional parts of the target position, used as blend weights.
float alpha = x2 - (int)(x2);
float beta = y2 - (int)(y2);
//TODO: here is a bug that if the iy2_B or ix2_R gets out of the border, than there is no enough pixels to warp the target one.
// (Window coordinates outside the image are clamped to the border below,
// so border pixels are sampled repeatedly in that case.)
for (int c_i = 0 ; c_i < channel ; c_i++){
// Top-left quadrant: rows iy2_T..int(y2), columns ix2_L..int(x2).
float TL = 0.0f;
for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){
int _filter_j = min(max(0, filter_j), h - 1);
for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){
int _filter_i = min(max(0, filter_i ), w - 1);
// Clamped coordinates index input1; unclamped ones select the filter tap.
TL += input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;
}
}
// Top-right quadrant: rows iy2_T..int(y2), columns int(x2)+1..ix2_R-1.
float TR = 0.0f;
for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
// Bottom-left quadrant: rows int(y2)+1..iy2_B-1, columns ix2_L..int(x2).
float BL = 0.0f;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
// Bottom-right quadrant: rows int(y2)+1..iy2_B-1, columns int(x2)+1..ix2_R-1.
float BR = 0.0f;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
// Blend the four quadrant sums with bilinear weights from alpha/beta.
output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] =
(1-alpha)*(1-beta)*TL +
alpha*(1-beta)*TR +
(1-alpha)*beta*BL +
alpha*beta*BR;
// Disabled alternative below: a single pass over the window with an
// exponential distance weight instead of the quadrant blend.
// for( int filter_i = ix2_L; filter_i < ix2_R ; filter_i ++ ){
// int _filter_i = min(max(0, filter_i),w - 1);
// output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] +=
// input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] *
// input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] *
//// exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) / (float)(filter_size)); // the distance weight
// exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) ); // the distance weight
//
//// if(w_i == 141 && h_i == 316 && c_i == 0 ){
////printf("gpu: %f, %f,%f,%f\n",input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] ,
////input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i],
////exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) / (float)(filter_size)),
////output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ]
//// );
////}
//
// }
// }
}
} else{
// The target is out of range (or the offset is too large): the output
// pixel is NOT zero-filled, it is a copy of the input1 pixel at (h_i, w_i).
for(int c_i = 0 ; c_i < channel; c_i ++){
output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = input1[off + c_i* input1_c_stride+ h_i * input1_h_stride + w_i];
}
}
}
return ;
}
template
__global__ void FilterInterpolationLayer_gpu_backward_kernelfunc(
const int nElement, const int w, const int h, const int channel, const int filter_size,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2, const scalar_t* __restrict__ input3,
scalar_t* gradoutput, scalar_t* gradinput1, scalar_t* gradinput2, scalar_t* gradinput3
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
const int off = batch_i * input1_b_stride;
// __syncthreads();
if(withinXbounds && withinYbounds){
float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i];
float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i];
float x2 = float(w_i) + fx;
float y2 = float(h_i) + fy;
if(x2 >= 0.0f && y2 >= 0.0f && x2 <= (float)(w - 1) && y2 <= (float)(h -1)
&& fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){
int ix2_L = int(x2) + 1 - (int) (filter_size/2);
int iy2_T = int(y2) + 1 - (int) (filter_size/2);
int ix2_R = ix2_L + filter_size;
int iy2_B = iy2_T + filter_size;
float alpha = x2 - (int)(x2);
float beta = y2 - (int)(y2);
/***
Step 1: calculate the gradients for input1, i.e. the input image;
***/
/***
STEP 3: calculate the gradients for input3, i.e. the filter
***/
/***
Step 1 and Step 3 are simultaneously computed
***/
for (int c_i = 0 ; c_i < channel; c_i++){
float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
float TL_grad = gradoutput_value * (1-alpha ) * (1-beta);
for(int filter_j = iy2_T; filter_j <= (int) (y2) ; filter_j ++ ){
int _filter_j = min(max(0, filter_j), h - 1);
for (int filter_i = ix2_L ; filter_i <= (int)(x2) ; filter_i ++){
int _filter_i = min(max(0, filter_i), w - 1);
atomicAdd( &gradinput1[off +c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
TL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i]);
atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i],
TL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);
}
}
float TR_grad= gradoutput_value * alpha * ( 1- beta);
for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
TR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i]);
atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i],
TR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);
}
}
float BL_grad = gradoutput_value * ( 1 - alpha ) * beta;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
BL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i]);
atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i],
BL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);
}
}
float BR_grad = gradoutput_value * alpha * beta;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
BR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i]);
atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
input3_c_stride + h_i * input3_h_stride + w_i],
BR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);
}
}
// for ( int filter_j = iy2_T; filter_j < iy2_B ; filter_j ++ ){
// int _filter_j = min(max(0, filter_j), h - 1);
// for( int filter_i = ix2_L; filter_i< ix2_R ; filter_i++){
// int _filter_i = min(max(0,filter_i), w - 1);
// atomicAdd( & gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i],
// gradoutput_value *
// input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L))* input3_c_stride + h_i * input3_h_stride + w_i] *
//// exp( -(fabs((float)filter_j - y2) + fabs((float)filter_i - x2))/(float)filter_size)
// exp( -(fabs((float)filter_j - y2) + fabs((float)filter_i - x2)))
//
// );
// }
// }
}
/***
Step 2: calculate the gradients for input2, i.e., the optical flow,
STEP 2.1: for the x/horizonotal direction.
***/
float gamma = 1.0f - beta; //iy2_B - y2;
float bot_diff = 0.0f;
for(int c_i =0 ; c_i< channel; c_i ++ ){
float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
float TL = 0.0f;
for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){
int _filter_j = min(max(0, filter_j), h - 1);
for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){
int _filter_i = min(max(0, filter_i ), w - 1);
TL += input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;
}
}
float TR = 0.0f;
for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
float BL = 0.0f;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
float BR = 0.0f;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
float temp = 0.0f;
temp += gamma * (TR - TL);
temp += (1-gamma) * (BR - BL);
bot_diff += gradoutput_value * temp;
// for( int filter_j = iy2_T; filter_j< iy2_B; filter_j++){
// int _filter_j = min(max(0, filter_j) , h - 1);
// for( int filter_i = ix2_L; filter_i< ix2_R; filter_i ++){
// int _filter_i = min(max(0,filter_i), w-1);
//
// bot_diff +=
// gradoutput_value *
// input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
// input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L))* input3_c_stride + h_i * input3_h_stride + w_i ] *
//// exp( - ( fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))/ (float)filter_size) *
//// ((float) filter_i > x2 ? 1.0f : -1.0f) / (float)filter_size;
// exp( - ( fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))) *
// ((float) filter_i > x2 ? 1.0f : -1.0f);
// }
// }
}
//the gradients of the x direction/ horizontal direction
gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;
/***
STEP 2.2: for the x/horizonotal direction.
***/
gamma = 1.0f - alpha; //ix2_R -x2;
bot_diff = 0.0f;
for(int c_i = 0 ; c_i < channel; c_i ++ ){
float gradoutput_value = gradoutput [ off + c_i * input1_c_stride + h_i * input1_h_stride +w_i];
float TL = 0.0f;
for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){
int _filter_j = min(max(0, filter_j), h - 1);
for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){
int _filter_i = min(max(0, filter_i ), w - 1);
TL += input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;
}
}
float TR = 0.0f;
for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
float BL = 0.0f;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
float BR = 0.0f;
for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
int _filter_j = min(max(0, filter_j),h - 1); // only used for input1
for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
int _filter_i = min(max(0, filter_i),w - 1);// only used for input1
BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
}
}
float temp = 0.0f;
temp += gamma * (BL - TL);
temp += (1.0f - gamma) * ( BR - TR);
bot_diff += gradoutput_value * temp;
// for( int filter_j = iy2_T; filter_j < iy2_B; filter_j ++ ){
// int _filter_j = min(max(0, filter_j), h - 1);
// for( int filter_i = ix2_L; filter_i < ix2_R; filter_i ++){
// int _filter_i = min(max(0, filter_i), w - 1);
//
// bot_diff +=
// gradoutput_value *
// input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
// input3 [batch_i * input3_b_stride +((filter_j - iy2_T) * filter_size + ( filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i ] *
//// exp( - (fabs((float) filter_j - y2) + fabs((float) filter_i - x2))/ (float)filter_size ) *
//// ((float) filter_j > y2 ? 1.0f : - 1.0f ) / (float)filter_size;
// exp( - (fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) ) *
// ((float) filter_j > y2 ? 1.0f : - 1.0f );
// }
// }
}
gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;
/***
STEP 3: calculate the gradients for input3, i.e. the filter
***/
// for(int c_i = 0 ; c_i >>(
nElement, //to let the nummous
w,h,channel,filter_size,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
input1.data(),input2.data(),input3.data(), output.data()
);
}));
// THCudaCheck(cudaGetLastError());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
error = 0;
return error;
}
// Host-side launcher for the backward kernel.
//
// Builds a launch configuration with one thread per pixel
// (BLOCKDIMX x BLOCKDIMY threads per block, one grid layer per batch item),
// dispatches FilterInterpolationLayer_gpu_backward_kernelfunc on the floating
// type of input1, then checks cudaGetLastError().
//
// Parameters:
//   stream            - CUDA stream supplied by the caller.
//   nElement          - forwarded to the kernel unchanged.
//   w, h, channel     - extents forwarded to the kernel; w and h also size the grid.
//   batch             - number of grid layers in z.
//   filter_size       - side length of the square filter window.
//   input*_?_stride   - element strides forwarded to the kernel.
//   input1/2/3        - forward inputs.
//   gradoutput        - gradient w.r.t. the forward output.
//   gradinput1/2/3    - gradient buffers passed to the kernel.
//
// Returns 0 on success, 1 if cudaGetLastError() reports an error after the
// launch (the error string is printed to stdout).
int FilterInterpolationLayer_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w, const int h, const int channel, const int batch, const int filter_size,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
at::Tensor& input1, at::Tensor& input2, at::Tensor& input3,
at::Tensor& gradoutput, at::Tensor& gradinput1, at::Tensor& gradinput2, at::Tensor& gradinput3
)
{
// Pessimistic default: only set to 0 after the launch has been checked.
int error = 1 ;
dim3 grid;
dim3 block;
//blockthread = 128;
//the threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z
//all channels are processed inside one kernel (the kernel loops over `channel`)
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
// Ceil-divide so partially covered edge tiles still get a block.
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// Report non-default block dimensions (or always, when DEBUG is set).
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
// The gradient buffers are not cleared here (memsets below are disabled);
// presumably the caller passes zero-initialised buffers - TODO confirm.
// cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float));
// cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float));
// cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float));
// NOTE(review): the dispatch label below names "DepthFlowProjection", not
// this layer - it looks copied from another module.
// NOTE(review): the launch line (`<<>>`) and the `.data()` calls appear to
// have lost their angle-bracket contents in extraction - confirm against the
// repository file before editing.
AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_backward", ([&] {
FilterInterpolationLayer_gpu_backward_kernelfunc <<>>(
nElement, //to let the nummous
w,h,channel,filter_size,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
input1.data(), input2.data(), input3.data(), gradoutput.data(),
gradinput1.data(), gradinput2.data(), gradinput3.data()
);
}));
// Launch errors are only detectable through cudaGetLastError().
// NOTE(review): the message below mentions "BilinearSampler", which is not
// the name of this layer.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
error = 0;
return error;
}
================================================
FILE: my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Host-side launcher of the forward filter-interpolation kernel
// (defined in filterinterpolation_cuda_kernel.cu).
//
//   stream          - CUDA stream supplied by the caller
//   nElement        - forwarded to the kernel
//   w, h, channel   - extents forwarded to the kernel
//   batch           - number of batch items
//   filter_size     - side length of the square filter window
//   input*_?_stride - element strides of input1/input2/input3
//   input1..input3  - forward inputs
//   output          - forward result
//
// Returns 0 on success and 1 when cudaGetLastError() reports a failure.
int FilterInterpolationLayer_gpu_forward_kernel(
    cudaStream_t stream,
    const int nElement,
    const int w,
    const int h,
    const int channel,
    const int batch,
    const int filter_size,

    const int input1_b_stride,
    const int input1_c_stride,
    const int input1_h_stride,
    const int input1_w_stride,

    const int input2_b_stride,
    const int input2_c_stride,
    const int input2_h_stride,
    const int input2_w_stride,

    const int input3_b_stride,
    const int input3_c_stride,
    const int input3_h_stride,
    const int input3_w_stride,

    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& input3,
    at::Tensor& output
);
// Host-side launcher of the backward filter-interpolation kernel
// (defined in filterinterpolation_cuda_kernel.cu).
//
//   stream          - CUDA stream supplied by the caller
//   nElement        - forwarded to the kernel
//   w, h, channel   - extents forwarded to the kernel
//   batch           - number of batch items (grid depth)
//   filter_size     - side length of the square filter window
//   input*_?_stride - element strides of input1/input2/input3
//   input1..input3  - forward inputs
//   gradoutput      - gradient w.r.t. the forward output
//   gradinput1..3   - gradient buffers for input1..input3
//
// Returns 0 on success and 1 when cudaGetLastError() reports a failure.
int FilterInterpolationLayer_gpu_backward_kernel(
    cudaStream_t stream,
    const int nElement,
    const int w,
    const int h,
    const int channel,
    const int batch,
    const int filter_size,

    const int input1_b_stride,
    const int input1_c_stride,
    const int input1_h_stride,
    const int input1_w_stride,

    const int input2_b_stride,
    const int input2_c_stride,
    const int input2_h_stride,
    const int input2_w_stride,

    const int input3_b_stride,
    const int input3_c_stride,
    const int input3_h_stride,
    const int input3_w_stride,

    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& input3,

    at::Tensor& gradoutput,
    at::Tensor& gradinput1,
    at::Tensor& gradinput2,
    at::Tensor& gradinput3
);
================================================
FILE: my_package/FilterInterpolation/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# Build definition for the `filterinterpolation_cuda` extension: one C++
# binding file plus one CUDA kernel file, compiled with the flags shared
# through `compiler_args`.
_extension = CUDAExtension(
    'filterinterpolation_cuda',
    [
        'filterinterpolation_cuda.cc',
        'filterinterpolation_cuda_kernel.cu',
    ],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='filterinterpolation_cuda',
    ext_modules=[_extension],
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: my_package/FlowProjection/FlowProjectionLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import flowprojection_cuda as my_lib
#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py
class FlowProjectionLayer(Function):
    """Autograd function wrapping the ``flowprojection_cuda`` projection ops.

    Use it through ``FlowProjectionLayer.apply(input1, requires_grad)``.

    ``forward`` hands ``input1`` to the extension together with two freshly
    zeroed float32 buffers: ``count`` (one channel, same batch/height/width as
    ``input1``) and ``output`` (same size as ``input1``). ``backward`` hands
    the saved ``input1`` and ``count`` plus the incoming gradient to the
    extension and returns the gradient for ``input1``.
    """

    def __init__(self, requires_grad):
        super(FlowProjectionLayer, self).__init__()
        # Kept for backward compatibility with callers that instantiate the
        # class; the static methods below do not read it.
        self.requires_grad = requires_grad

    @staticmethod
    def forward(ctx, input1, requires_grad):
        """Run the forward projection.

        Args:
            ctx: autograd context used to stash tensors for ``backward``.
            input1: contiguous 4-D tensor (indexed with sizes 0, 2 and 3).
            requires_grad: when exactly ``False`` the extension is called
                with ``fillhole=1``, otherwise with ``fillhole=0``.

        Returns:
            The ``output`` buffer filled by the extension.
        """
        assert(input1.is_contiguous())

        fillhole = 1 if requires_grad == False else 0

        # Allocate both buffers on the device that holds input1 (CPU or a
        # specific GPU) instead of "the current CUDA device". Previously the
        # CPU branch used `count` before defining it and created `output`
        # on CUDA.
        count = torch.zeros(
            input1.size(0), 1, input1.size(2), input1.size(3),
            dtype=torch.float32, device=input1.device,
        )
        output = torch.zeros(
            input1.size(), dtype=torch.float32, device=input1.device,
        )

        if input1.is_cuda:
            err = my_lib.FlowProjectionLayer_gpu_forward(input1, count, output, fillhole)
        else:
            # NOTE(review): confirm that the extension actually exports a
            # CPU entry point; only the CUDA binding is visible here.
            err = my_lib.FlowProjectionLayer_cpu_forward(input1, count, output, fillhole)
        if err != 0:
            print(err)

        # Exactly these two tensors are unpacked again in backward().
        ctx.save_for_backward(input1, count)
        ctx.fillhole = fillhole

        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Compute the gradient w.r.t. ``input1``.

        Returns:
            ``(gradinput1, None)`` - one entry per ``forward`` argument;
            ``requires_grad`` is not differentiable.
        """
        # forward() saved two tensors; unpacking three raised ValueError.
        input1, count = ctx.saved_tensors

        # forward() insists on contiguous input, so presumably the extension
        # relies on a dense layout; give it a dense gradient as well.
        gradoutput = gradoutput.contiguous()

        gradinput1 = torch.zeros(
            input1.size(), dtype=torch.float32, device=input1.device,
        )
        if input1.is_cuda:
            err = my_lib.FlowProjectionLayer_gpu_backward(input1, count, gradoutput, gradinput1)
        else:
            err = my_lib.FlowProjectionLayer_cpu_backward(input1, count, gradoutput, gradinput1)
        if err != 0:
            print(err)

        return gradinput1, None
class FlowFillholelayer(Function):
    """Forward-only wrapper around the extension's ``FlowFillholelayer_*`` ops.

    Written in the instance-method (non-static) ``Function`` style; no
    ``backward`` is defined.
    """

    def __init__(self):
        super(FlowFillholelayer, self).__init__()

    def forward(self, input1):
        """Call the fill-hole op on ``input1`` and return the result buffer.

        A contiguous copy of ``input1`` and the device index (``-1`` for
        CPU) are cached on ``self``; the extension itself receives the
        original ``input1`` and a zero tensor of the same size.
        """
        on_gpu = input1.is_cuda

        # cached for a potential backward pass
        self.input1 = input1.contiguous()
        self.device = torch.cuda.current_device() if on_gpu else -1

        output = torch.zeros(input1.size())
        if on_gpu:
            output = output.cuda()
            err = my_lib.FlowFillholelayer_gpu_forward(input1, output)
        else:
            err = my_lib.FlowFillholelayer_cpu_forward(input1, output)

        if err != 0:
            print(err)

        # the function returns the output to its caller
        return output
#TODO: if there are multiple outputs of this function, then the order should be well considered?
# def backward(self, gradoutput):
# # print("Backward of Filter Interpolation Layer")
# # gradinput1 = input1.new().zero_()
# # gradinput2 = input2.new().zero_()
# gradinput1 = torch.zeros(self.input1.size())
# if self.input1.is_cuda:
# # print("CUDA backward")
# gradinput1 = gradinput1.cuda(self.device)
# err = my_lib.FlowProjectionLayer_gpu_backward(self.input1, self.count, gradoutput, gradinput1)
# # print(err)
# if err != 0 :
# print(err)
#
# else:
# # print("CPU backward")
# # print(gradoutput)
# err = my_lib.FlowProjectionLayer_cpu_backward(self.input1, self.count, gradoutput, gradinput1)
# # print(err)
# if err != 0:
# print(err)
# # print(gradinput1)
# # print(gradinput2)
#
# # print(gradinput1)
#
# return gradinput1
================================================
FILE: my_package/FlowProjection/FlowProjectionModule.py
================================================
# modules/FlowProjectionModule.py
from torch.nn import Module
from .FlowProjectionLayer import FlowProjectionLayer #, FlowFillholeLayer
class FlowProjectionModule(Module):
    """``nn.Module`` wrapper that owns a ``FlowProjectionLayer`` instance.

    The layer is created once at construction time and every forward call is
    delegated to it unchanged.
    """

    def __init__(self, requires_grad = True):
        """Create the wrapped layer, forwarding ``requires_grad`` to it."""
        super().__init__()
        self.f = FlowProjectionLayer(requires_grad)

    def forward(self, input1):
        """Apply the wrapped layer to ``input1`` and return its result."""
        projected = self.f(input1)
        return projected
# class FlowFillholeModule(Module):
# def __init__(self,hole_value = -10000.0):
# super(FlowFillholeModule, self).__init__()
# self.f = FlowFillholeLayer()
#
# def forward(self, input1):
# return self.f(input1)
# We actually don't need to write backward code for a Module, since autograd relies on the backward defined by the wrapped layer.
================================================
FILE: my_package/FlowProjection/__init__.py
================================================
from .FlowProjectionModule import *
================================================
FILE: my_package/FlowProjection/flowprojection_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "flowprojection_cuda_kernel.cuh"
// Host entry point of the FlowProjection forward pass (bound to Python below).
//
// input1   : flow tensor, must have exactly 2 channels (dimension 1).
// count    : per-pixel hit counter, must be shaped (batch, 1, h, w) like input1.
// output   : receives the projected flow; must share input1's batch and
//            channel strides because the kernels index it with input1's strides.
// fillhole : non-zero to additionally run the hole-filling kernel.
//
// Returns 1 without launching anything when a validation check fails, raises
// through AT_ERROR when the launcher reports a failure, and returns 0 on success.
int FlowProjectionLayer_gpu_forward(
    at::Tensor& input1,
    at::Tensor& count,
    at::Tensor& output,
    int fillhole
    )
{
    int error = 1 ;

    int channel = input1.size( 1);
    if(channel!= 2) return error;
    int batch = input1.size(0);

    // Same shape validation as FlowProjectionLayer_gpu_backward: the forward
    // kernel atomically writes into `count` at (h, w) coordinates derived from
    // input1, so a smaller `count` would be written out of bounds.
    if(count.size(0) != batch) return error;
    if(count.size(1) != 1) return error;

    int h = input1.size(2);
    int w = input1.size(3);
    if(count.size(2) != h) return error;
    if(count.size(3) != w) return error;

    int input1_b_stride = input1.stride(0);
    int input1_c_stride = input1.stride(1);
    int input1_h_stride = input1.stride(2);
    int input1_w_stride = input1.stride(3);

    int count_b_stride = count.stride(0);
    int count_c_stride = count.stride(1);
    int count_h_stride = count.stride(2);
    int count_w_stride = count.stride(3);
    //TODO: do we need to assert the w_stride to be 1
    //if(w_stride !=1) return error;

    // output is addressed with input1's strides inside the kernels.
    if(input1_b_stride != output.stride(0)) return error;
    if(input1_c_stride != output.stride(1)) return error;

    int nElement = 0;//UNUSED  THCudaTensor_nElement(state, output);
    error = FlowProjection_gpu_forward_kernel(
        // at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
        at::cuda::getCurrentCUDAStream(), //works for 1.0.0
        nElement,w,h,channel,batch,fillhole,
        input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
        count_b_stride,count_c_stride,count_h_stride,count_w_stride,
        input1,
        count,
        output);
    if (error) {AT_ERROR("CUDA call failed");}

    return error;
}
// Host entry point of the FlowProjection backward pass (bound to Python below).
//
// Validates that input1 has 2 channels, that count is (batch, 1, h, w) and that
// gradinput1 shares input1's batch/channel strides, then hands everything to
// FlowProjection_gpu_backward_kernel on the current CUDA stream.
//
// Returns 1 when validation fails, raises through AT_ERROR when the launcher
// reports a failure, and returns 0 on success.
int FlowProjectionLayer_gpu_backward(
    at::Tensor& input1,
    at::Tensor& count,
    at::Tensor& gradoutput,
    at::Tensor& gradinput1
    )
{
    const int kInvalidArgument = 1;

    const int channel = input1.size(1);
    if (channel != 2) return kInvalidArgument;

    const int batch = input1.size(0);
    if (count.size(0) != batch || count.size(1) != 1) return kInvalidArgument;

    const int h = input1.size(2);
    const int w = input1.size(3);
    if (count.size(2) != h || count.size(3) != w) return kInvalidArgument;

    const int in_stride_b = input1.stride(0);
    const int in_stride_c = input1.stride(1);
    const int in_stride_h = input1.stride(2);
    const int in_stride_w = input1.stride(3);

    const int cnt_stride_b = count.stride(0);
    const int cnt_stride_c = count.stride(1);
    const int cnt_stride_h = count.stride(2);
    const int cnt_stride_w = count.stride(3);

    // The kernel addresses gradinput1 with input1's strides, so they must agree.
    if (in_stride_b != gradinput1.stride(0) || in_stride_c != gradinput1.stride(1)) {
        return kInvalidArgument;
    }

    const int nElement = 0;  // unused by the kernel, kept for the launcher signature

    const int status = FlowProjection_gpu_backward_kernel(
        at::cuda::getCurrentCUDAStream(),  // PyTorch 1.0.0 API
        nElement,
        w, h, channel, batch,
        in_stride_b, in_stride_c, in_stride_h, in_stride_w,
        cnt_stride_b, cnt_stride_c, cnt_stride_h, cnt_stride_w,
        input1,
        count,
        gradoutput,
        gradinput1
        );
    if (status != 0) { AT_ERROR("CUDA call failed"); }

    return status;
}
// Python bindings for the FlowProjection extension.
// Only the two GPU entry points defined in this file are registered; no CPU
// variants are exported from this translation unit.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("FlowProjectionLayer_gpu_forward", &FlowProjectionLayer_gpu_forward, "FlowProjection forward (CUDA)");
m.def("FlowProjectionLayer_gpu_backward", &FlowProjectionLayer_gpu_backward, "FlowProjection backward (CUDA)");
}
================================================
FILE: my_package/FlowProjection/flowprojection_cuda_kernel.cu
================================================
#include
#include "flowprojection_cuda_kernel.cuh"
#include
#include
#include
#include
#define min(a,b) ((ab)?(a):(b))
#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;
//forward path of our layer
// Forward projection (splatting) kernel: one thread per pixel (w_i, h_i) of the
// batch item selected by blockIdx.z.
//
// Each thread reads the 2-channel flow (fx, fy) at its pixel, computes the
// target position (w_i + fx, h_i + fy) and, when that position lies inside
// [0, w-1] x [0, h-1], atomically adds the NEGATED flow (-fx, -fy) to `output`
// and 1 to `count` at the four integer neighbours of the target.
//
// Notes grounded in the body below:
//  - every neighbour receives the full value; no bilinear weighting is applied;
//  - when the target sits on the last column/row, the clamped right/bottom
//    neighbour coincides with the left/top one, so that cell is hit twice
//    (in both `output` and `count`);
//  - `output` is addressed with input1's strides, and the width index is added
//    directly (`+ w_i`, `+ ix2_L`), i.e. a width stride of 1 is assumed and the
//    *_w_stride / count_c_stride arguments are not used;
//  - nElement and channel are not referenced.
// `output` and `count` are only accumulated into, so they are presumably
// zero-initialised by the caller — TODO confirm.
template
__global__ void FlowProjection_gpu_forward_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
const scalar_t* __restrict__ input1,
scalar_t* count,
scalar_t* output
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
//only the x/y dimensions of the block and the x/y/z dimensions of the grid are used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// threads of the last partial tile may fall outside the image
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// element offset of this batch item inside input1 (and output, which shares its strides)
const int off = batch_i * input1_b_stride;
// __syncthreads();
// const float fillvalue =0.0f;
if( withinXbounds && withinYbounds) {
// flow vector stored at this pixel: channel 0 is x, channel 1 is y
float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ];
float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ];
// position this pixel is projected to
float x2 = (float) (w_i) + fx;
float y2 = (float) (h_i) + fy;
// flows that leave the image contribute nothing
if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){
// integer neighbours of the target; x2/y2 are non-negative so the cast truncates downwards
int ix2_L = (int) (x2);
int iy2_T = (int) (y2);
// clamp so the right/bottom neighbour never leaves the image
int ix2_R = min(ix2_L + 1, w - 1);
int iy2_B = min(iy2_T + 1, h - 1);
// accumulate the negated x flow at the four neighbours (atomic: several sources may hit one target)
atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ,-fx);
atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ],-fx);
atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ,-fx);
atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ],-fx);
// same for the negated y flow
atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] , -fy);
atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] , -fy);
atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] , -fy);
atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] , -fy);
// record one hit per neighbour; used later to average the accumulated flow
atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L], 1);
atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] , 1);
atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] , 1);
atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] , 1);
}
}
return ;
}
// Averaging kernel: one thread per pixel (w_i, h_i) of batch item blockIdx.z.
//
// Divides both channels of `output` at the pixel by the hit count stored in
// `count` when that count is positive; pixels with a count of zero (or less)
// are left untouched.
//
// input1 is not read here; its strides are only used to address `output`.
// As in the other kernels, the width index is added directly (width stride 1
// assumed) and nElement/channel are not referenced.
template
__global__ void FlowProjectionAveraging_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
const scalar_t* __restrict__ input1,
scalar_t* count,
scalar_t* output
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
//only the x/y dimensions of the block and the x/y/z dimensions of the grid are used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// threads of the last partial tile may fall outside the image
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// element offset of this batch item inside output (addressed with input1's strides)
const int off = batch_i * input1_b_stride;
// __syncthreads();
// const float fillvalue =0.0f;
if( withinXbounds && withinYbounds) {
// number of hits accumulated at this pixel
float temp =count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;
// guard against division by zero: pixels nobody projected onto stay as they are
if(temp > 0.0f){
output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;
output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;
}
}
return ;
}
// Hole-filling kernel: one thread per pixel (w_i, h_i) of batch item blockIdx.z.
//
// A pixel is treated as a hole when its entry in `count` is <= 0. For such a
// pixel the thread walks left, right, up and down along its row/column until
// it meets a pixel whose count is non-zero or reaches the image border. The
// hole is then set to the unweighted mean of the `output` values at the
// neighbours that were found (distance is not taken into account). If no
// direction finds anything the pixel is left unchanged.
//
// Notes grounded in the body below:
//  - `count` is only read, so filled pixels keep a count of <= 0;
//  - neighbours that contribute have count > 0 and are therefore never written
//    by this kernel, so their `output` values are stable while being read;
//  - input1 is not read; its strides only address `output`;
//  - the width index is added directly (width stride 1 assumed).
template
__global__ void FlowFillhole_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
const scalar_t* __restrict__ input1,
scalar_t* count,
scalar_t* output
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
//only the x/y dimensions of the block and the x/y/z dimensions of the grid are used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// threads of the last partial tile may fall outside the image
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// element offset of this batch item inside output (addressed with input1's strides)
const int off = batch_i * input1_b_stride;
// __syncthreads();
// const float fillvalue =0.0f;
if( withinXbounds && withinYbounds) {
float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;
// only pixels that received no projection are processed
if(temp <= 0.0f){
//search along the four directions,0/90/180/270, until finding at least one
// walk left along the row; stops at column 0 if nothing is found
int left_offset = w_i; float left_temp = 0.0f;
while(left_temp == 0.0f && left_offset - 1 >= 0){
left_offset = left_offset - 1;
left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ;
}
// walk right along the row; stops at column w-1 if nothing is found
int right_offset = w_i ; float right_temp = 0.0f;
while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){
right_offset = right_offset + 1 ;
right_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ;
}
// walk up along the column; stops at row 0 if nothing is found
int up_offset = h_i ; float up_temp = 0.0f;
while(up_temp == 0.0f && up_offset - 1 >=0){
up_offset = up_offset - 1;
up_temp = count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ;
}
// walk down along the column; stops at row h-1 if nothing is found
int down_offset = h_i; float down_temp = 0.0f;
while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){
down_offset = down_offset + 1;
down_temp = count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ;
}
// no valid neighbour in any direction: leave the hole as it is
if(left_temp + right_temp + up_temp + down_temp <=0.0f){
//printf("Can't fill hole, find no neighbor vectors availabel\n");
return;
}
// turn the counts into 0/1 weights: a direction either contributes or it does not
left_temp = (left_temp > 0.0f)?1:0;
right_temp = (right_temp > 0.0f)?1:0;
up_temp = (up_temp > 0.0f)?1:0;
down_temp = (down_temp > 0.0f)?1:0;
// channel 0 (x flow): mean over the contributing neighbours
output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = (
left_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +
right_temp * output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+
up_temp * output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +
down_temp * output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]
)/(
left_temp + right_temp + up_temp + down_temp
) ;
// channel 1 (y flow): same mean
output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =(
left_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +
right_temp * output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+
up_temp * output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] +
down_temp * output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]
)/(
left_temp + right_temp + up_temp + down_temp
) ;
}
}
return ;
}
// Backward kernel of the flow projection: one thread per SOURCE pixel
// (w_i, h_i) of batch item blockIdx.z.
//
// The thread recomputes the target position and its four integer neighbours
// exactly as the forward kernel does. For each neighbour it adds
// -gradoutput[neighbour] / count[neighbour] to gradinput1 at its own pixel,
// separately for the x channel (0) and the y channel (1).
//
// Notes grounded in the body below:
//  - each thread writes only to its own pixel of gradinput1, so plain `+=`
//    is used rather than atomics;
//  - gradinput1 is only accumulated into, so it is presumably zero-initialised
//    by the caller — TODO confirm;
//  - gradoutput and gradinput1 are addressed with input1's strides, and the
//    width index is added directly (width stride 1 assumed);
//  - the division relies on count being non-zero at the neighbours; the forward
//    kernel increments count at exactly those cells for this source pixel.
//  - only the dependence through the accumulated value is differentiated; the
//    dependence of the target position on the flow is not.
template
__global__ void FlowProjection_gpu_backward_kernelfunc(
const int nElement, const int w, const int h, const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
const scalar_t* __restrict__ input1,
const scalar_t* __restrict__ count,
const scalar_t* __restrict__ gradoutput,
scalar_t* gradinput1
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// threads of the last partial tile may fall outside the image
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// element offset of this batch item inside input1 / gradoutput / gradinput1
const int off = batch_i * input1_b_stride;
// __syncthreads();
if(withinXbounds && withinYbounds){
// flow at this source pixel and the position it was projected to in the forward pass
float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;
float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;
float x2 = (float) ( w_i ) + fx;
float y2 = (float) ( h_i ) + fy;
// same validity test as the forward kernel: pixels projected outside get no gradient
if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){
int ix2_L = (int)(x2);
int iy2_T = (int)(y2);
int ix2_R = min(ix2_L + 1, w-1);
int iy2_B = min(iy2_T + 1, h-1);
// gradient w.r.t. the x flow of this pixel: sum over the four targets it contributed to
int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;
gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/
count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L] ;
gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]/
count[batch_i * count_b_stride +0 + iy2_T * count_h_stride + ix2_R] ;
gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ;
gradinput1[iu_offset ] += - gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/
count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ;
// gradient w.r.t. the y flow of this pixel, same four targets
int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] ;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]/
count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ;
gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/
count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] ;
}
}
return ;
}
// Host-side launcher for the forward pass.
//
// Builds a BLOCKDIMX x BLOCKDIMY thread block and a grid of
// ceil(w/BLOCKDIMX) x ceil(h/BLOCKDIMY) x batch blocks, then runs in order:
//   1. FlowProjection_gpu_forward_kernelfunc  (accumulate flow and hit counts),
//   2. FlowProjectionAveraging_kernelfunc     (divide by the hit counts),
//   3. FlowFillhole_kernelfunc                (only when `fillhole` is non-zero).
// Each launch is dispatched over the floating types of input1 and followed by
// a cudaGetLastError() check.
//
// Returns 0 on success and 1 as soon as one of the checks reports an error.
// NOTE(review): the printed messages mention "BilinearSampler.updateOutput",
// which does not match this layer — looks like a leftover from copied code.
int FlowProjection_gpu_forward_kernel(
cudaStream_t stream, const int nElement,
const int w, const int h, const int channel, const int batch, const int fillhole,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1,
at::Tensor& count,
at::Tensor& output
)
{
// pessimistic default; only set to 0 once every stage has passed its check
int error = 1 ;
dim3 grid;
dim3 block;
// blockthread = 128;
//threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z
//both flow channels are processed by the same thread in one kernel
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
// round up so partial tiles at the right/bottom edge are covered; z indexes the batch
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// report when the tile size was overridden at compile time (or when DEBUG is on)
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
// printf("I am here\n");
//extract the data of CudaTensor and use kernel to calculate.
// stage 1: splat the negated flow and count the hits
AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjection_gpu_forward_kernelfunc", ([&] {
FlowProjection_gpu_forward_kernelfunc<<>>(
nElement, //to let the nummous
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),count.data(),output.data()
);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
// printf("I am there\n");
// stage 2: turn the accumulated sums into averages
AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjectionAveraging_kernelfunc", ([&] {
FlowProjectionAveraging_kernelfunc<<>>(
nElement, //to let the nummous
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),count.data(),output.data()
);
}));
// printf("I am kao\n");
// THCudaCheck(cudaGetLastError());
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
// printf("I am dd\n");
// stage 3 (optional): fill pixels that received no projection from their neighbours
if(fillhole){
// printf("use flow fill hole\n");
AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowFillhole_kernelfunc", ([&] {
FlowFillhole_kernelfunc<<>>(
nElement, //to let the nummous
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),count.data(),output.data()
);
}));
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
return error;
}
}
error = 0;
return error;
}
// Host-side launcher for the backward pass.
//
// Uses the same tiling as the forward launcher (BLOCKDIMX x BLOCKDIMY threads,
// ceil(w/BLOCKDIMX) x ceil(h/BLOCKDIMY) x batch blocks), dispatches
// FlowProjection_gpu_backward_kernelfunc over the floating types of input1 and
// checks cudaGetLastError() afterwards.
//
// Returns 0 on success, 1 when the check reports an error.
// NOTE(review): the printed message mentions "BilinearSampler.updateGradInput",
// which does not match this layer — looks like a leftover from copied code.
int FlowProjection_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1,
at::Tensor& count,
at::Tensor& gradoutput,
at::Tensor& gradinput1
)
{
// pessimistic default; only set to 0 once the launch has passed its check
int error = 1 ;
dim3 grid;
dim3 block;
//blockthread = 128;
//threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z
//both flow channels are processed by the same thread in one kernel
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
// round up so partial tiles at the right/bottom edge are covered; z indexes the batch
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// report when the tile size was overridden at compile time (or when DEBUG is on)
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjection_gpu_backward_kernelfunc", ([&] {
FlowProjection_gpu_backward_kernelfunc <<>>(
nElement, //to let the nummous
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
count_b_stride,count_c_stride,count_h_stride,count_w_stride,
input1.data(),
count.data(),
gradoutput.data(),
gradinput1.data()
);
}));
// printf("gpu I am there\n");
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
// printf("gpu I am here\n");
error = 0;
return error;
}
================================================
FILE: my_package/FlowProjection/flowprojection_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Launches the forward projection kernels (accumulate, average and, when
// `fillhole` is non-zero, hole filling) for a batch of w x h flow maps.
// The stride arguments are element strides of input1 and count; output is
// addressed with input1's strides. Returns 0 on success, 1 on a CUDA error.
int FlowProjection_gpu_forward_kernel(
cudaStream_t stream, const int nElement,
const int w, const int h, const int channel, const int batch, const int fillhole,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1,
at::Tensor& count,
at::Tensor& output
);
// Launches the backward projection kernel, which accumulates into gradinput1
// the gradient of the projected flow with respect to input1, using the hit
// counts produced by the forward pass. gradoutput and gradinput1 are addressed
// with input1's strides. Returns 0 on success, 1 on a CUDA error.
int FlowProjection_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1,
at::Tensor& count,
at::Tensor& gradoutput,
at::Tensor& gradinput1
);
================================================
FILE: my_package/FlowProjection/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# Build configuration for the FlowProjection CUDA extension: one C++ binding
# file plus one CUDA kernel file, compiled with the project-wide flag lists.
_flowprojection_sources = [
    'flowprojection_cuda.cc',
    'flowprojection_cuda_kernel.cu',
]

_flowprojection_extension = CUDAExtension(
    'flowprojection_cuda',
    _flowprojection_sources,
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='flowprojection_cuda',
    ext_modules=[_flowprojection_extension],
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: my_package/Interpolation/InterpolationLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import interpolation_cuda as my_lib
#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py
class InterpolationLayer(Function):
    """Autograd function wrapping the compiled interpolation (warping) kernels.

    ``forward`` produces a tensor shaped like ``input1`` by calling the
    extension with ``input1`` and ``input2``; ``backward`` returns one
    gradient tensor per input. Use it through ``InterpolationLayer.apply``.
    """

    def __init__(self):
        super(InterpolationLayer, self).__init__()

    @staticmethod
    def forward(ctx, input1, input2):
        """Run the extension's forward routine.

        Args:
            ctx: autograd context; both inputs are saved on it for backward.
            input1: contiguous tensor (the GPU binding requires 3 channels).
            input2: contiguous tensor (the GPU binding requires 2 channels and
                the same batch/height/width as ``input1``).

        Returns:
            A tensor with the shape, dtype and device of ``input1``.

        Raises:
            AssertionError: if either input is not contiguous.
            RuntimeError: if the extension reports a non-zero error code.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())

        # Allocate on the input's own device with its dtype. The previous code
        # created an uninitialised CUDA tensor in the CPU branch and always
        # used float32 on the *current* GPU in the CUDA branch.
        output = torch.zeros_like(input1)

        if input1.is_cuda:
            err = my_lib.InterpolationLayer_gpu_forward(input1, input2, output)
        else:
            # NOTE(review): the pybind module in this package only registers
            # the GPU entry points; confirm a CPU build exists before relying
            # on this branch.
            err = my_lib.InterpolationLayer_cpu_forward(input1, input2, output)

        # The binding returns 1 when its shape/stride validation fails; the
        # return value used to be ignored, silently yielding an all-zero output.
        if err != 0:
            raise RuntimeError(
                "InterpolationLayer forward failed with error code %d" % err)

        ctx.save_for_backward(input1, input2)
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Run the extension's backward routine.

        Args:
            ctx: autograd context holding the tensors saved by ``forward``.
            gradoutput: gradient with respect to the forward output.

        Returns:
            Tuple ``(gradinput1, gradinput2)`` shaped like the saved inputs.

        Raises:
            RuntimeError: if the extension reports a non-zero error code.
        """
        input1, input2 = ctx.saved_tensors

        # The kernel indexes gradoutput with input1's strides, so it must have
        # the same (contiguous) layout; autograd does not guarantee that.
        gradoutput = gradoutput.contiguous()

        # The kernels accumulate into these buffers, so they start at zero and
        # live on the same device / use the same dtype as the inputs.
        gradinput1 = torch.zeros_like(input1)
        gradinput2 = torch.zeros_like(input2)

        if input1.is_cuda:
            err = my_lib.InterpolationLayer_gpu_backward(
                input1, input2, gradoutput, gradinput1, gradinput2)
        else:
            # NOTE(review): see forward() — only GPU bindings are registered
            # in the extension source shipped with this package.
            err = my_lib.InterpolationLayer_cpu_backward(
                input1, input2, gradoutput, gradinput1, gradinput2)

        # Returning zero gradients after a failed call would corrupt training
        # without any visible symptom, so fail loudly instead of printing.
        if err != 0:
            raise RuntimeError(
                "InterpolationLayer backward failed with error code %d" % err)

        return gradinput1, gradinput2
================================================
FILE: my_package/Interpolation/InterpolationModule.py
================================================
# modules/InterpolationLayer.py
from torch.nn import Module
from .InterpolationLayer import InterpolationLayer
class InterpolationModule(Module):
    """Stateless ``nn.Module`` front end for ``InterpolationLayer``."""

    def __init__(self):
        """Initialise the base ``Module``; the wrapper holds no state."""
        super().__init__()

    def forward(self, input1, input2):
        """Apply ``InterpolationLayer`` to the two inputs and return the result."""
        warped = InterpolationLayer.apply(input1, input2)
        return warped
# We actually don't need to write backward code for a Module, since autograd relies on the backward defined by the wrapped layer.
================================================
FILE: my_package/Interpolation/__init__.py
================================================
from .InterpolationModule import *
================================================
FILE: my_package/Interpolation/interpolation_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "interpolation_cuda_kernel.cuh"
// Host entry point of the interpolation forward pass (bound to Python below).
//
// Requires input1 to have 3 channels, input2 to be (batch, 2, h, w) matching
// input1, and output to share input1's batch/channel strides. When all checks
// pass, the work is handed to InterpolationLayer_gpu_forward_kernel on the
// current CUDA stream.
//
// Returns 1 when validation fails, raises through AT_ERROR when the launcher
// reports a failure, and returns 0 on success.
int InterpolationLayer_gpu_forward(
    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& output
    )
{
    const int kInvalidArgument = 1;

    const int channel = input1.size(1);
    if (channel != 3) return kInvalidArgument;

    const int batch = input1.size(0);
    if (input2.size(0) != batch || input2.size(1) != 2) return kInvalidArgument;

    const int h = input1.size(2);
    const int w = input1.size(3);
    if (input2.size(2) != h || input2.size(3) != w) return kInvalidArgument;

    const int in1_stride_b = input1.stride(0);
    const int in1_stride_c = input1.stride(1);
    const int in1_stride_h = input1.stride(2);
    const int in1_stride_w = input1.stride(3);

    const int in2_stride_b = input2.stride(0);
    const int in2_stride_c = input2.stride(1);
    const int in2_stride_h = input2.stride(2);
    const int in2_stride_w = input2.stride(3);

    // The kernel addresses output with input1's strides, so they must agree.
    if (in1_stride_b != output.stride(0) || in1_stride_c != output.stride(1)) {
        return kInvalidArgument;
    }

    const int nElement = 0;  // unused by the kernel, kept for the launcher signature

    const int status = InterpolationLayer_gpu_forward_kernel(
        at::cuda::getCurrentCUDAStream(),
        nElement, w, h, channel, batch,
        in1_stride_b, in1_stride_c, in1_stride_h, in1_stride_w,
        in2_stride_b, in2_stride_c, in2_stride_h, in2_stride_w,
        input1,
        input2,
        output);
    if (status != 0) { AT_ERROR("CUDA call failed"); }

    return status;
}
// Host entry point of the interpolation backward pass (bound to Python below).
//
// Applies the same shape checks as the forward entry point and additionally
// requires gradinput1 / gradinput2 to share the batch and channel strides of
// input1 / input2. When all checks pass, the work is handed to
// InterpolationLayer_gpu_backward_kernel on the current CUDA stream.
//
// Returns 1 when validation fails, raises through AT_ERROR when the launcher
// reports a failure, and returns 0 on success.
int InterpolationLayer_gpu_backward(
    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& gradoutput,
    at::Tensor& gradinput1,
    at::Tensor& gradinput2
    )
{
    const int kInvalidArgument = 1;

    const int channel = input1.size(1);
    if (channel != 3) return kInvalidArgument;

    const int batch = input1.size(0);
    if (input2.size(0) != batch || input2.size(1) != 2) return kInvalidArgument;

    const int h = input1.size(2);
    const int w = input1.size(3);
    if (input2.size(2) != h || input2.size(3) != w) return kInvalidArgument;

    const int in1_stride_b = input1.stride(0);
    const int in1_stride_c = input1.stride(1);
    const int in1_stride_h = input1.stride(2);
    const int in1_stride_w = input1.stride(3);

    const int in2_stride_b = input2.stride(0);
    const int in2_stride_c = input2.stride(1);
    const int in2_stride_h = input2.stride(2);
    const int in2_stride_w = input2.stride(3);

    // Gradient buffers are addressed with the strides of the tensors they
    // belong to; checked in the original order (batch strides, then channel).
    if (in1_stride_b != gradinput1.stride(0)) return kInvalidArgument;
    if (in2_stride_b != gradinput2.stride(0)) return kInvalidArgument;
    if (in1_stride_c != gradinput1.stride(1)) return kInvalidArgument;
    if (in2_stride_c != gradinput2.stride(1)) return kInvalidArgument;

    const int nElement = 0;  // unused by the kernel, kept for the launcher signature

    const int status = InterpolationLayer_gpu_backward_kernel(
        at::cuda::getCurrentCUDAStream(),  // PyTorch 1.0.0 API
        nElement,
        w, h, channel, batch,
        in1_stride_b, in1_stride_c, in1_stride_h, in1_stride_w,
        in2_stride_b, in2_stride_c, in2_stride_h, in2_stride_w,
        input1,
        input2,
        gradoutput,
        gradinput1,
        gradinput2
        );
    if (status != 0) { AT_ERROR("CUDA call failed"); }

    return status;
}
// Python bindings for the Interpolation extension.
// Only the two GPU entry points defined in this file are registered; no CPU
// variants are exported from this translation unit.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("InterpolationLayer_gpu_forward", &InterpolationLayer_gpu_forward, "Interpolation forward (CUDA)");
m.def("InterpolationLayer_gpu_backward", &InterpolationLayer_gpu_backward, "Interpolation backward (CUDA)");
}
================================================
FILE: my_package/Interpolation/interpolation_cuda_kernel.cu
================================================
#include
#include "interpolation_cuda_kernel.cuh"
#include
#include
#include
#include
#define min(a,b) ((ab)?(a):(b))
#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;
//forward path of our layer
// Forward warping kernel: one thread per OUTPUT pixel (w_i, h_i) of batch item
// blockIdx.z.
//
// The thread reads the 2-channel offset (fx, fy) from input2 at its pixel,
// samples input1 at (w_i + fx, h_i + fy) with bilinear interpolation over the
// four surrounding pixels and writes the result to `output` for every channel.
// Sampling positions outside [0, w) x [0, h) produce `fillvalue` (0).
//
// Notes grounded in the body below:
//  - the right/bottom neighbour is clamped to the last column/row;
//  - output is addressed with input1's strides;
//  - the width index is added directly (width stride 1 assumed), so the
//    *_w_stride arguments and nElement are not referenced.
template
__global__ void InterpolationLayer_gpu_forward_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const scalar_t* __restrict__ input1,
const scalar_t* __restrict__ input2,
scalar_t* output
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
//only the x/y dimensions of the block and the x/y/z dimensions of the grid are used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// threads of the last partial tile may fall outside the image
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// element offset of this batch item inside input1 (and output, which shares its strides)
const int off = batch_i * input1_b_stride;
// __syncthreads();
// value written when the sampling position is outside the image
const float fillvalue =0.0f;
if( withinXbounds && withinYbounds) {
// sampling offset for this pixel: channel 0 is x, channel 1 is y
float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i ];
// position in input1 to sample from
float x2 = (float)(w_i) + fx;
float y2 = (float)(h_i) + fy;
if(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){
// top-left neighbour; x2/y2 are non-negative so the cast truncates downwards
int ix2_L = int(x2);
int iy2_T = int(y2);
// clamp so the right/bottom neighbour never leaves the image
int ix2_R = min(ix2_L + 1, w - 1);
int iy2_B = min(iy2_T + 1, h - 1);
// fractional parts = bilinear weights towards the right / bottom neighbour
float alpha = x2 - ix2_L;
float beta = y2 - iy2_T;
for(int c_i = 0 ; c_i < channel ; c_i ++){
float TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L];
float TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R];
float BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L];
float BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R];
// standard bilinear blend of the four neighbours
output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] =
(1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR;
}
} else{
//the warping data is out of range, we fill it with zeros
for(int c_i = 0 ; c_i < channel; c_i ++){
output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue;
}
}
}
return ;
}
// Backward warping kernel: one thread per OUTPUT pixel (w_i, h_i) of batch item
// blockIdx.z.
//
// For a pixel whose sampling position (w_i + fx, h_i + fy) lies inside the
// image the thread
//  1. scatters gradoutput at its pixel into gradinput1 at the four sampled
//     neighbours, weighted by the same bilinear weights as in the forward pass
//     (atomicAdd, because several output pixels can sample the same source);
//  2. writes the gradient with respect to the x offset to gradinput2 channel 0
//     and with respect to the y offset to channel 1, each summed over channels.
// Pixels sampling outside the image write nothing, so their gradients keep
// whatever value the caller initialised the buffers with.
//
// Notes grounded in the body below:
//  - gradoutput and gradinput1 are addressed with input1's strides, gradinput2
//    with input2's strides;
//  - the width index is added directly (width stride 1 assumed);
//  - gradinput1 is only accumulated into, so it is presumably zero-initialised
//    by the caller — TODO confirm.
template
__global__ void InterpolationLayer_gpu_backward_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const scalar_t* __restrict__ input1,
const scalar_t* __restrict__ input2,
const scalar_t* __restrict__ gradoutput,
scalar_t* gradinput1,
scalar_t* gradinput2
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// threads of the last partial tile may fall outside the image
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// element offset of this batch item inside input1 / gradoutput / gradinput1
const int off = batch_i * input1_b_stride;
// __syncthreads();
if(withinXbounds && withinYbounds){
// recompute the sampling position used by the forward pass
float fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
float fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];
float x2 = float(w_i) + fx;
float y2 = float(h_i) + fy;
if(x2 >= 0.0f && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){
int ix2_L = int(x2);
int iy2_T = int(y2);
int ix2_R = min(ix2_L+ 1, w - 1);
int iy2_B = min(iy2_T + 1, h - 1);
// bilinear weights towards the right / bottom neighbour
float alpha = x2 - ix2_L;
float beta = y2 - iy2_T;
// gradient w.r.t. input1: distribute gradoutput over the four sampled pixels
for (int c_i = 0 ; c_i < channel; c_i++){
float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);
}
// weight of the top row when blending the horizontal differences
float gamma = iy2_B - y2;
float bot_diff = 0.0f;
for(int c_i =0 ; c_i< channel; c_i ++ ){
float temp = 0;
// horizontal difference (right - left) on the top row ...
temp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -
input1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);
// ... and on the bottom row
temp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -
input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);
float warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];
bot_diff += warped_diff_value * temp ;
}
//the gradients of the x direction/ horizontal direction
gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;
// weight of the left column when blending the vertical differences
gamma = ix2_R- x2;
bot_diff = 0.0f;
for(int c_i = 0 ; c_i < channel;c_i ++ ){
float temp = 0.0f;
// vertical difference (bottom - top) on the left column ...
temp += gamma * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -
input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);
// ... and on the right column
temp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -
input1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);
float warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
bot_diff += warped_diff_value * temp;
}
// the gradients of the y direction / vertical direction
gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;
}
}
return ;
}
// Host-side launcher for the forward bilinear-warp kernel.
//
// Builds a launch grid with one thread per output pixel (BLOCKDIMX x BLOCKDIMY threads per
// block, one grid z-slice per batch item), dispatches on the floating-point type of input1
// and forwards the sizes and strides to InterpolationLayer_gpu_forward_kernelfunc.
//
// Returns 0 when cudaGetLastError() reports success after the launch, -1 otherwise.
//
// NOTE(review): in this copy of the file the launch configuration (`<<>>`) and the template
// arguments of `.data()` look as if their angle-bracket contents were stripped; they are
// reproduced unchanged here - confirm against the repository before building.
int InterpolationLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& output
		)
{
	int error = -1;

	dim3 grid;
	dim3 block;

	// threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z;
	// every thread processes all channels of its pixel.
	block = dim3(BLOCKDIMX,BLOCKDIMY,1);
	// round up so that partially filled blocks still cover the image border
	grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// The dispatch label is what ATen prints when the dtype is unsupported, so it must name
	// this operator (it previously carried a label copied from DepthFlowProjection).
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationLayer_gpu_forward", ([&] {
	InterpolationLayer_gpu_forward_kernelfunc<<>>(
			nElement, // forwarded unchanged to the kernel
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input1.data(),input2.data(),output.data()
			);
	}));

	// A kernel launch does not return a status itself; query the last error instead.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;
}
// Host-side launcher for the backward bilinear-warp kernel.
//
// Uses the same launch grid as the forward pass (one thread per pixel, one grid z-slice per
// batch item), dispatches on the floating-point type of input1 and forwards sizes, strides
// and tensors to InterpolationLayer_gpu_backward_kernelfunc.
//
// The kernel accumulates into gradinput1 with atomicAdd and leaves entries untouched for
// pixels that sample outside the image, so the caller has to pass zero-initialised gradient
// buffers.
//
// Returns 0 when cudaGetLastError() reports success after the launch, -1 otherwise.
//
// NOTE(review): in this copy of the file the launch configuration (`<<>>`) and the template
// arguments of `.data()` look as if their angle-bracket contents were stripped; they are
// reproduced unchanged here - confirm against the repository before building.
int InterpolationLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2
		)
{
	int error = -1;

	dim3 grid;
	dim3 block;

	// threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z;
	// every thread processes all channels of its pixel.
	block = dim3(BLOCKDIMX,BLOCKDIMY,1);
	// round up so that partially filled blocks still cover the image border
	grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// The dispatch label is what ATen prints when the dtype is unsupported, so it must name
	// this operator and this pass (it previously read "DepthFlowProjection_gpu_forward").
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationLayer_gpu_backward", ([&] {
	InterpolationLayer_gpu_backward_kernelfunc <<>>(
			nElement, // forwarded unchanged to the kernel
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input1.data(),
			input2.data(),
			gradoutput.data(),
			gradinput1.data(),
			gradinput2.data()
			);
	}));

	// A kernel launch does not return a status itself; query the last error instead.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;
}
================================================
FILE: my_package/Interpolation/interpolation_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Launches the forward bilinear-warp kernel on `stream`.
//   w, h, channel, batch : sizes of input1
//   input1_*_stride      : element strides of input1 (also used to address output)
//   input2_*_stride      : element strides of input2 (the two-channel offset map)
//   input1, input2       : source tensor and per-pixel offsets
//   output               : receives the warped result
// The definition in this project returns 0 on success and -1 when CUDA reports a launch error.
int InterpolationLayer_gpu_forward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& output
);
// Launches the backward bilinear-warp kernel on `stream`.
//   w, h, channel, batch : sizes of input1
//   input1_*_stride      : element strides of input1 (also used for gradoutput/gradinput1)
//   input2_*_stride      : element strides of input2 (also used for gradinput2)
//   gradoutput           : gradient arriving from the layer output
//   gradinput1/2         : receive the gradients w.r.t. input1 and input2
// The definition in this project returns 0 on success and -1 when CUDA reports a launch error.
int InterpolationLayer_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& gradoutput,
at::Tensor& gradinput1,
at::Tensor& gradinput2
);
================================================
FILE: my_package/Interpolation/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# The single extension built by this script: the pybind wrapper (.cc) plus its CUDA kernels (.cu).
interpolation_extension = CUDAExtension(
    'interpolation_cuda',
    [
        'interpolation_cuda.cc',
        'interpolation_cuda_kernel.cu',
    ],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='interpolation_cuda',
    ext_modules=[interpolation_extension],
    # BuildExtension supplies the build_ext command used for the mixed C++/CUDA sources.
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: my_package/InterpolationCh/InterpolationChLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import interpolationch_cuda as my_lib
#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py
class InterpolationChLayer(Function):
    """Autograd function that warps ``input1`` with the per-pixel offsets in ``input2``.

    ``input1`` is a 4-D tensor and ``input2`` a two-channel offset map with the
    same batch and spatial size; the extension rejects anything else with a
    non-zero status code. Both tensors must be contiguous.

    The work is delegated to the compiled ``interpolationch_cuda`` extension
    (imported as ``my_lib``). Only the GPU entry points are registered by the
    binding source shipped next to this file; the CPU branch is kept for
    builds that provide ``*_cpu_*`` functions.
    """

    def __init__(self, ch):
        super(InterpolationChLayer, self).__init__()
        # Stored for backward compatibility only; forward/backward are static
        # and never read it.
        self.ch = ch

    @staticmethod
    def forward(ctx, input1, input2):
        """Return ``input1`` sampled at the positions shifted by ``input2``.

        Raises:
            RuntimeError: if the extension reports a non-zero status code.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())

        # Allocate the result next to input1 (same device, same dtype) instead
        # of on the current default CUDA device as float32: the kernel is
        # dispatched on input1's dtype and runs on input1's memory.
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.InterpolationChLayer_gpu_forward(input1, input2, output)
        else:
            err = my_lib.InterpolationChLayer_cpu_forward(input1, input2, output)
        if err != 0:
            # A non-zero status means the extension rejected the arguments and
            # left `output` as zeros; do not hand that back silently.
            raise RuntimeError(
                "InterpolationChLayer forward failed with error code {}".format(err))

        ctx.save_for_backward(input1, input2)
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Return the gradients with respect to ``input1`` and ``input2``.

        Raises:
            RuntimeError: if the extension reports a non-zero status code.
        """
        input1, input2 = ctx.saved_tensors

        # The kernel addresses gradoutput with input1's strides, so an expanded
        # or otherwise strided gradient from autograd must be materialised first.
        gradoutput = gradoutput.contiguous()

        # Zero-initialised: the kernel accumulates into gradinput1 and skips
        # pixels that sample outside the image.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())

        if input1.is_cuda:
            err = my_lib.InterpolationChLayer_gpu_backward(
                input1, input2, gradoutput, gradinput1, gradinput2)
        else:
            err = my_lib.InterpolationChLayer_cpu_backward(
                input1, input2, gradoutput, gradinput1, gradinput2)
        if err != 0:
            raise RuntimeError(
                "InterpolationChLayer backward failed with error code {}".format(err))

        return gradinput1, gradinput2
================================================
FILE: my_package/InterpolationCh/InterpolationChModule.py
================================================
# modules/InterpolationLayer.py
from torch.nn import Module
from .InterpolationChLayer import InterpolationChLayer
class InterpolationChModule(Module):
    """``nn.Module`` front end for the ``InterpolationChLayer`` autograd function."""

    def __init__(self, ch):
        super().__init__()
        # Kept as an attribute only; forward() does not read it.
        self.ch = ch

    def forward(self, input1, input2):
        """Run ``InterpolationChLayer`` on ``input1`` and ``input2`` and return its result."""
        warped = InterpolationChLayer.apply(input1, input2)
        return warped
# No backward code is needed in the module itself: InterpolationChLayer defines its own backward().
================================================
FILE: my_package/InterpolationCh/__init__.py
================================================
from .InterpolationChModule import *
================================================
FILE: my_package/InterpolationCh/interpolationch_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "interpolationch_cuda_kernel.cuh"
// Python-facing forward entry point.
// Validates that input2 is a two-channel map with input1's batch and spatial size and that
// output shares input1's batch/channel strides, then hands everything to the CUDA launcher
// on the current stream.
// Returns 1 without launching when a check fails, 0 on success; throws via AT_ERROR when the
// launcher reports a failure.
int InterpolationChLayer_gpu_forward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& output
		)
{
	// Handed back unchanged whenever one of the argument checks below rejects the tensors.
	int status = 1;

	const int n_channel = input1.size(1);
	const int n_batch = input1.size(0);
	// input2 must provide exactly two channels for every batch item of input1
	if (input2.size(0) != n_batch || input2.size(1) != 2) {
		return status;
	}

	const int height = input1.size(2);
	const int width = input1.size(3);
	// ... and must have the same spatial size
	if (input2.size(2) != height || input2.size(3) != width) {
		return status;
	}

	const int in1_b = input1.stride(0);
	const int in1_c = input1.stride(1);
	const int in1_h = input1.stride(2);
	const int in1_w = input1.stride(3);

	const int in2_b = input2.stride(0);
	const int in2_c = input2.stride(1);
	const int in2_h = input2.stride(2);
	const int in2_w = input2.stride(3);

	// The width stride is not verified here (open TODO in the original code).
	// The kernel addresses output with input1's strides, so the layouts have to agree.
	if (in1_b != output.stride(0) || in1_c != output.stride(1)) {
		return status;
	}

	const int n_element = 0; // unused by the kernel

	status = InterpolationChLayer_gpu_forward_kernel(
			at::cuda::getCurrentCUDAStream(), // stream accessor that works for PyTorch 1.0.0
			n_element, width, height, n_channel, n_batch,
			in1_b, in1_c, in1_h, in1_w,
			in2_b, in2_c, in2_h, in2_w,
			input1,
			input2,
			output);

	if (status) {
		AT_ERROR("CUDA call failed");
	}
	return status;
}
// Python-facing backward entry point.
// Validates that input2 is a two-channel map with input1's batch and spatial size and that
// each gradient buffer shares the batch/channel strides of its input, then hands everything
// to the CUDA launcher on the current stream.
// Returns 1 without launching when a check fails, 0 on success; throws via AT_ERROR when the
// launcher reports a failure.
int InterpolationChLayer_gpu_backward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2
		)
{
	// Handed back unchanged whenever one of the argument checks below rejects the tensors.
	int status = 1;

	const int n_channel = input1.size(1);
	const int n_batch = input1.size(0);
	// input2 must provide exactly two channels for every batch item of input1
	if (input2.size(0) != n_batch || input2.size(1) != 2) {
		return status;
	}

	const int height = input1.size(2);
	const int width = input1.size(3);
	// ... and must have the same spatial size
	if (input2.size(2) != height || input2.size(3) != width) {
		return status;
	}

	const int in1_b = input1.stride(0);
	const int in1_c = input1.stride(1);
	const int in1_h = input1.stride(2);
	const int in1_w = input1.stride(3);

	const int in2_b = input2.stride(0);
	const int in2_c = input2.stride(1);
	const int in2_h = input2.stride(2);
	const int in2_w = input2.stride(3);

	// The width stride is not verified here (open TODO in the original code).
	// Batch strides first, then channel strides: each gradient buffer is addressed with the
	// strides of the input it belongs to.
	if (in1_b != gradinput1.stride(0) || in2_b != gradinput2.stride(0)) {
		return status;
	}
	if (in1_c != gradinput1.stride(1) || in2_c != gradinput2.stride(1)) {
		return status;
	}

	const int n_element = 0; // unused by the kernel

	status = InterpolationChLayer_gpu_backward_kernel(
			at::cuda::getCurrentCUDAStream(), // stream accessor that works for PyTorch 1.0.0
			n_element,
			width, height, n_channel, n_batch,
			in1_b, in1_c, in1_h, in1_w,
			in2_b, in2_c, in2_h, in2_w,
			input1,
			input2,
			gradoutput,
			gradinput1,
			gradinput2);

	if (status) {
		AT_ERROR("CUDA call failed");
	}
	return status;
}
// Python bindings: exposes the two GPU entry points defined above under the module name
// supplied by TORCH_EXTENSION_NAME. No CPU functions are registered here.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("InterpolationChLayer_gpu_forward", &InterpolationChLayer_gpu_forward, "InterpolationCh forward (CUDA)");
m.def("InterpolationChLayer_gpu_backward", &InterpolationChLayer_gpu_backward, "InterpolationCh backward (CUDA)");
}
================================================
FILE: my_package/InterpolationCh/interpolationch_cuda_kernel.cu
================================================
#include
#include "interpolationch_cuda_kernel.cuh"
#include
#include
#include
#include
#define min(a,b) ((ab)?(a):(b))
#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;
//forward path of our layer
// Forward pass of the bilinear warp. Each thread handles one output pixel (w_i, h_i) of
// batch item blockIdx.z: it reads the offset (fx, fy) from input2, samples input1 at
// (w_i + fx, h_i + fy) with bilinear interpolation for every channel and stores the result
// in output. Pixels whose sample position falls outside the image are filled with 0.
// output is addressed with input1's strides. nElement and the *_w_stride arguments are not
// read; the width index is added directly.
// NOTE(review): the template parameter list (presumably <typename scalar_t>) appears to be
// missing from the next line in this copy of the file - confirm against the repository.
template
__global__ void InterpolationChLayer_gpu_forward_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const scalar_t* __restrict__ input1,
const scalar_t* __restrict__ input2,
scalar_t* output
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
//x and y of grid/block select the pixel, blockIdx.z selects the batch item
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// the grid is rounded up to whole blocks, so threads past the image edge must do nothing
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// offset of this batch item inside input1 / output
const int off = batch_i * input1_b_stride;
// __syncthreads();
// value written for pixels that sample outside the image
const float fillvalue =0.0f;
if( withinXbounds && withinYbounds) {
// offset of this pixel: channel 0 is the x offset, channel 1 the y offset
float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i ];
// position in input1 that this output pixel samples
float x2 = (float)(w_i) + fx;
float y2 = (float)(h_i) + fy;
if(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){
// integer corners of the bilinear cell; right/bottom are clamped to the last column/row
int ix2_L = int(x2);
int iy2_T = int(y2);
int ix2_R = min(ix2_L + 1, w - 1);
int iy2_B = min(iy2_T + 1, h - 1);
// fractional position inside the cell = bilinear weights
float alpha = x2 - ix2_L;
float beta = y2 - iy2_T;
for(int c_i = 0 ; c_i < channel ; c_i ++){
// the four neighbours: top-left, top-right, bottom-left, bottom-right
float TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L];
float TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R];
float BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L];
float BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R];
output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] =
(1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR;
}
} else{
//the warping data is out of range, we fill it with zeros
for(int c_i = 0 ; c_i < channel; c_i ++){
output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue;
}
}
}
return ;
}
// Backward pass of the bilinear warp. Each thread handles one output pixel (w_i, h_i) of
// batch item blockIdx.z and loops over all channels.
// For a pixel whose sample position (w_i + fx, h_i + fy) lies inside the image the thread
//   * scatters gradoutput into gradinput1 at the four neighbouring source pixels, and
//   * writes the offset gradient into channel 0 (x) and channel 1 (y) of gradinput2.
// Pixels that sample outside the image write nothing, so those entries keep whatever value
// the caller stored in gradinput1/gradinput2 beforehand.
// nElement and the *_w_stride arguments are not read; the width index is added directly.
// NOTE(review): the template parameter list (presumably <typename scalar_t>) appears to be
// missing from the next line in this copy of the file - confirm against the repository.
template
__global__ void InterpolationChLayer_gpu_backward_kernelfunc(
const int nElement,
const int w,
const int h,
const int channel,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const scalar_t* __restrict__ input1,
const scalar_t* __restrict__ input2,
const scalar_t* __restrict__ gradoutput,
scalar_t* gradinput1,
scalar_t* gradinput2
)
{
//blockIdx.z : batch index from 0~B-1
//blockIdx.y : height patch index from ceil(h/16)
//blockIdx.x : width patch index from ceil(w/32)
//threadidx.x: width index 0~31
//threadIdx.y: height index 0~15
//threadIdx.z: Not used
const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
// the grid is rounded up to whole blocks, so threads past the image edge must do nothing
const bool withinXbounds = w_i < w;
const bool withinYbounds = h_i < h;
const int batch_i = blockIdx.z;
// offset of this batch item inside input1 / gradoutput / gradinput1
const int off = batch_i * input1_b_stride;
// __syncthreads();
if(withinXbounds && withinYbounds){
// offset of this pixel: channel 0 is the x offset, channel 1 the y offset
float fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
float fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];
// position in input1 that this output pixel samples
float x2 = float(w_i) + fx;
float y2 = float(h_i) + fy;
if(x2 >= 0.0f && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){
// integer corners of the bilinear cell; right/bottom are clamped to the last column/row
int ix2_L = int(x2);
int iy2_T = int(y2);
int ix2_R = min(ix2_L+ 1, w - 1);
int iy2_B = min(iy2_T + 1, h - 1);
// fractional position inside the cell (same weights as the forward pass)
float alpha = x2 - ix2_L;
float beta = y2 - iy2_T;
// gradient w.r.t. input1: distribute gradoutput over the four corners with the bilinear
// weights; atomicAdd because other threads may target the same source pixel
for (int c_i = 0 ; c_i < channel; c_i++){
float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);
atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);
}
// gradient w.r.t. the x offset: horizontal differences (right - left) of the top and bottom
// rows, blended with gamma (top row) and 1 - gamma (bottom row), summed over channels
float gamma = iy2_B - y2;
float bot_diff = 0.0f;
for(int c_i =0 ; c_i< channel; c_i ++ ){
float temp = 0;
temp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -
input1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);
temp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -
input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);
float warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];
bot_diff += warped_diff_value * temp ;
}
//the gradients of the x direction/ horizontal direction
gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;
// gradient w.r.t. the y offset: vertical differences (bottom - top) of the left and right
// columns, blended with gamma (left column) and 1 - gamma (right column)
gamma = ix2_R- x2;
bot_diff = 0.0f;
for(int c_i = 0 ; c_i < channel;c_i ++ ){
float temp = 0.0f;
temp += gamma * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -
input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);
temp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -
input1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);
float warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
bot_diff += warped_diff_value * temp;
}
//the gradients of the y direction/ vertical direction
gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;
}
}
return ;
}
// Host-side launcher for the forward kernel.
// Builds a launch grid with one thread per output pixel (BLOCKDIMX x BLOCKDIMY threads per
// block, one grid z-slice per batch item), dispatches on the floating-point type of input1
// and forwards sizes, strides and tensors to InterpolationChLayer_gpu_forward_kernelfunc.
// Returns 0 when cudaGetLastError() reports success after the launch, 1 otherwise.
// NOTE(review): the launch configuration (`<<>>`) and the template arguments of `.data()`
// look as if their angle-bracket contents were stripped in this copy - confirm against the
// repository.
int InterpolationChLayer_gpu_forward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& output
)
{
// pessimistic default; only cleared after the launch was checked
int error = 1 ;
dim3 grid;
dim3 block;
// blockthread = 128;
//threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z
//every thread processes all channels of its pixel inside the kernel
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
// round up so that partially filled blocks still cover the image border
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// reports the block size when it was overridden at build time (or when DEBUG is set)
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
//extract the data pointers of the CUDA tensors and run the kernel on them
AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationChLayer_gpu_forward_kernelfunc", ([&] {
InterpolationChLayer_gpu_forward_kernelfunc<<>>(
nElement, //not read by the kernel
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
input1.data(),input2.data(),output.data()
);
}));
// THCudaCheck(cudaGetLastError());
// a kernel launch returns no status itself, so query the last CUDA error
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
error = 0;
return error;
}
// Host-side launcher for the backward kernel.
// Uses the same launch grid as the forward pass (one thread per pixel, one grid z-slice per
// batch item), dispatches on the floating-point type of input1 and forwards sizes, strides
// and tensors to InterpolationChLayer_gpu_backward_kernelfunc.
// The kernel accumulates into gradinput1 and skips pixels that sample outside the image, so
// the gradient buffers are expected to arrive zero-initialised.
// Returns 0 when cudaGetLastError() reports success after the launch, 1 otherwise.
// NOTE(review): the launch configuration (`<<>>`) and the template arguments of `.data()`
// look as if their angle-bracket contents were stripped in this copy - confirm against the
// repository.
int InterpolationChLayer_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& gradoutput,
at::Tensor& gradinput1,
at::Tensor& gradinput2
)
{
// pessimistic default; only cleared after the launch was checked
int error = 1 ;
dim3 grid;
dim3 block;
//blockthread = 128;
//threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z
//every thread processes all channels of its pixel inside the kernel
block = dim3(BLOCKDIMX,BLOCKDIMY,1);
// round up so that partially filled blocks still cover the image border
grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
// reports the block size when it was overridden at build time (or when DEBUG is set)
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationChLayer_gpu_backward_kernelfunc", ([&] {
InterpolationChLayer_gpu_backward_kernelfunc <<>>(
nElement, //not read by the kernel
w,h,channel,
input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
input1.data(),
input2.data(),
gradoutput.data(),
gradinput1.data(),
gradinput2.data()
);
}));
// a kernel launch returns no status itself, so query the last CUDA error
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
//THError("aborting");
return error;
}
error = 0;
return error;
}
================================================
FILE: my_package/InterpolationCh/interpolationch_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Launches the forward bilinear-warp kernel on `stream`.
//   w, h, channel, batch : sizes of input1
//   input1_*_stride      : element strides of input1 (also used to address output)
//   input2_*_stride      : element strides of input2 (the two-channel offset map)
//   input1, input2       : source tensor and per-pixel offsets
//   output               : receives the warped result
// The definition in interpolationch_cuda_kernel.cu returns 0 on success and 1 when CUDA
// reports a launch error.
int InterpolationChLayer_gpu_forward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& output
);
// Launches the backward bilinear-warp kernel on `stream`.
//   w, h, channel, batch : sizes of input1
//   input1_*_stride      : element strides of input1 (also used for gradoutput/gradinput1)
//   input2_*_stride      : element strides of input2 (also used for gradinput2)
//   gradoutput           : gradient arriving from the layer output
//   gradinput1/2         : receive the gradients w.r.t. input1 and input2
// The definition in interpolationch_cuda_kernel.cu returns 0 on success and 1 when CUDA
// reports a launch error.
int InterpolationChLayer_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& gradoutput,
at::Tensor& gradinput1,
at::Tensor& gradinput2
);
================================================
FILE: my_package/InterpolationCh/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# The single extension built by this script: the pybind wrapper (.cc) plus its CUDA kernels (.cu).
interpolationch_extension = CUDAExtension(
    'interpolationch_cuda',
    [
        'interpolationch_cuda.cc',
        'interpolationch_cuda_kernel.cu',
    ],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='interpolationch_cuda',
    ext_modules=[interpolationch_extension],
    # BuildExtension supplies the build_ext command used for the mixed C++/CUDA sources.
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: my_package/MinDepthFlowProjection/__init__.py
================================================
from .minDepthFlowProjectionModule import *
================================================
FILE: my_package/MinDepthFlowProjection/minDepthFlowProjectionLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
#import _ext.my_lib as my_lib
import mindepthflowprojection_cuda as my_lib
class minDepthFlowProjectionLayer(Function):
    """Autograd function wrapping the ``mindepthflowprojection_cuda`` extension.

    ``input1`` is a 4-D tensor with two channels and ``input2`` a 4-D tensor
    with one channel; the extension rejects anything else with a non-zero
    status code. Both tensors must be contiguous.

    Only the GPU entry points are registered by the binding source shipped
    next to this file; the CPU branch is kept for builds that provide
    ``*_cpu_*`` functions.
    """

    def __init__(self, requires_grad):
        # The argument is accepted for backward compatibility only;
        # forward/backward are static and never read instance state.
        super(minDepthFlowProjectionLayer, self).__init__()

    @staticmethod
    def forward(ctx, input1, input2, requires_grad):
        """Project ``input1`` using ``input2`` and return the projected tensor.

        ``requires_grad == False`` turns on the extension's ``fillhole`` mode.

        Raises:
            RuntimeError: if the extension reports a non-zero status code.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())

        fillhole = 1 if requires_grad == False else 0

        # Allocate the work buffers next to input1 (same device, same dtype)
        # instead of on the current default CUDA device as float32.
        count = input1.new_zeros(
            (input1.size(0), 1, input1.size(2), input1.size(3)))
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.minDepthFlowProjectionLayer_gpu_forward(
                input1, input2, count, output, fillhole)
        else:
            err = my_lib.minDepthFlowProjectionLayer_cpu_forward(
                input1, input2, count, output, fillhole)
        if err != 0:
            # A non-zero status means the extension rejected the arguments and
            # left `output` as zeros; do not hand that back silently.
            raise RuntimeError(
                "minDepthFlowProjectionLayer forward failed with error code {}".format(err))

        # count and output are needed again by the backward kernel.
        ctx.save_for_backward(input1, input2, count, output)
        ctx.fillhole = fillhole
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Return the gradients for ``input1`` and ``input2`` (``None`` for ``requires_grad``).

        Raises:
            RuntimeError: if the extension reports a non-zero status code.
        """
        input1, input2, count, output = ctx.saved_tensors

        # Autograd may deliver an expanded or otherwise strided gradient; the
        # extension receives raw tensors, so materialise it first.
        gradoutput = gradoutput.contiguous()

        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())

        if input1.is_cuda:
            err = my_lib.minDepthFlowProjectionLayer_gpu_backward(
                input1, input2, count, output,
                gradoutput, gradinput1, gradinput2)
        else:
            err = my_lib.minDepthFlowProjectionLayer_cpu_backward(
                input1, input2, count, output,
                gradoutput, gradinput1, gradinput2)
        if err != 0:
            raise RuntimeError(
                "minDepthFlowProjectionLayer backward failed with error code {}".format(err))

        # One entry per forward() argument; requires_grad is not differentiable.
        return gradinput1, gradinput2, None
================================================
FILE: my_package/MinDepthFlowProjection/minDepthFlowProjectionModule.py
================================================
# modules/FlowProjectionModule.py
from torch.nn.modules.module import Module
from .minDepthFlowProjectionLayer import minDepthFlowProjectionLayer #, FlowFillholeLayer
__all__ =['minDepthFlowProjectionModule']
class minDepthFlowProjectionModule(Module):
    """``nn.Module`` front end for the ``minDepthFlowProjectionLayer`` autograd function."""

    def __init__(self, requires_grad = True):
        super().__init__()
        # Passed to the autograd function on every forward() call.
        self.requires_grad = requires_grad

    def forward(self, input1, input2):
        """Run ``minDepthFlowProjectionLayer`` on the inputs with the stored ``requires_grad``."""
        projected = minDepthFlowProjectionLayer.apply(input1, input2, self.requires_grad)
        return projected
# class FlowFillholeModule(Module):
# def __init__(self,hole_value = -10000.0):
# super(FlowFillholeModule, self).__init__()
# self.f = FlowFillholeLayer()
#
# def forward(self, input1):
# return self.f(input1)
# No backward code is needed in the module itself: minDepthFlowProjectionLayer defines its own backward().
================================================
FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "mindepthflowprojection_cuda_kernel.cuh"
// Python-facing forward entry point.
// Requires input1 to have two channels and input2 one channel, and output to share input1's
// batch/channel strides; then hands sizes, strides and tensors to the CUDA launcher on the
// current stream.
// Returns 1 without launching when a check fails, 0 on success; throws via AT_ERROR when the
// launcher reports a failure.
int minDepthFlowProjectionLayer_gpu_forward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& count,
		at::Tensor& output,
		int fillhole
		)
{
	// Handed back unchanged whenever one of the argument checks below rejects the tensors.
	int status = 1;

	const int n_channel = input1.size(1);
	if (n_channel != 2) {
		return status;
	}
	const int n_batch = input1.size(0);
	const int height = input1.size(2);
	const int width = input1.size(3);

	if (input2.size(1) != 1) {
		return status;
	}

	const int in1_b = input1.stride(0);
	const int in1_c = input1.stride(1);
	const int in1_h = input1.stride(2);
	const int in1_w = input1.stride(3);

	const int in2_b = input2.stride(0);
	const int in2_c = input2.stride(1);
	const int in2_h = input2.stride(2);
	const int in2_w = input2.stride(3);

	const int cnt_b = count.stride(0);
	const int cnt_c = count.stride(1);
	const int cnt_h = count.stride(2);
	const int cnt_w = count.stride(3);

	// The width stride is not verified here (open TODO in the original code).
	// output has to share input1's batch and channel layout.
	if (in1_b != output.stride(0) || in1_c != output.stride(1)) {
		return status;
	}

	const int n_element = 0; // placeholder, marked UNUSED in the original code

	status = minDepthFlowProjection_gpu_forward_kernel(
			at::cuda::getCurrentCUDAStream(), // stream accessor that works for PyTorch 1.0.0
			n_element, width, height, n_channel, n_batch, fillhole,
			in1_b, in1_c, in1_h, in1_w,
			in2_b, in2_c, in2_h, in2_w,
			cnt_b, cnt_c, cnt_h, cnt_w,
			input1,
			input2,
			count,
			output);

	if (status) {
		AT_ERROR("CUDA call failed");
	}
	return status;
}
// Python-facing backward entry point.
// Requires input1 to have two channels, input2 one channel, count to be a one-channel map
// with input1's batch and spatial size, and gradinput1 to share input1's batch/channel
// strides; then hands sizes, strides and tensors to the CUDA launcher on the current stream.
// Returns 1 without launching when a check fails, 0 on success; throws via AT_ERROR when the
// launcher reports a failure.
int minDepthFlowProjectionLayer_gpu_backward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& count,
		at::Tensor& output,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2
		)
{
	// Handed back unchanged whenever one of the argument checks below rejects the tensors.
	int status = 1;

	const int n_channel = input1.size(1);
	if (n_channel != 2) {
		return status;
	}
	const int n_batch = input1.size(0);
	// count: one channel per batch item of input1
	if (count.size(0) != n_batch || count.size(1) != 1) {
		return status;
	}

	const int height = input1.size(2);
	const int width = input1.size(3);
	// input2 is single-channel and count covers the same pixels as input1
	if (input2.size(1) != 1 || count.size(2) != height || count.size(3) != width) {
		return status;
	}

	const int in1_b = input1.stride(0);
	const int in1_c = input1.stride(1);
	const int in1_h = input1.stride(2);
	const int in1_w = input1.stride(3);

	const int in2_b = input2.stride(0);
	const int in2_c = input2.stride(1);
	const int in2_h = input2.stride(2);
	const int in2_w = input2.stride(3);

	const int cnt_b = count.stride(0);
	const int cnt_c = count.stride(1);
	const int cnt_h = count.stride(2);
	const int cnt_w = count.stride(3);

	// The width stride is not verified here (open TODO in the original code).
	// gradinput1 has to share input1's batch and channel layout.
	if (in1_b != gradinput1.stride(0) || in1_c != gradinput1.stride(1)) {
		return status;
	}

	const int n_element = 0; // placeholder, marked UNUSED in the original code

	status = minDepthFlowProjection_gpu_backward_kernel(
			at::cuda::getCurrentCUDAStream(), // stream accessor that works for PyTorch 1.0.0
			n_element,
			width, height, n_channel, n_batch,
			in1_b, in1_c, in1_h, in1_w,
			in2_b, in2_c, in2_h, in2_w,
			cnt_b, cnt_c, cnt_h, cnt_w,
			input1,
			input2,
			count,
			output,
			gradoutput,
			gradinput1,
			gradinput2);

	if (status) {
		AT_ERROR("CUDA call failed");
	}
	return status;
}
// Python bindings: registers the forward and backward host wrappers on module
// `m` (named by TORCH_EXTENSION_NAME) under the same names they have in C++.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("minDepthFlowProjectionLayer_gpu_forward", &minDepthFlowProjectionLayer_gpu_forward, "minDepthFlowProjection forward (CUDA)");
m.def("minDepthFlowProjectionLayer_gpu_backward", &minDepthFlowProjectionLayer_gpu_backward, "minDepthFlowProjection backward (CUDA)");
}
================================================
FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cu
================================================
#include
#include "mindepthflowprojection_cuda_kernel.cuh"
#include
#include
#include
#include
#define min(a,b) ((ab)?(a):(b))
#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;
//forward path of our layer
// Forward kernel of the min-depth flow projection layer.
//
// Thread mapping: blockIdx.z is the batch index, and
// (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y)
// is the (column, row) of the source pixel handled by the thread.
//
// Each in-range thread reads its flow vector (fx, fy) from the two channels of
// input1 and follows it to (w_i + fx, h_i + fy). If that point lies inside the
// image it is truncated to the integer pixel (ix2_L, iy2_T). When the thread's
// input2 value is greater than the value currently stored in `count` at that
// pixel, the thread writes (-fx, -fy) into `output` there and stores its
// input2 value in `count`. Only this one target pixel is updated; the right
// and bottom neighbours are not touched.
//
// Addressing: every offset has the form `base + row * h_stride + column`, so a
// unit width stride is assumed for all tensors, and `output` is addressed with
// input1's strides. nElement, channel and the *_c/_w strides of input2/count
// are accepted for interface symmetry but not used here.
//
// NOTE(review): the read-compare-write on `count` / `output` is not atomic.
// When several source pixels project onto the same target pixel the result
// may depend on thread scheduling -- confirm this is acceptable.
template <typename scalar_t>
__global__ void minDepthFlowProjection_gpu_forward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
		const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
		scalar_t* count,
		scalar_t* output
		)
{
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks; surplus threads do nothing.
	if (w_i >= w || h_i >= h) {
		return;
	}

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;

	float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i];
	float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i];

	float x2 = (float) (w_i) + fx;
	float y2 = (float) (h_i) + fy;

	// Skip vectors that leave the image (the positive form of the test also
	// rejects NaN coordinates).
	if (!(x2 >= 0.0f && y2 >= 0.0f && x2 <= (float) (w - 1) && y2 <= (float) (h - 1))) {
		return;
	}

	const int ix2_L = (int) (x2);
	const int iy2_T = (int) (y2);

	const float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];
	const int target = batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L;

	const float old_exist = count[target];
	if (temp > old_exist) {
		output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] = - fx; // new best vector
		output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] = - fy;
		count[target] = temp; // remember the winning value
	}
	return ;
}
// Hole-filling kernel, run after the forward projection when requested.
//
// Thread mapping is the same as in the projection kernel: blockIdx.z is the
// batch index and each thread owns one pixel (w_i, h_i).
//
// A pixel is treated as a hole when its `count` value is <= 0. For a hole the
// thread walks left, right, up and down from the pixel until it meets a
// non-zero `count` value or the image border. If the four values found sum to
// <= 0 the pixel is left untouched. Otherwise both channels of `output` at the
// hole are set to the unweighted mean of the `output` vectors at those
// neighbours whose `count` value is > 0.
//
// Addressing assumes a unit width stride, and `output` is addressed with
// input1's strides. input1, input2, nElement and channel are not read here.
template <typename scalar_t>
__global__ void minDepthFlowFillhole_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
		const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
		scalar_t* count,
		scalar_t* output
		)
{
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks; surplus threads do nothing.
	if (w_i >= w || h_i >= h) {
		return;
	}

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;
	const int count_off = batch_i * count_b_stride + 0;

	const float temp = count[count_off + h_i * count_h_stride + w_i];
	// Only holes are processed (a NaN count is not treated as a hole).
	if (!(temp <= 0.0f)) {
		return;
	}

	// Search along the four directions (0/90/180/270 degrees) until a non-zero
	// count is found or the border is reached.
	int left_offset = w_i;
	float left_temp = 0.0f;
	while (left_temp == 0.0f && left_offset - 1 >= 0) {
		left_offset = left_offset - 1;
		left_temp = count[count_off + h_i * count_h_stride + left_offset];
	}

	int right_offset = w_i;
	float right_temp = 0.0f;
	while (right_temp == 0.0f && right_offset + 1 <= w - 1) {
		right_offset = right_offset + 1;
		right_temp = count[count_off + h_i * count_h_stride + right_offset];
	}

	int up_offset = h_i;
	float up_temp = 0.0f;
	while (up_temp == 0.0f && up_offset - 1 >= 0) {
		up_offset = up_offset - 1;
		up_temp = count[count_off + up_offset * count_h_stride + w_i];
	}

	int down_offset = h_i;
	float down_temp = 0.0f;
	while (down_temp == 0.0f && down_offset + 1 <= h - 1) {
		down_offset = down_offset + 1;
		down_temp = count[count_off + down_offset * count_h_stride + w_i];
	}

	// No usable neighbour in any direction: the hole cannot be filled.
	if (left_temp + right_temp + up_temp + down_temp <= 0.0f) {
		return;
	}

	// Turn the counts into 0/1 weights so that valid neighbours contribute equally.
	left_temp = (left_temp > 0.0f) ? 1 : 0;
	right_temp = (right_temp > 0.0f) ? 1 : 0;
	up_temp = (up_temp > 0.0f) ? 1 : 0;
	down_temp = (down_temp > 0.0f) ? 1 : 0;

	// Channel 0 first, then channel 1.
	for (int c = 0; c < 2; ++c) {
		const int plane = off + c * input1_c_stride;
		output[plane + h_i * input1_h_stride + w_i] = (
			left_temp * output[plane + h_i * input1_h_stride + left_offset] +
			right_temp * output[plane + h_i * input1_h_stride + right_offset] +
			up_temp * output[plane + up_offset * input1_h_stride + w_i] +
			down_temp * output[plane + down_offset * input1_h_stride + w_i]
		) / (
			left_temp + right_temp + up_temp + down_temp
		);
	}
	return ;
}
// Backward kernel of the min-depth flow projection layer.
//
// Thread mapping: blockIdx.z is the batch index and each thread owns one
// source pixel (w_i, h_i), exactly as in the forward kernel.
//
// Each in-range thread recomputes the point (w_i + fx, h_i + fy) its flow
// vector lands on. If the point lies inside the image, the four integer
// pixels around it (top-left, top-right, bottom-left, bottom-right, with the
// right/bottom indices clamped to the image) are visited in that order. For
// every one of them whose `count` value equals the thread's input2 value, the
// negated gradoutput vector at that pixel is added to gradinput1 at the
// thread's own pixel.
//
// gradinput2 and output are never accessed here, so no gradient is produced
// for input2. Addressing assumes a unit width stride; gradoutput and
// gradinput1 are addressed with input1's strides.
//
// NOTE(review): the forward kernel only writes the top-left target pixel,
// while this kernel also matches the other three neighbours by value
// equality -- confirm that this asymmetry is intended.
template <typename scalar_t>
__global__ void minDepthFlowProjection_gpu_backward_kernelfunc(
		const int nElement, const int w, const int h, const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
		const scalar_t* __restrict__ input1, const scalar_t* __restrict__ input2,
		scalar_t* count,
		scalar_t* output,
		const scalar_t* __restrict__ gradoutput,
		scalar_t* gradinput1,
		scalar_t* gradinput2
		)
{
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks; surplus threads do nothing.
	if (w_i >= w || h_i >= h) {
		return;
	}

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;

	float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i];
	float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i];

	float x2 = (float) (w_i) + fx;
	float y2 = (float) (h_i) + fy;

	// Vectors leaving the image contributed nothing in the forward pass.
	if (!(x2 >= 0.0f && y2 >= 0.0f && x2 <= (float) (w - 1) && y2 <= (float) (h - 1))) {
		return;
	}

	const int ix2_L = (int) (x2);
	const int iy2_T = (int) (y2);
	const int ix2_R = min(ix2_L + 1, w - 1);
	const int iy2_B = min(iy2_T + 1, h - 1);

	const float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];

	// Offsets of this thread's own u / v gradient entries.
	const int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;
	const int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;

	// Visit order: (T,L), (T,R), (B,L), (B,R).
	const int rows[2] = { iy2_T, iy2_B };
	const int cols[2] = { ix2_L, ix2_R };
	#pragma unroll
	for (int k = 0; k < 4; ++k) {
		const int ty = rows[k >> 1];
		const int tx = cols[k & 1];
		if (temp == count[batch_i * count_b_stride + 0 + ty * count_h_stride + tx]) {
			gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + ty * input1_h_stride + tx];
			gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + ty * input1_h_stride + tx];
		}
	}
	return ;
}
// Launches the forward projection kernel and, when `fillhole` is non-zero,
// the hole-filling kernel afterwards.
//
//   stream      : CUDA stream both kernels are enqueued on.
//   nElement    : forwarded to the kernels (the host wrapper passes 0).
//   w, h        : image width and height; one thread is launched per pixel.
//   channel     : forwarded to the kernels.
//   batch       : number of batch items; becomes the z extent of the grid.
//   fillhole    : non-zero to run the hole-filling kernel after the projection.
//   *_stride    : element strides of input1, input2 and count.
//   input1, input2, count, output : tensors whose data pointers are handed to
//                 the kernels; the element type is dispatched on input1.
//
// Returns 0 on success and -1 if cudaGetLastError reports an error after
// either launch (the error string is printed).
int minDepthFlowProjection_gpu_forward_kernel(
		cudaStream_t stream, const int nElement,
		const int w, const int h, const int channel, const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
		at::Tensor& input1, at::Tensor& input2,
		at::Tensor& count,
		at::Tensor& output
		)
{
	int error = -1;

	// One thread per pixel: BLOCKDIMX x BLOCKDIMY threads per block, enough
	// blocks to cover w x h (rounded up), one grid layer per batch item.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// Project every flow vector onto its target pixel.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowProjection_gpu_forward", ([&] {
		minDepthFlowProjection_gpu_forward_kernelfunc<scalar_t><<<grid, block, 0, stream>>>(
			nElement,
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,
			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
			);
	}));

	// cudaGetLastError also clears the error state, so one check per launch
	// is sufficient.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in minDepthFlowProjection forward: %s\n", cudaGetErrorString(err));
		return error;
	}

	if(fillhole){
		// Fill pixels that received no vector from their nearest valid neighbours.
		AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowFillhole", ([&] {
			minDepthFlowFillhole_kernelfunc<scalar_t><<<grid, block, 0, stream>>>(
				nElement,
				w,h,channel,
				input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
				input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
				count_b_stride,count_c_stride,count_h_stride,count_w_stride,
				input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
				);
		}));

		err = cudaGetLastError();
		if (err != cudaSuccess) {
			printf("gpuerror in minDepthFlowProjection fillhole: %s\n", cudaGetErrorString(err));
			return error;
		}
	}

	error = 0;
	return error;
}
// Launches the backward kernel of the min-depth flow projection layer.
//
//   stream      : CUDA stream the kernel is enqueued on.
//   nElement    : forwarded to the kernel (the host wrapper passes 0).
//   w, h        : image width and height; one thread is launched per pixel.
//   channel     : forwarded to the kernel.
//   batch       : number of batch items; becomes the z extent of the grid.
//   *_stride    : element strides of input1, input2 and count.
//   input1, input2, count, output, gradoutput, gradinput1, gradinput2 :
//                 tensors whose data pointers are handed to the kernel; the
//                 element type is dispatched on input1.
//
// Returns 0 on success and -1 if cudaGetLastError reports an error after the
// launch (the error string is printed).
int minDepthFlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
		at::Tensor& input1, at::Tensor& input2,
		at::Tensor& count, at::Tensor& output,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2
		)
{
	int error = -1;

	// One thread per pixel: BLOCKDIMX x BLOCKDIMY threads per block, enough
	// blocks to cover w x h (rounded up), one grid layer per batch item.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowProjection_gpu_backward", ([&] {
		minDepthFlowProjection_gpu_backward_kernelfunc<scalar_t><<<grid, block, 0, stream>>>(
			nElement,
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,
			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>(),
			gradoutput.data<scalar_t>(), gradinput1.data<scalar_t>(), gradinput2.data<scalar_t>()
			);
	}));

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in minDepthFlowProjection backward: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;
}
================================================
FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cuh
================================================
#pragma once
#include
#include
#include
// Declaration of the forward launcher implemented in the matching .cu file.
//   stream   : CUDA stream supplied by the caller (the host wrapper passes the
//              current stream).
//   nElement : element count argument; the host wrapper passes 0 and marks it
//              as unused.
//   w, h, channel, batch : dimensions taken from input1 by the host wrapper.
//   fillhole : non-zero requests the additional hole-filling pass.
//   *_stride : per-dimension strides (batch, channel, height, width) of
//              input1, input2 and count.
// Returns 0 on success and -1 when a CUDA error is detected after a launch.
int minDepthFlowProjection_gpu_forward_kernel(
cudaStream_t stream, const int nElement,
const int w, const int h, const int channel, const int batch, const int fillhole,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1, at::Tensor& input2,
at::Tensor& count,
at::Tensor& output
);
// Declaration of the backward launcher implemented in the matching .cu file.
//   stream   : CUDA stream supplied by the caller (the host wrapper passes the
//              current stream).
//   nElement : element count argument; the host wrapper passes 0 and marks it
//              as unused.
//   w, h, channel, batch : dimensions taken from input1 by the host wrapper.
//   *_stride : per-dimension strides (batch, channel, height, width) of
//              input1, input2 and count.
//   gradoutput            : incoming gradient for `output`.
//   gradinput1/gradinput2 : gradient buffers for input1 and input2.
// Returns 0 on success and -1 when a CUDA error is detected after the launch.
int minDepthFlowProjection_gpu_backward_kernel(
cudaStream_t stream,
const int nElement,
const int w,
const int h,
const int channel,
const int batch,
const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,
at::Tensor& input1,
at::Tensor& input2,
at::Tensor& count,
at::Tensor& output,
at::Tensor& gradoutput,
at::Tensor& gradinput1,
at::Tensor& gradinput2
);
================================================
FILE: my_package/MinDepthFlowProjection/setup.py
================================================
#!/usr/bin/env python3
import os
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from compiler_args import nvcc_args, cxx_args
# Build script for the MinDepthFlowProjection CUDA extension: one C++ binding
# file plus one CUDA kernel file, compiled with the shared compiler flags.
_sources = [
    'mindepthflowprojection_cuda.cc',
    'mindepthflowprojection_cuda_kernel.cu',
]
_compile_args = {'cxx': cxx_args, 'nvcc': nvcc_args}

setup(
    name='mindepthflowprojection_cuda',
    ext_modules=[
        CUDAExtension(
            'mindepthflowprojection_cuda',
            _sources,
            extra_compile_args=_compile_args,
        ),
    ],
    # BuildExtension drives the mixed C++/CUDA compilation.
    cmdclass={'build_ext': BuildExtension},
)
================================================
FILE: my_package/SeparableConv/SeparableConvLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import _ext.my_lib as my_lib
#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py
class SeparableConvLayer(Function):
    """Autograd function wrapping the compiled separable-convolution kernels.

    The layer is created for a fixed ``filtersize``. ``forward`` takes an
    input tensor and two filter tensors and returns a tensor whose height and
    width are ``input - filtersize + 1``; ``backward`` returns one gradient
    per input. The numeric work is delegated to ``my_lib``.
    """

    def __init__(self, filtersize):
        self.filtersize = filtersize
        super(SeparableConvLayer, self).__init__()

    def forward(self, input1, input2, input3):
        """Run the compiled forward kernel.

        Args:
            input1: 4-D tensor (batch, depth, height, width).
            input2: 4-D filter tensor; dim 1 is the filter size.
            input3: 4-D filter tensor; dim 1 is the filter size.

        Returns:
            Tensor of shape (batch, depth, out_height, out_width).

        Raises:
            AssertionError: if the shapes do not match ``filtersize`` or any
                input is not contiguous.
        """
        intBatches = input1.size(0)
        intInputDepth = input1.size(1)
        intInputHeight = input1.size(2)
        intInputWidth = input1.size(3)
        intFilterSize = min(input2.size(1), input3.size(1))
        intOutputHeight = min(input2.size(2), input3.size(2))
        intOutputWidth = min(input2.size(3), input3.size(3))

        assert intInputHeight - self.filtersize == intOutputHeight - 1
        assert intInputWidth - self.filtersize == intOutputWidth - 1
        assert intFilterSize == self.filtersize

        # The compiled kernels index raw memory, so dense layouts are required.
        assert input1.is_contiguous()
        assert input2.is_contiguous()
        assert input3.is_contiguous()

        output = input1.new().resize_(
            intBatches, intInputDepth, intOutputHeight, intOutputWidth).zero_()

        # Cached for the backward pass.
        self.input1 = input1.contiguous()
        self.input2 = input2.contiguous()
        self.input3 = input3.contiguous()

        if input1.is_cuda:
            self.device = torch.cuda.current_device()
            output = output.cuda()
            err = my_lib.SeparableConvLayer_gpu_forward(input1, input2, input3, output)
        else:
            self.device = -1
            err = my_lib.SeparableConvLayer_cpu_forward(input1, input2, input3, output)
        if err != 0:
            print(err)

        return output

    # TODO: if there are multiple outputs of this function, then the order
    # should be well considered?
    def backward(self, gradoutput):
        """Run the compiled backward kernel.

        Args:
            gradoutput: gradient with respect to the tensor returned by
                ``forward``.

        Returns:
            Tuple ``(gradinput1, gradinput2, gradinput3)`` with the shapes of
            the three cached inputs.
        """
        # Allocate the gradient buffers from the cached inputs so they share
        # the inputs' tensor type and device, instead of building default-type
        # CPU tensors and copying them to the GPU afterwards.
        gradinput1 = self.input1.new(self.input1.size()).zero_()
        gradinput2 = self.input2.new(self.input2.size()).zero_()
        gradinput3 = self.input3.new(self.input3.size()).zero_()

        if self.input1.is_cuda:
            err = my_lib.SeparableConvLayer_gpu_backward(
                self.input1, self.input2, self.input3,
                gradoutput, gradinput1, gradinput2, gradinput3)
        else:
            err = my_lib.SeparableConvLayer_cpu_backward(
                self.input1, self.input2, self.input3,
                gradoutput, gradinput1, gradinput2, gradinput3)
        if err != 0:
            print(err)

        return gradinput1, gradinput2, gradinput3
================================================
FILE: my_package/SeparableConv/SeparableConvModule.py
================================================
# modules/InterpolationLayer.py
from torch.nn import Module
from functions.SeparableConvLayer import SeparableConvLayer
class SeparableConvModule(Module):
    """Module wrapper that applies a SeparableConvLayer of fixed filter size."""

    def __init__(self, filtersize):
        super(SeparableConvModule, self).__init__()
        # The wrapped autograd function; it is built once per module.
        self.f = SeparableConvLayer(filtersize)

    def forward(self, input1, input2, input3):
        """Pass the three inputs through the wrapped layer and return its result."""
        result = self.f(input1, input2, input3)
        return result
# We do not need to write backward code for the module itself, since the wrapped SeparableConvLayer function defines its own backward.
================================================
FILE: my_package/SeparableConv/__init__.py
================================================
from .SeparableConvModule import *
================================================
FILE: my_package/SeparableConv/separableconv_cuda.cc
================================================
#include
#include
#include
#include
#include //works for 1.0.0
#include "separableconv_cuda_kernel.cuh"
// Host entry point for the forward pass of the separable convolution layer.
//
// Checks the tensor shapes and strides and then launches the CUDA forward
// kernel on the current CUDA stream.
//
//   input1 : 4-D tensor with exactly 3 channels.
//   input2 : 4-D filter tensor; size(1) is the filter size, and its height and
//            width must equal input1's minus (filter size - 1).
//   input3 : 4-D filter tensor with the same size(1) and the same batch and
//            channel strides as input2.
//   output : tensor the kernel writes into.
//
// All four tensors must have a unit width stride. Returns 0 on success and 1
// when a check fails; a non-zero status from the kernel launcher is raised
// through AT_ERROR.
int SeparableConvLayer_gpu_forward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& input3,
		at::Tensor& output
		)
{
	const int kCheckFailed = 1;  // status returned by every validation failure

	// ---- shape validation ----
	const int channel = input1.size(1);
	if (channel != 3) { return kCheckFailed; }

	const int batch = input1.size(0);
	if (input2.size(0) != batch) { return kCheckFailed; }
	if (input2.size(1) != input3.size(1)) { return kCheckFailed; } // both filters share one size

	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(2) != h - input2.size(1) + 1) { return kCheckFailed; }
	if (input2.size(3) != w - input2.size(1) + 1) { return kCheckFailed; }

	// ---- strides (batch, channel, height, width) ----
	const int in1_sb = input1.stride(0);
	const int in1_sc = input1.stride(1);
	const int in1_sh = input1.stride(2);
	const int in1_sw = input1.stride(3);

	const int in2_sb = input2.stride(0);
	const int in2_sc = input2.stride(1);
	const int in2_sh = input2.stride(2);
	const int in2_sw = input2.stride(3);

	const int in3_sb = input3.stride(0);
	const int in3_sc = input3.stride(1);
	const int in3_sh = input3.stride(2);
	const int in3_sw = input3.stride(3);

	const int out_sb = output.stride(0);
	const int out_sc = output.stride(1);
	const int out_sh = output.stride(2);
	const int out_sw = output.stride(3);

	// ---- stride validation ----
	const bool unit_width_strides =
		in1_sw == 1 && in2_sw == 1 && in3_sw == 1 && out_sw == 1;
	if (!unit_width_strides) { return kCheckFailed; }

	// The two filter tensors must be laid out identically.
	if (in2_sb != in3_sb || in2_sc != in3_sc) { return kCheckFailed; }

	int nElement = 0;//UNUSED  THCudaTensor_nElement(state, output);

	const int status = SeparableConvLayer_gpu_forward_kernel(
//		at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
		at::cuda::getCurrentCUDAStream(), //works for 1.0.0
		nElement, w, h, channel, batch, input2.size(1),
		in1_sb, in1_sc, in1_sh, in1_sw,
		in2_sb, in2_sc, in2_sh, in2_sw,
		in3_sb, in3_sc, in3_sh, in3_sw,
		out_sb, out_sc, out_sh, out_sw,
		input1,
		input2,
		input3,
		output);
	if (status) { AT_ERROR("CUDA call failed"); }

	return status;
}
// Host entry point for the backward pass of the separable convolution layer.
//
// Checks the tensor shapes and strides and then launches the CUDA backward
// kernel on the current CUDA stream.
//
//   input1     : 4-D tensor with exactly 3 channels.
//   input2     : 4-D filter tensor; size(1) is the filter size, and its height
//                and width must equal input1's minus (filter size - 1).
//   input3     : 4-D filter tensor with the same size(1) as input2.
//   gradoutput : gradient with respect to the forward output; its strides are
//                passed to the kernel as the output strides.
//   gradinput1, gradinput2, gradinput3 : gradient buffers for the three
//                inputs; their strides must match the corresponding inputs in
//                the dimensions checked below.
//
// input1, input2, input3 and gradoutput must have a unit width stride.
// Returns 0 on success and 1 when a check fails; a non-zero status from the
// kernel launcher is raised through AT_ERROR.
int SeparableConvLayer_gpu_backward(
		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& input3,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2,
		at::Tensor& gradinput3
		)
{
	int error = 1 ;
	int channel = input1.size( 1);
	if(channel!=3) return error;
	int batch = input1.size(0);
	if(input2.size( 0) != batch) return error;
	// Both filter tensors must use the same filter size, as in the forward
	// wrapper (the previous self-comparison of input2 could never fail).
	if(input2.size(1) != input3.size(1)) return error;

	int h = input1.size(2);
	int w = input1.size(3);
	if(input2.size(2) != h - input2.size(1) + 1) return error;
	if(input2.size(3) != w - input2.size(1) + 1) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);

	// The kernel addresses gradoutput through the "output" stride arguments.
	int output_b_stride = gradoutput.stride(0);
	int output_c_stride = gradoutput.stride(1);
	int output_h_stride = gradoutput.stride(2);
	int output_w_stride = gradoutput.stride(3);

	// A unit width stride is required for every tensor read by the kernel.
	if(input1_w_stride !=1) return error;
	if(input2_w_stride !=1) return error;
	if(input3_w_stride !=1) return error;
	if(output_w_stride !=1) return error;

	// Gradient buffers must be laid out like the inputs they belong to.
	if(input1_b_stride != gradinput1.stride(0)) return error;
	if(input2_b_stride != gradinput2.stride(0)) return error;
	if(input1_c_stride != gradinput1.stride(1)) return error;
	if(input2_c_stride != gradinput2.stride(1)) return error;
	if(input3_c_stride != gradinput3.stride(1)) return error;

	int nElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);

	error = SeparableConvLayer_gpu_backward_kernel(
//		at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
		at::cuda::getCurrentCUDAStream(), //works for 1.0.0
		nElement,
		w,h,channel,batch, input2.size(1),
		input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
		input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
		input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
		output_b_stride,output_c_stride,output_h_stride,output_w_stride,
		input1,
		input2,
		input3,
		gradoutput,
		gradinput1,
		gradinput2,
		gradinput3
		);
	if (error) {AT_ERROR("CUDA call failed");}

	return error;
}
// Python bindings: registers the forward and backward host wrappers on module
// `m` (named by TORCH_EXTENSION_NAME) under the same names they have in C++.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("SeparableConvLayer_gpu_forward", &SeparableConvLayer_gpu_forward, "SeparableConv forward (CUDA)");
m.def("SeparableConvLayer_gpu_backward", &SeparableConvLayer_gpu_backward, "SeparableConv backward (CUDA)");
}
================================================
FILE: my_package/SeparableConv/separableconv_cuda_kernel.cu
================================================
#include
#include "separableconv_cuda_kernel.cuh"
#include