Repository: baowenbo/DAIN
Branch: master
Commit: 7c727aca5676
Files: 123
Total size: 560.1 KB

Directory structure:
gitextract_7t87l58_/

├── .gitignore
├── AverageMeter.py
├── Colab_DAIN.ipynb
├── LICENSE
├── MegaDepth/
│   ├── LICENSE
│   ├── MegaDepth_model.py
│   ├── README.md
│   ├── SDR_compute.py
│   ├── __init__.py
│   ├── data/
│   │   ├── __init__.py
│   │   ├── aligned_data_loader.py
│   │   ├── base_data_loader.py
│   │   ├── data_loader.py
│   │   └── image_folder.py
│   ├── models/
│   │   ├── HG_model.py
│   │   ├── __init__.py
│   │   ├── base_model.py
│   │   └── models.py
│   ├── options/
│   │   ├── __init__.py
│   │   ├── base_options.py
│   │   ├── test_options.py
│   │   └── train_options.py
│   ├── pytorch_DIW_scratch.py
│   ├── rmse_error_main.py
│   └── util/
│       ├── __init__.py
│       ├── html.py
│       ├── image_pool.py
│       ├── png.py
│       ├── util.py
│       └── visualizer.py
├── PWCNet/
│   ├── PWCNet.py
│   ├── __init__.py
│   ├── correlation_package_pytorch1_0/
│   │   ├── __init__.py
│   │   ├── build.sh
│   │   ├── clean.sh
│   │   ├── correlation.py
│   │   ├── correlation_cuda.cc
│   │   ├── correlation_cuda_kernel.cu
│   │   ├── correlation_cuda_kernel.cuh
│   │   └── setup.py
│   └── models/
│       ├── PWCNet.py
│       └── __init__.py
├── README.md
├── Resblock/
│   ├── BasicBlock.py
│   └── __init__.py
├── S2D_models/
│   ├── S2DF.py
│   └── __init__.py
├── Stack.py
├── balancedsampler.py
├── colab_interpolate.py
├── datasets/
│   ├── Vimeo_90K_interp.py
│   ├── __init__.py
│   └── listdatasets.py
├── demo_MiddleBury.py
├── demo_MiddleBury_slowmotion.py
├── environment.yaml
├── loss_function.py
├── lr_scheduler.py
├── my_args.py
├── my_package/
│   ├── DepthFlowProjection/
│   │   ├── DepthFlowProjectionLayer.py
│   │   ├── DepthFlowProjectionModule.py
│   │   ├── __init__.py
│   │   ├── depthflowprojection_cuda.cc
│   │   ├── depthflowprojection_cuda_kernel.cu
│   │   ├── depthflowprojection_cuda_kernel.cuh
│   │   └── setup.py
│   ├── FilterInterpolation/
│   │   ├── FilterInterpolationLayer.py
│   │   ├── FilterInterpolationModule.py
│   │   ├── __init__.py
│   │   ├── filterinterpolation_cuda.cc
│   │   ├── filterinterpolation_cuda_kernel.cu
│   │   ├── filterinterpolation_cuda_kernel.cuh
│   │   └── setup.py
│   ├── FlowProjection/
│   │   ├── FlowProjectionLayer.py
│   │   ├── FlowProjectionModule.py
│   │   ├── __init__.py
│   │   ├── flowprojection_cuda.cc
│   │   ├── flowprojection_cuda_kernel.cu
│   │   ├── flowprojection_cuda_kernel.cuh
│   │   └── setup.py
│   ├── Interpolation/
│   │   ├── InterpolationLayer.py
│   │   ├── InterpolationModule.py
│   │   ├── __init__.py
│   │   ├── interpolation_cuda.cc
│   │   ├── interpolation_cuda_kernel.cu
│   │   ├── interpolation_cuda_kernel.cuh
│   │   └── setup.py
│   ├── InterpolationCh/
│   │   ├── InterpolationChLayer.py
│   │   ├── InterpolationChModule.py
│   │   ├── __init__.py
│   │   ├── interpolationch_cuda.cc
│   │   ├── interpolationch_cuda_kernel.cu
│   │   ├── interpolationch_cuda_kernel.cuh
│   │   └── setup.py
│   ├── MinDepthFlowProjection/
│   │   ├── __init__.py
│   │   ├── minDepthFlowProjectionLayer.py
│   │   ├── minDepthFlowProjectionModule.py
│   │   ├── mindepthflowprojection_cuda.cc
│   │   ├── mindepthflowprojection_cuda_kernel.cu
│   │   ├── mindepthflowprojection_cuda_kernel.cuh
│   │   └── setup.py
│   ├── SeparableConv/
│   │   ├── SeparableConvLayer.py
│   │   ├── SeparableConvModule.py
│   │   ├── __init__.py
│   │   ├── separableconv_cuda.cc
│   │   ├── separableconv_cuda_kernel.cu
│   │   ├── separableconv_cuda_kernel.cuh
│   │   └── setup.py
│   ├── SeparableConvFlow/
│   │   ├── SeparableConvFlowLayer.py
│   │   ├── SeparableConvFlowModule.py
│   │   ├── __init__.py
│   │   ├── separableconvflow_cuda.cc
│   │   ├── separableconvflow_cuda_kernel.cu
│   │   ├── separableconvflow_cuda_kernel.cuh
│   │   └── setup.py
│   ├── build.sh
│   ├── clean.sh
│   ├── compiler_args.py
│   └── test_module.py
├── networks/
│   ├── DAIN.py
│   ├── DAIN_slowmotion.py
│   └── __init__.py
└── train.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Ignore Git here
.git

# But not these files...
# !.gitignore

checkpoints/test_local/opt.txt
PWCNet/pwc_net.pth.tar
MegaDepth/checkpoints/*
model_weights/*
MiddleBurySet/*

.nfs*

# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/workspace.xml
.idea/tasks.xml
.idea/dictionaries
.idea/vcs.xml
.idea/jsLibraryMappings.xml

# Sensitive or high-churn files:
.idea/dataSources.ids
.idea/dataSources.xml
.idea/dataSources.local.xml
.idea/sqlDataSources.xml
.idea/dynamic.xml
.idea/uiDesigner.xml

# Gradle:
.idea/gradle.xml
.idea/libraries

# Mongo Explorer plugin:
.idea/mongoSettings.xml

.idea/

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties


================================================
FILE: AverageMeter.py
================================================


class AverageMeter(object):
    """Keep the most recent value of a metric together with its running mean.

    Public attributes, all reset to ``0`` by :meth:`reset`:
        val:   the value passed to the latest :meth:`update` call.
        sum:   accumulated ``val * n`` over all updates.
        count: accumulated ``n`` over all updates.
        avg:   ``sum / count`` as of the latest update.
    """

    def __init__(self):
        # reset() is what actually creates every attribute.
        self.reset()

    def reset(self):
        """Set all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and recompute the mean.

        Args:
            val: the new observation.
            n: weight of the observation (defaults to 1).
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


================================================
FILE: Colab_DAIN.ipynb
================================================
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Colab_DAIN_new.ipynb",
      "private_outputs": true,
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1pIo4r_Y8cMo"
      },
      "source": [
        "# DAIN Colab"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iGPHW5SOpPe3"
      },
      "source": [
        "*DAIN Colab, v1.6.0*\n",
        "\n",
        "Based on the [original Colab file](https://github.com/baowenbo/DAIN/issues/44) by btahir. \n",
        "\n",
        "Enhancements by [Styler00Dollar](https://github.com/styler00dollar) aka \"sudo rm -rf / --no-preserve-root#8353\" on discord and [Alpha](https://github.com/AlphaGit), (Alpha#6137 on Discord). Please do not run this command in your linux terminal. It's rather meant as a joke.\n",
        "\n",
        "[Styler00Dollar's fork](https://github.com/styler00dollar/DAIN) / [Alpha's fork](https://github.com/AlphaGit/DAIN)\n",
        "\n",
        "A simple guide:\n",
        "- Upload this ` .ipynb`  file to your Google Colab.\n",
        "- Create a folder inside of Google Drive named \"DAIN\"\n",
        "- Change the configurations in the next cell\n",
        "- Run cells one by one\n",
        "\n",
        "Stuff that should be improved:\n",
        "- Alpha channel will be removed automatically and won't be added back. Anything related to alpha will be converted to black.\n",
        "- Adding configuration to select speed\n",
        "- Detect scenes to avoid interpolating scene-changes\n",
        "- Auto-resume\n",
        "- Copy `start_frame` - `end_frame` audio from original input to final output\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "enKoi0TR2fOD",
        "cellView": "form"
      },
      "source": [
        "################# Required Configurations ############################\n",
        "\n",
        "#@markdown # Required Configuration\n",
        "#@markdown Use the values in here to configure what you'd like DAIN to do.\n",
        "\n",
        "#@markdown ## Input file\n",
        "#@markdown Path (relative to the root of your Google Drive) to the input file. For instance, if you save your `example.mkv` file in your Google Drive, inside a `videos` folder, the path would be: `videos/example.mkv`. Currently videos and gifs are supported.\n",
        "INPUT_FILEPATH = \"DAIN/input.mp4\" #@param{type:\"string\"}\n",
        "\n",
        "#@markdown ## Output file\n",
        "#@markdown Output file path: path (relative to the root of your Google Drive) for the output file. It will also determine the filetype in the destination. `.mp4` is recommended for video input, `.gif` for gif inputs.\n",
        "OUTPUT_FILE_PATH = \"DAIN/output.mp4\" #@param{type:\"string\"}\n",
        "\n",
        "################# Optional configurations ############################\n",
        "\n",
        "#@markdown # Optional Configuration\n",
        "#@markdown Parameters below can be left with their defaults, but feel free to adapt them to your needs.\n",
        "\n",
        "#@markdown ## Target FPS\n",
        "#@markdown  how many frames per second should the result have. This will determine how many intermediate images are interpolated.\n",
        "TARGET_FPS = 60 #@param{type:\"number\"}\n",
        "\n",
        "#@markdown ## Frame input directory\n",
        "#@markdown A path, relative to your GDrive root, where you already have the list of frames in the format 00001.png, 00002.png, etc.\n",
        "FRAME_INPUT_DIR = '/content/DAIN/input_frames' #@param{type:\"string\"}\n",
        "\n",
        "#@markdown ## Frame output directory\n",
        "#@markdown A path, relative to your GDrive root, where you want the generated frame.\n",
        "FRAME_OUTPUT_DIR = '/content/DAIN/output_frames' #@param{type:\"string\"}\n",
        "\n",
        "#@markdown ## Start Frame\n",
        "#@markdown First frame to consider from the video when processing.\n",
        "START_FRAME = 1 #@param{type:\"number\"}\n",
        "\n",
        "#@markdown ## End Frame\n",
        "#@markdown Last frame to consider from the video when processing. To use the whole video use `-1`.\n",
        "END_FRAME = -1 #@param{type:\"number\"}\n",
        "\n",
        "#@markdown ## Seamless playback\n",
        "#@markdown Creates a seamless loop by also using the first frame as the last one. Set this to True if a loop is intended.\n",
        "SEAMLESS = False #@param{type:\"boolean\"}\n",
        "\n",
        "#@markdown ## Auto-remove PNG directory\n",
        "#@markdown Auto-delete output PNG dir after ffmpeg video creation. Set this to `False` if you want to keep the PNG files.\n",
        "AUTO_REMOVE = True #@param{type:\"boolean\"}"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "N9cGwalNeyk9",
        "cellView": "form"
      },
      "source": [
        "#@title Connect Google Drive\n",
        "from google.colab import drive\n",
        "drive.mount('/content/gdrive')\n",
        "print('Google Drive connected.')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "irzjv1x4e3S4",
        "cellView": "form"
      },
      "source": [
        "#@title Check your current GPU\n",
        "# If you are lucky, you get 16GB VRAM. If you are not lucky, you get less. VRAM is important. The more VRAM, the higher the maximum resolution will go.\n",
        "\n",
        "# 16GB: Can handle 720p. 1080p will produce an out-of-memory error. \n",
        "# 8GB: Can handle 480p. 720p will produce an out-of-memory error.\n",
        "\n",
        "!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UYHTTP91oMvh"
      },
      "source": [
        "# Install dependencies.\n",
        "\n",
        "This next step may take somewhere between 15-20 minutes. Run this only once at startup.\n",
        "\n",
        "Look for the \"Finished installing dependencies\"  message."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "e5AHGetTRacZ",
        "cellView": "form"
      },
      "source": [
        "#@title Setup everything. This takes a while. Just wait ~20 minutes in total.\n",
        "\n",
        "# Install old pytorch to avoid faulty output\n",
        "%cd /content/\n",
        "!wget -c https://repo.anaconda.com/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\n",
        "!chmod +x Miniconda3-4.5.4-Linux-x86_64.sh\n",
        "!bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local\n",
        "!conda install pytorch==1.1 cudatoolkit torchvision -c pytorch -y\n",
        "!conda install ipykernel -y\n",
        "\n",
        "!pip install scipy==1.1.0\n",
        "!pip install imageio\n",
        "!CUDA_VISIBLE_DEVICES=0\n",
        "!sudo apt-get install imagemagick imagemagick-doc\n",
        "print(\"Finished installing dependencies.\")\n",
        "\n",
        "# Clone DAIN sources\n",
        "%cd /content\n",
        "!git clone -b master --depth 1 https://github.com/baowenbo/DAIN /content/DAIN\n",
        "%cd /content/DAIN\n",
        "!git log -1\n",
        "\n",
        "# Building DAIN\n",
        "%cd /content/DAIN/my_package/\n",
        "!./build.sh\n",
        "print(\"Building #1 done.\")\n",
        "\n",
        "# Building DAIN PyTorch correlation package.\n",
        "%cd /content/DAIN/PWCNet/correlation_package_pytorch1_0\n",
        "!./build.sh\n",
        "print(\"Building #2 done.\")\n",
        "\n",
        "# Downloading pre-trained model\n",
        "%cd /content/DAIN\n",
        "!mkdir model_weights\n",
        "!wget -O model_weights/best.pth http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zm5kn6vTncL4",
        "cellView": "form"
      },
      "source": [
        "#@title Detecting FPS of input file.\n",
        "%shell yes | cp -f /content/gdrive/My\\ Drive/{INPUT_FILEPATH} /content/DAIN/\n",
        "\n",
        "import os\n",
        "filename = os.path.basename(INPUT_FILEPATH)\n",
        "\n",
        "import cv2\n",
        "cap = cv2.VideoCapture(f'/content/DAIN/{filename}')\n",
        "\n",
        "fps = cap.get(cv2.CAP_PROP_FPS)\n",
        "print(f\"Input file has {fps} fps\")\n",
        "\n",
        "if(fps/TARGET_FPS>0.5):\n",
        "  print(\"Define a higher fps, because there is not enough time for new frames. (Old FPS)/(New FPS) should be lower than 0.5. Interpolation will fail if you try.\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9YNva-GuKq4Y",
        "cellView": "form"
      },
      "source": [
        "#@title ffmpeg extract - Generating individual frame PNGs from the source file.\n",
        "%shell rm -rf '{FRAME_INPUT_DIR}'\n",
        "%shell mkdir -p '{FRAME_INPUT_DIR}'\n",
        "\n",
        "if (END_FRAME==-1):\n",
        "  %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=gte(n\\,{START_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\n",
        "else:\n",
        "  %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=between(n\\,{START_FRAME}\\,{END_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\n",
        "\n",
        "from IPython.display import clear_output\n",
        "clear_output()\n",
        "\n",
        "png_generated_count_command_result = %shell ls '{FRAME_INPUT_DIR}' | wc -l\n",
        "frame_count = int(png_generated_count_command_result.output.strip())\n",
        "\n",
        "import shutil\n",
        "if SEAMLESS:\n",
        "  # Append a copy of the first frame as one extra last frame so the loop closes.\n",
        "  frame_count += 1\n",
        "  first_frame = f\"{FRAME_INPUT_DIR}/00001.png\"\n",
        "  # frame_count is an int (ints have no .zfill()), so zero-pad it with a format spec.\n",
        "  new_last_frame = f\"{FRAME_INPUT_DIR}/{frame_count:05d}.png\"\n",
        "  shutil.copyfile(first_frame, new_last_frame)\n",
        "\n",
        "print(f\"{frame_count} frame PNGs generated.\")\n",
        "\n",
        "#Checking if PNGs do have alpha\n",
        "import subprocess as sp\n",
        "%cd {FRAME_INPUT_DIR}\n",
        "channels = sp.getoutput('identify -format %[channels] 00001.png')\n",
        "print (f\"{channels} detected\")\n",
        "\n",
        "# Removing alpha if detected\n",
        "if \"a\" in channels:\n",
        "  print(\"Alpha channel detected and will be removed.\")\n",
        "  print(sp.getoutput('find . -name \"*.png\" -exec convert \"{}\" -alpha off PNG24:\"{}\" \\;'))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "W3rrE7L824gL",
        "cellView": "form"
      },
      "source": [
        "#@title Interpolation\n",
        "%shell mkdir -p '{FRAME_OUTPUT_DIR}'\n",
        "%cd /content/DAIN\n",
        "\n",
        "!python -W ignore colab_interpolate.py --netName DAIN_slowmotion --time_step {fps/TARGET_FPS} --start_frame 1 --end_frame {frame_count} --frame_input_dir '{FRAME_INPUT_DIR}' --frame_output_dir '{FRAME_OUTPUT_DIR}'"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TKREDli2IDMV",
        "cellView": "form"
      },
      "source": [
        "#@title Create output video\n",
        "%cd {FRAME_OUTPUT_DIR}\n",
        "%shell ffmpeg -y -r {TARGET_FPS} -f image2 -pattern_type glob -i '*.png' '/content/gdrive/My Drive/{OUTPUT_FILE_PATH}'\n",
        "\n",
        "if(AUTO_REMOVE):\n",
        "  !rm -rf {FRAME_OUTPUT_DIR}/*\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UF5TEo5N374o",
        "cellView": "form"
      },
      "source": [
        "#@title [Experimental] Create video with sound\n",
        "# Only run this, if the original had sound.\n",
        "%cd {FRAME_OUTPUT_DIR}\n",
        "%shell ffmpeg -i '/content/DAIN/{filename}' -acodec copy output-audio.aac\n",
        "%shell ffmpeg -y -r {TARGET_FPS} -f image2 -pattern_type glob -i '*.png' -i output-audio.aac -shortest '/content/gdrive/My Drive/{OUTPUT_FILE_PATH}'\n",
        "\n",
        "if (AUTO_REMOVE):\n",
        "  !rm -rf {FRAME_OUTPUT_DIR}/*\n",
        "  !rm -rf output-audio.aac"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2019 Wenbo Bao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: MegaDepth/LICENSE
================================================
MIT License

Copyright (c) 2018 Zhengqi Li

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: MegaDepth/MegaDepth_model.py
================================================
import torch
import sys
from torch.autograd import Variable
import numpy as np
from .options.train_options import TrainOptions
from .models.models import create_model
__all__ = ['HourGlass']


def HourGlass(pretrained=None):
    """Create the MegaDepth model and hand back its generator network.

    Args:
        pretrained: passed through unchanged as the second argument of
            ``create_model``; its exact meaning is defined there
            (NOTE(review): presumably a checkpoint path or flag - confirm).

    Returns:
        The ``netG`` attribute of the object built by ``create_model``,
        which the original code describes as the real ``nn.Module``.
    """
    # Options are produced by TrainOptions' own parser.
    options = TrainOptions().parse()
    # create_model returns a wrapper; only its netG attribute is exposed.
    return create_model(options, pretrained).netG


================================================
FILE: MegaDepth/README.md
================================================
# MegaDepth: Learning Single-View Depth Prediction from Internet Photos

This is the code for the algorithm described in "MegaDepth: Learning Single-View Depth Prediction from Internet Photos, Z. Li and N. Snavely, CVPR 2018". The code skeleton is based on "https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix". If you use our code or models for academic purposes, please consider citing:

    @inproceedings{MDLi18,
	  	title={MegaDepth: Learning Single-View Depth Prediction from Internet Photos},
	  	author={Zhengqi Li and Noah Snavely},
	  	booktitle={Computer Vision and Pattern Recognition (CVPR)},
	  	year={2018}
	}

#### Examples of single-view depth predictions on the photos we randomly downloaded from Internet:
<img src="https://github.com/lixx2938/MegaDepth/blob/master/demo.jpg" width="300"/> <img src="https://github.com/lixx2938/MegaDepth/blob/master/demo.png" width="300"/>
<img src="https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_2.jpg" width="300"/> <img src="https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_2.png" width="300"/>
<img src="https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_3.jpg" width="300"/> <img src="https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_3.png" width="300"/>
<img src="https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_4.jpg" width="300"/> <img src="https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_4.png" width="300"/>

#### Dependencies:
* The code was written in Pytorch 0.2 and Python 2.7, but it should be easy to adapt it to Python 3 and latest Pytorch version if needed.
* You might need skimage, h5py libraries installed for python before running the code.

#### Single-view depth prediction on any Internet photo:
* Download pretrained model from: http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth and put it in "checkpoints/test_local/best_generalization_net_G.pth"
* In python file "models/HG_model.py", in init function, change to "model_parameters = self.load_network(model, 'G', 'best_generalization')"
* run demo code 
```bash
    python demo.py
```
You should see an inverse depth prediction saved as demo.png from an original photo demo.jpg. If you want to use RGB maps for visualization, like the figures in our paper, you have to install/run semantic segmentation from https://github.com/kazuto1011/pspnet-pytorch trained on ADE20K to mask out sky, because inconsistent depth predictions for unmasked sky make the RGB visualization unreasonable.


#### Evaluation on the MegaDepth test splits:
* Download MegaDepth V1 dataset from project website: http://www.cs.cornell.edu/projects/megadepth/.
* Download pretrained model (specific for MD dataset) from http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_vanila_net_G.pth and put it in "checkpoints/test_local/best_vanila_net_G.pth" 
* Download test list files from http://www.cs.cornell.edu/projects/megadepth/dataset/data_lists/test_lists.tar.gz, it should include two folders corresponding to images with landscape and portrait orientations.
* To compute the scale-invariant RMSE on the MD test set, change the variable "dataset_root" in python file "rmse_error_main.py" to the root directory of MegaDepth_v1 folder, and change variable "test_list_dir_l" and "test_list_dir_p" to corresponding folder paths of test lists, and run:
```bash
    python rmse_error_main.py
```
* To compute Structure from Motion Disagreement Rate (SDR), change the variable "dataset_root" in python file "SDR_compute.py" to the root directory of MegaDepth_v1 folder, and change variable "test_list_dir_l" and "test_list_dir_p" to corresponding folder paths of test lists, and run:
```bash
    python SDR_compute.py
```
* If you want to run our model on arbitrary Internet photos, please download pretrained model from http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth, which has much better generalization ability (qualitatively speaking) to completely unknown scenes.


================================================
FILE: MegaDepth/SDR_compute.py
================================================
import time
import torch
import sys

# Script setup: parse options, build the landscape and portrait test loaders,
# create the model and initialise the module-level accumulators that
# test_SDR() updates.
from options.train_options import TrainOptions
# NOTE(review): torch is already imported at the top of this file, so the
# "before import torch" ordering mentioned below may no longer hold - confirm.
opt = TrainOptions().parse()  # set CUDA_VISIBLE_DEVICES before import torch
from data.data_loader import CreateDataLoader_TEST
from models.models import create_model

# Dataset root; the test-list directories below are built by string
# concatenation from it.
dataset_root = "/phoenix/S6/zl548/"
# Landscape split: loader created with input height 240 and width 320.
test_list_dir_l = dataset_root + '/MegaDpeth_code/test_list/landscape/'
input_height = 240
input_width = 320
test_data_loader_l = CreateDataLoader_TEST(dataset_root, test_list_dir_l, input_height, input_width)
test_dataset_l = test_data_loader_l.load_data()
test_dataset_size_l = len(test_data_loader_l)
print('========================= test L images = %d' % test_dataset_size_l)

# Portrait split: same steps with height and width swapped (320 x 240).
# input_height / input_width are reassigned here, so their final module-level
# values are the portrait ones.
test_list_dir_p = dataset_root + '/MegaDpeth_code/test_list/portrait/'
input_height = 320
input_width = 240
test_data_loader_p = CreateDataLoader_TEST(dataset_root, test_list_dir_p, input_height, input_width)
test_dataset_p = test_data_loader_p.load_data()
test_dataset_size_p = len(test_data_loader_p)
print('========================= test P images = %d' % test_dataset_size_p)

model = create_model(opt)

# The four values below are not referenced anywhere else in this script
# (test_SDR defines its own local diw_index).
batch_size = 32
diw_index = 0 
total_steps = 0
best_loss = 100

# Running totals shared with test_SDR(): index 0/1/2 are printed there under
# the labels "EQUAL", "INEQUAL" and "TOTAL". error_list accumulates the first
# value returned by model.evaluate_SDR, total_list the second.
error_list = [0 , 0, 0]
total_list = [0 , 0, 0]

# Index ranges over both test sets; not referenced elsewhere in this script.
list_l = range(test_dataset_size_l)
list_p = range(test_dataset_size_p)


def _sdr_ratio(errors, samples):
    """Return ``errors / float(samples)``, or NaN when ``samples`` is zero.

    ``errors`` is left unconverted so that the caller decides whether to
    wrap it in ``float()``.
    """
    denominator = float(samples)
    if denominator == 0:
        # No sample of this category has been counted yet; report NaN
        # instead of aborting the whole evaluation with ZeroDivisionError.
        return float("nan")
    return errors / denominator


def _accumulate_sdr(model, dataset):
    """Evaluate every item of ``dataset`` and add its counts to the module totals."""
    for data in dataset:
        stacked_img = data['img_1']
        targets = data['target_1']
        error, samples = model.evaluate_SDR(stacked_img, targets)

        for j in range(3):
            error_list[j] += error[j]
            total_list[j] += samples[j]

        # Running ratios after each item, in the same format as before.
        print("EQUAL  ", _sdr_ratio(error_list[0], total_list[0]))
        print("INEQUAL    ", _sdr_ratio(error_list[1], total_list[1]))
        print("TOTAL    ", _sdr_ratio(error_list[2], total_list[2]))


def test_SDR(model):
    """Run the SDR evaluation over the landscape and then the portrait test set.

    The model is switched to evaluation mode, ``model.evaluate_SDR`` is called
    for every item of ``test_dataset_l`` followed by ``test_dataset_p``, and the
    returned per-category values are accumulated into the module-level
    ``error_list`` / ``total_list`` (indices 0, 1, 2 are printed as
    "EQUAL", "INEQUAL" and "TOTAL"). Running ratios are printed after every
    item and a summary at the end. A category with no samples is reported as
    NaN rather than raising ``ZeroDivisionError``.

    Args:
        model: object providing ``switch_to_eval()`` and
            ``evaluate_SDR(images, targets)`` returning ``(error, samples)``,
            each indexable at positions 0..2.
    """
    print("============================= TEST SDR============================")
    model.switch_to_eval()

    # Landscape first, then portrait - the same order as the original loops.
    for dataset in (test_dataset_l, test_dataset_p):
        _accumulate_sdr(model, dataset)

    print("=========================================================SDR Summary =====================")
    print("Equal SDR:\t" , _sdr_ratio(float(error_list[0]), total_list[0]))
    print("Unequal SDR:\t" , _sdr_ratio(float(error_list[1]), total_list[1]))
    print("SDR:\t" , _sdr_ratio(float(error_list[2]), total_list[2]))


# Script entry point: runs unconditionally when this file is executed
# (there is no ``if __name__ == "__main__"`` guard).
print("WE ARE TESTING SDR!!!!")
test_SDR(model)


================================================
FILE: MegaDepth/__init__.py
================================================
from .MegaDepth_model import    *


================================================
FILE: MegaDepth/data/__init__.py
================================================


================================================
FILE: MegaDepth/data/aligned_data_loader.py
================================================
import random
import numpy as np
import torch.utils.data
from data.base_data_loader import BaseDataLoader
from data.image_folder import ImageFolder
from data.image_folder import ImageFolder_TEST
from builtins import object
import sys
import h5py


class PairedData(object):
    """Iterator adapter that yields each loader item as a two-key dict.

    Every item of ``data_loader`` must unpack into exactly two values; they
    are returned as ``{'img_1': first, 'target_1': second}``.

    Attributes:
        data_loader: the wrapped iterable.
        flip: stored as given; not used by this class itself.
        data_loader_iter: the iterator currently being consumed.
        iter: number of ``__next__`` calls since the last (re)start,
            including the final call that hits the end of the data.
    """

    def __init__(self, data_loader, flip):
        self.data_loader = data_loader
        self.flip = flip
        # An iterator is opened right away, so next() works without iter().
        self.data_loader_iter = iter(self.data_loader)
        self.iter = 0

    def __iter__(self):
        # Restart from the beginning of the wrapped loader on every new loop.
        self.data_loader_iter = iter(self.data_loader)
        self.iter = 0
        return self

    def __next__(self):
        # The counter is bumped before fetching, so it also counts the call
        # on which the underlying iterator raises StopIteration.
        self.iter += 1
        batch = next(self.data_loader_iter)
        images, targets = batch
        return dict(img_1=images, target_1=targets)


class AlignedDataLoader(BaseDataLoader):
    """Builds an ``ImageFolder`` dataset and exposes it as ``PairedData``.

    The underlying ``torch.utils.data.DataLoader`` uses a fixed batch size of
    16 and 3 worker processes; ``_shuffle`` is forwarded as its ``shuffle``
    flag and ``_is_flip`` is forwarded to the dataset.
    """

    def __init__(self,_root, _list_dir, _input_height, _input_width, _is_flip, _shuffle):
        image_dataset = ImageFolder(
            root=_root,
            list_dir=_list_dir,
            input_height=_input_height,
            input_width=_input_width,
            transform=None,
            is_flip=_is_flip,
        )
        batch_loader = torch.utils.data.DataLoader(
            image_dataset, batch_size=16, shuffle=_shuffle, num_workers=3)

        self.dataset = image_dataset
        # PairedData always receives flip=False here, whatever _is_flip is.
        self.paired_data = PairedData(batch_loader, False)

    def name(self):
        """Return the identifier string of this loader."""
        return 'RMSEDataLoader'

    def load_data(self):
        """Return the ``PairedData`` iterator built in ``__init__``."""
        return self.paired_data

    def __len__(self):
        """Return the number of samples in the dataset (not of batches)."""
        return len(self.dataset)


class AlignedDataLoader_TEST(BaseDataLoader):
    """Builds an ``ImageFolder_TEST`` dataset and exposes it as ``PairedData``.

    The underlying ``torch.utils.data.DataLoader`` yields one sample per
    batch, in dataset order (no shuffling), with 3 worker processes.
    """

    def __init__(self,_root, _list_dir, _input_height, _input_width):
        test_dataset = ImageFolder_TEST(
            root=_root,
            list_dir=_list_dir,
            _input_height=_input_height,
            _input_width=_input_width,
        )
        batch_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=1, shuffle=False, num_workers=3)

        self.dataset = test_dataset
        self.paired_data = PairedData(batch_loader, False)

    def name(self):
        """Return the identifier string of this loader."""
        return 'TestSDRDataLoader'

    def load_data(self):
        """Return the ``PairedData`` iterator built in ``__init__``."""
        return self.paired_data

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.dataset)


================================================
FILE: MegaDepth/data/base_data_loader.py
================================================

class BaseDataLoader():
    """Minimal base class for the data-loader wrappers in this package."""

    def __init__(self):
        pass

    def load_data(self):
        """Return the loaded data; the base implementation has none.

        Subclasses override this to return their iterable of batches.
        The method now takes ``self`` so that it can actually be called on an
        instance (the previous signature raised ``TypeError`` when invoked as
        ``loader.load_data()``).
        """
        return None

        
================================================
FILE: MegaDepth/data/data_loader.py
================================================

def CreateDataLoader(_root, _list_dir, _input_height, _input_width, is_flip = True, shuffle =  True):
    """Construct and return an ``AlignedDataLoader`` for the given dataset.

    All arguments are forwarded positionally to ``AlignedDataLoader``.
    """
    # The import happens at call time, inside the function.
    from data.aligned_data_loader import AlignedDataLoader
    return AlignedDataLoader(_root, _list_dir, _input_height, _input_width, is_flip, shuffle)

def CreateDataLoader_TEST(_root, _list_dir, _input_height, _input_width):
    """Construct and return an ``AlignedDataLoader_TEST`` for the given dataset.

    All arguments are forwarded positionally to ``AlignedDataLoader_TEST``.
    """
    # The import happens at call time, inside the function.
    from data.aligned_data_loader import AlignedDataLoader_TEST
    return AlignedDataLoader_TEST(_root, _list_dir, _input_height, _input_width)


================================================
FILE: MegaDepth/data/image_folder.py
================================================
################################################################################
# Code from
# https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py
# Modified the original code so that it also loads images from the current
# directory as well as the subdirectories
################################################################################
import h5py
import torch.utils.data as data
import pickle
import numpy as np
import torch
import os, os.path
import math, random
import sys
from skimage.transform import resize
from skimage import io


def make_dataset(list_dir):
    """Load the pickled image and target path lists found under *list_dir*.

    ``list_dir`` is used as a plain string prefix (no separator is inserted),
    so it must already end with a path separator. Two files are read:
    ``imgs_MD.p`` and ``targets_MD.p``.

    Returns:
        A tuple ``(images_list, targets_list)`` with the unpickled contents.

    NOTE: ``pickle.load`` can execute arbitrary code; only point this at list
    files from a trusted source.
    """
    # Context managers guarantee the handles are closed even if unpickling
    # raises part-way through.
    with open(list_dir + "imgs_MD.p", "rb") as images_file:
        images_list = pickle.load(images_file)

    with open(list_dir + "targets_MD.p", "rb") as targets_file:
        targets_list = pickle.load(targets_file)

    return images_list, targets_list

# test for si-RMSE
class ImageFolder(data.Dataset):
    """Dataset of image / depth-map pairs (used for the si-RMSE test).

    The image and target path lists come from ``make_dataset(list_dir)``;
    every path is resolved relative to ``root + "/MegaDepth_v1/"``.
    """

    def __init__(self, root, list_dir, input_height, input_width, transform=None,
                 loader=None, is_flip = True):
        """Load the path lists and remember the target resolution.

        Raises:
            RuntimeError: if the image list is empty.
        """
        img_list , targets_list = make_dataset(list_dir)
        if len(img_list) == 0:
            # The previous message referenced an undefined IMG_EXTENSIONS
            # name, which turned this into a NameError.
            raise RuntimeError("Found 0 images in: " + root + "\n"
                               "Image list was loaded from: " + list_dir + "imgs_MD.p")
        self.root = root
        self.list_dir = list_dir
        self.img_list = img_list
        self.targets_list = targets_list
        self.transform = transform
        # `loader` is accepted for signature compatibility but not stored.
        self.input_height = input_height
        self.input_width = input_width
        self.is_flip = is_flip


    def load_MD(self, img_path, depth_path):
        """Read one image and its depth map and resize both.

        Returns:
            ``(color_rgb, gt, mask)``: the image scaled to ``[0, 1]`` and
            resized to ``(input_height, input_width)``, the resized depth map,
            and a float mask that is 1 where the depth is above ``1e-8``.
        """
        MD_img = np.float32(io.imread(img_path))/255.0

        # np.array() copies the dataset into memory, so the file can be closed
        # right away; the context manager also closes it on errors.
        with h5py.File(depth_path,'r') as hdf5_file_read:
            gt = np.array(hdf5_file_read.get('/depth'))

        assert(gt.shape[0] == MD_img.shape[0])
        assert(gt.shape[1] == MD_img.shape[1])

        color_rgb = np.zeros((self.input_height,self.input_width,3))
        MD_img = resize(MD_img, (self.input_height, self.input_width), order = 1)

        if len(MD_img.shape) == 2:
            # Grayscale input: replicate the single channel three times.
            color_rgb[:,:,0] = MD_img.copy()
            color_rgb[:,:,1] = MD_img.copy()
            color_rgb[:,:,2] = MD_img.copy()
        else:
            color_rgb = MD_img.copy()

        if np.sum(gt > 1e-8) > 10:
            # Zero out depth outliers: first everything above the 98th
            # percentile of valid depths, then everything below the 1st
            # percentile (computed after the first clipping step).
            gt[ gt > np.percentile(gt[gt > 1e-8], 98)] = 0
            gt[ gt < np.percentile(gt[gt > 1e-8], 1)] = 0

        # Normalise to at most 1 before resizing, then restore the scale.
        max_depth = np.max(gt) + 1e-9
        gt = gt/max_depth
        gt = resize(gt, (self.input_height, self.input_width), order = 0)
        gt = gt*max_depth

        mask = np.float32(gt > 1e-8)

        color_rgb = np.ascontiguousarray(color_rgb)
        gt = np.ascontiguousarray(gt)
        mask = np.ascontiguousarray(mask)

        return color_rgb, gt, mask

    def __getitem__(self, index):
        """Return ``(image_tensor, targets)`` for sample *index*.

        ``targets`` holds ``'path'`` (the target path suffix), ``'gt_0'`` (the
        depth map) and ``'mask_0'`` (the validity mask). The image tensor is
        channel-first.
        """
        targets_1 = {}

        img_path_suff = self.img_list[index]
        targets_path_suff = self.targets_list[index]

        img_path = self.root + "/MegaDepth_v1/" + img_path_suff
        depth_path = self.root + "/MegaDepth_v1/" + targets_path_suff

        img, gt, mask = self.load_MD(img_path, depth_path)

        # Masked-out pixels get depth 1.0.
        # NOTE(review): presumably so a later log() stays finite — confirm.
        gt[mask < 0.1] = 1.0

        targets_1['path'] = targets_path_suff
        targets_1['gt_0'] = torch.from_numpy(gt).float()
        targets_1['mask_0'] = torch.from_numpy(mask).float()

        final_img = torch.from_numpy( np.transpose(img, (2,0,1)) ).contiguous().float()

        return final_img, targets_1

    def __len__(self):
        """Return the number of images in the list."""
        return len(self.img_list)


#  Test for SDR 
class ImageFolder_TEST(data.Dataset):
    """Dataset of images with sparse SfM ordinal pairs (used for the SDR test).

    Images are resolved relative to ``root + "/MegaDepth_v1/"`` and the
    feature files relative to ``root + "/sparse_features/"``.
    """

    def __init__(self, root, list_dir, _input_height, _input_width):
        """Load the image path list and remember the target resolution.

        Raises:
            RuntimeError: if the image list is empty.
        """
        img_list , targets_list = make_dataset(list_dir)
        if len(img_list) == 0:
            # The previous message referenced an undefined IMG_EXTENSIONS
            # name, which turned this into a NameError.
            raise RuntimeError("Found 0 images in: " + root + "\n"
                               "Image list was loaded from: " + list_dir + "imgs_MD.p")
        self.root = root
        self.list_dir = list_dir
        self.img_list = img_list
        self.input_height = _input_height
        self.input_width = _input_width
        self.half_window = 1

    def load_SfM_ORD(self, img_path, targets_path):
        """Read one image and its ``/SfM_features`` table.

        Returns:
            ``(color_rgb, y_A, x_A, y_B, x_B, ord_)``. Rows 0-3 of the feature
            table are multiplied by the target height / width and rounded;
            row 4 is returned unchanged. For a 4-channel image the feature
            file is not read and the five feature values are the integer
            placeholder ``0``.
        """
        sfm_image = np.float32(io.imread(img_path))/255.0
        resized_sfm_img = resize(sfm_image, (self.input_height, self.input_width), order = 1)

        color_rgb = np.zeros((self.input_height, self.input_width,3))

        if len(sfm_image.shape) == 2:
            # Grayscale input: replicate the single channel three times.
            color_rgb[:,:,0] = resized_sfm_img.copy()
            color_rgb[:,:,1] = resized_sfm_img.copy()
            color_rgb[:,:,2] = resized_sfm_img.copy()
        else:
            color_rgb = resized_sfm_img.copy()

        if color_rgb.shape[2] == 4:
            return color_rgb, 0, 0 ,0, 0, 0

        # np.array() copies the dataset into memory; the context manager
        # closes the file even if reading fails.
        with h5py.File(targets_path,'r') as hdf5_file_read:
            gt = np.array(hdf5_file_read.get('/SfM_features'))

        y_A = np.round( gt[0,:] * float(self.input_height) )
        x_A = np.round( gt[1,:] * float(self.input_width) )
        y_B = np.round( gt[2,:] * float(self.input_height) )
        x_B = np.round( gt[3,:] * float(self.input_width) )
        ord_ = gt[4,:]

        return color_rgb, y_A, x_A ,y_B, x_B, ord_

    def __getitem__(self, index):
        """Return ``(image_tensor, targets)`` for sample *index*.

        ``targets['has_SfM_feature']`` is False, the ``sdr_*`` lists are empty
        and the image is all zeros when the image file or its feature file is
        missing, or when the image has four channels.
        """
        targets_1 = {}
        targets_1['path'] = []
        targets_1['sdr_xA'] = []
        targets_1['sdr_yA'] = []
        targets_1['sdr_xB'] = []
        targets_1['sdr_yB'] = []
        targets_1['sdr_gt'] = []

        img_path_suff = self.img_list[index]
        img_path = self.root + "/MegaDepth_v1/" + img_path_suff
        folder_name = img_path_suff.split('/')[-4]
        img_name = img_path_suff.split('/')[-1]
        sparse_sift_path = self.root + "/sparse_features/" + folder_name + "/" + img_name + ".h5"

        has_feature = os.path.isfile(sparse_sift_path) and os.path.isfile(img_path)

        if has_feature:
            img, y_A, x_A ,y_B, x_B, ordinal = self.load_SfM_ORD(img_path, sparse_sift_path)
            # A 4-channel image comes back with integer placeholders, which
            # torch.from_numpy() cannot convert; treat it as "no features"
            # instead of crashing.
            has_feature = img.shape[2] != 4

        if has_feature:
            targets_1['sdr_xA'].append(torch.from_numpy(x_A).long())
            targets_1['sdr_yA'].append(torch.from_numpy(y_A).long())
            targets_1['sdr_xB'].append(torch.from_numpy(x_B).long())
            targets_1['sdr_yB'].append(torch.from_numpy(y_B).long())
            targets_1['sdr_gt'].append(torch.from_numpy(ordinal).float())
        else:
            # Placeholder image so that every sample has the same shape.
            img = np.zeros((self.input_height, self.input_width,3))

        targets_1['has_SfM_feature'] = has_feature

        final_img = torch.from_numpy( np.transpose(img, (2,0,1)) ).contiguous().float()

        return final_img, targets_1


    def __len__(self):
        """Return the number of images in the list."""
        return len(self.img_list)


================================================
FILE: MegaDepth/models/HG_model.py
================================================
import numpy as np
import torch
import os
from torch.autograd import Variable
from .base_model import BaseModel
import sys
# import pytorch_DIW_scratch
import MegaDepth.pytorch_DIW_scratch as pytorch_DIW_scratch

class HGModel(BaseModel):
    """Wrapper around the hourglass network with SDR / si-RMSE evaluation.

    ``self.netG`` is the module-level ``pytorch_DIW_scratch`` network object,
    so every ``HGModel`` instance refers to the same network.
    """

    def name(self):
        return 'HGModel'

    def __init__(self, opt,pretrained=None):
        """Set up the model and optionally load weights.

        Args:
            opt: options object forwarded to ``BaseModel.initialize``.
            pretrained: path of a saved state dict, or None to keep the
                network's current weights.
        """
        BaseModel.initialize(self, opt)

        model = pytorch_DIW_scratch.pytorch_DIW_scratch
        if pretrained is None:
            self.netG    = model
        else:
            pretrained_dict = torch.load(pretrained)

            model_dict = model.state_dict()
            # Weights saved from a DataParallel model carry a 'module.'
            # prefix. Strip it only where it is present, so that a checkpoint
            # saved without the wrapper is not mangled by a blind k[7:].
            prefix = 'module.'
            pretrained_dict = {
                (k[len(prefix):] if k.startswith(prefix) else k): v
                for k, v in pretrained_dict.items()
            }
            # Overwrite entries of the existing state dict, then load it.
            model_dict.update(pretrained_dict)
            model.load_state_dict(model_dict)
            pretrained_dict = None
            self.netG = model


    def batch_classify(self, z_A_arr, z_B_arr, ground_truth ):
        """Classify depth ratios into ordinal labels and count the errors.

        The ratio ``z_A / z_B`` maps to label 1 above 1.1, to -1 below
        ``1 / 1.1`` and to 0 otherwise; the labels are compared with
        ``ground_truth``.

        Returns:
            ``(error_list, count_list)``, each ordered as
            ``[equal, inequal, total]``.
        """
        threshold = 1.1
        depth_ratio = torch.div(z_A_arr, z_B_arr)

        depth_ratio = depth_ratio.cpu()

        estimated_labels = torch.zeros(depth_ratio.size(0))

        estimated_labels[depth_ratio > (threshold)] = 1
        estimated_labels[depth_ratio < (1/threshold)] = -1

        # 1 wherever the estimated label disagrees with the ground truth.
        diff = estimated_labels - ground_truth
        diff[diff != 0] = 1

        # error
        inequal_mask = ground_truth != 0
        inequal_error_count = torch.sum(diff[inequal_mask])

        error_count = torch.sum(diff)

        equal_error_count = error_count - inequal_error_count

        # total
        total_count = depth_ratio.size(0)
        # Count the non-zero labels without overwriting the caller's tensor
        # (the previous code set every non-zero ground-truth entry to 1 in
        # place, destroying the labels' signs for any later use).
        inequal_count_total = torch.sum(inequal_mask.float())
        equal_total_count = total_count - inequal_count_total

        error_list = [equal_error_count, inequal_error_count, error_count]
        count_list = [equal_total_count, inequal_count_total, total_count]

        return error_list, count_list


    def computeSDR(self, prediction_d, targets):
        """Accumulate SDR error and sample counts over a batch.

        Samples whose ``targets['has_SfM_feature']`` entry is False are
        skipped. The prediction is exponentiated before point depths are
        looked up.
        NOTE(review): the indexing assumes ``targets`` has the layout produced
        by the SDR test data loader — confirm against the caller.
        """
        #  for each image
        total_error = [0,0,0]
        total_samples = [0,0,0]

        for i in range(0, prediction_d.size(0)):

            if targets['has_SfM_feature'][i] == False:
                continue

            x_A_arr = targets["sdr_xA"][i].squeeze(0)
            x_B_arr = targets["sdr_xB"][i].squeeze(0)
            y_A_arr = targets["sdr_yA"][i].squeeze(0)
            y_B_arr = targets["sdr_yB"][i].squeeze(0)

            predict_depth = torch.exp(prediction_d[i,:,:])
            predict_depth = predict_depth.squeeze(0)
            ground_truth = targets["sdr_gt"][i]

            # Select the columns at x, then pick the row y within each one.
            z_A_arr = torch.gather( torch.index_select(predict_depth, 1 ,x_A_arr.cuda()) , 0, y_A_arr.view(1, -1).cuda())
            z_B_arr = torch.gather( torch.index_select(predict_depth, 1 ,x_B_arr.cuda()) , 0, y_B_arr.view(1, -1).cuda())

            z_A_arr = z_A_arr.squeeze(0)
            z_B_arr = z_B_arr.squeeze(0)

            error_list, count_list  = self.batch_classify(z_A_arr, z_B_arr,ground_truth)

            for j in range(0,3):
                total_error[j] += error_list[j]
                total_samples[j] += count_list[j]

        return  total_error, total_samples


    def evaluate_SDR(self, input_, targets):
        """Run the network on ``input_`` (moved to CUDA) and compute SDR counts."""
        input_images = Variable(input_.cuda() )
        prediction_d = self.netG.forward(input_images)

        total_error, total_samples = self.computeSDR(prediction_d.data, targets)

        return total_error, total_samples

    def rmse_Loss(self, log_prediction_d, mask, log_gt):
        """Return ``sqrt(mean(d^2) - mean(d)^2)`` of the masked log difference.

        ``d`` is ``(log_prediction_d - log_gt) * mask`` and the means divide
        by ``sum(mask)``.
        """
        N = torch.sum(mask)
        log_d_diff = log_prediction_d - log_gt
        log_d_diff = torch.mul(log_d_diff, mask)
        s1 = torch.sum( torch.pow(log_d_diff,2) )/N

        s2 = torch.pow(torch.sum(log_d_diff),2)/(N*N)
        data_loss = s1 - s2

        data_loss = torch.sqrt(data_loss)

        return data_loss

    def evaluate_RMSE(self, input_images, prediction_d, targets):
        """Sum ``rmse_Loss`` over the batch.

        Returns:
            ``(total_loss, count)``: the summed loss and the number of samples.
        """
        count = 0
        total_loss = Variable(torch.cuda.FloatTensor(1))
        total_loss[0] = 0
        mask_0 = Variable(targets['mask_0'].cuda(), requires_grad = False)
        # The loss works in log-depth space.
        d_gt_0 = torch.log(Variable(targets['gt_0'].cuda(), requires_grad = False))

        for i in range(0, mask_0.size(0)):

            total_loss +=  self.rmse_Loss(prediction_d[i,:,:], mask_0[i,:,:], d_gt_0[i,:,:])
            count += 1

        return total_loss.data[0], count


    def evaluate_sc_inv(self, input_, targets):
        """Run the network on ``input_`` (moved to CUDA) and return ``evaluate_RMSE``'s result."""
        input_images = Variable(input_.cuda() )
        prediction_d = self.netG.forward(input_images)
        rmse_loss , count= self.evaluate_RMSE(input_images, prediction_d, targets)

        return rmse_loss, count


    def switch_to_train(self):
        """Put the network into training mode."""
        self.netG.train()

    def switch_to_eval(self):
        """Put the network into evaluation mode."""
        self.netG.eval()


================================================
FILE: MegaDepth/models/__init__.py
================================================


================================================
FILE: MegaDepth/models/base_model.py
================================================
import os
import torch

class BaseModel():
    """Base class for models: stores options and offers save/load helpers.

    Most methods are no-op hooks meant to be overridden by subclasses.
    """

    def name(self):
        return 'BaseModel'

    def initialize(self, opt):
        """Copy the relevant fields of ``opt`` onto the instance."""
        self.opt = opt
        self.gpu_ids = opt.gpu_ids
        self.isTrain = opt.isTrain
        # CUDA tensor type when at least one GPU id is configured.
        self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor
        self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)

    def set_input(self, input):
        self.input = input

    def forward(self):
        pass

    # used in test time, no backprop
    def test(self):
        pass

    def get_image_paths(self):
        pass

    def optimize_parameters(self):
        pass

    def get_current_visuals(self):
        return self.input

    def get_current_errors(self):
        return {}

    def save(self, label):
        pass

    # helper saving function that can be used by subclasses
    def save_network(self, network, network_label, epoch_label, gpu_ids):
        """Save ``network``'s state dict under ``self.save_dir``.

        The network is moved to the CPU for saving and, if ``gpu_ids`` is
        non-empty and CUDA is available, moved to ``gpu_ids[0]`` afterwards.
        NOTE(review): the file name written here starts with an underscore
        while ``load_network`` reads a name without one — confirm whether
        that mismatch is intended.
        """
        save_filename = '_%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        torch.save(network.cpu().state_dict(), save_path)
        if len(gpu_ids) and torch.cuda.is_available():
            # Pass the device positionally: Module.cuda() no longer has a
            # `device_id` keyword (it is `device`), so the keyword form fails.
            network.cuda(gpu_ids[0])

    # helper loading function that can be used by subclasses
    def load_network(self, network, network_label, epoch_label):
        """Load and return the object stored for the given labels.

        The loaded object is returned as-is; it is not applied to ``network``.
        """
        save_filename = '%s_net_%s.pth' % (epoch_label, network_label)
        save_path = os.path.join(self.save_dir, save_filename)
        print(save_path)
        model = torch.load(save_path)
        return model

    def update_learning_rate(self):
        """Hook for subclasses; now takes ``self`` so it is callable on an instance."""
        pass


================================================
FILE: MegaDepth/models/models.py
================================================

def create_model(opt,pretrained=None):
    """Build and return an ``HGModel`` from ``opt`` and optional weights.

    ``pretrained`` is forwarded unchanged to the ``HGModel`` constructor.
    """
    # The import happens at call time, inside the function.
    from .HG_model import HGModel
    return HGModel(opt, pretrained)


================================================
FILE: MegaDepth/options/__init__.py
================================================


================================================
FILE: MegaDepth/options/base_options.py
================================================
import argparse
import os
from ..util import util

class BaseOptions():
    """Command-line options shared by the train and test option sets.

    Subclasses are expected to set ``self.isTrain`` in their ``initialize``;
    ``parse`` reads it.
    """

    def __init__(self):
        self.parser = argparse.ArgumentParser()
        self.initialized = False

    def initialize(self):
        """Register the common arguments on ``self.parser``."""
        add = self.parser.add_argument

        add('--batchSize', type=int, default=1, help='input batch size')
        add('--loadSize', type=int, default=286, help='scale images to this size')
        add('--fineSize', type=int, default=256, help='then crop to this size')
        add('--input_nc', type=int, default=3, help='# of input image channels')
        add('--output_nc', type=int, default=3, help='# of output image channels')
        add('--ngf', type=int, default=64, help='# of gen filters in first conv layer')
        add('--ndf', type=int, default=64, help='# of discrim filters in first conv layer')
        add('--which_model_netG', type=str, default='unet_256', help='selects model to use for netG')
        add('--gpu_ids', type=str, default='0,1', help='gpu ids: e.g. 0  0,1,2, 0,2')
        add('--name', type=str, default='test_local', help='name of the experiment. It decides where to store samples and models')
        add('--model', type=str, default='pix2pix',
            help='chooses which model to use. cycle_gan, one_direction_test, pix2pix, ...')
        add('--nThreads', default=2, type=int, help='# threads for loading data')
        add('--checkpoints_dir', type=str, default='./checkpoints/', help='models are saved here')
        add('--norm', type=str, default='instance', help='instance normalization or batch normalization')
        add('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly')
        add('--display_winsize', type=int, default=256,  help='display window size')
        add('--display_id', type=int, default=1, help='window id of the web display')
        add('--identity', type=float, default=0.0, help='use identity mapping. Setting identity other than 1 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set optidentity = 0.1')
        add('--use_dropout', action='store_true', help='use dropout for the generator')
        add('--max_dataset_size', type=int, default=float("inf"), help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.')

        self.initialized = True

    def parse(self):
        """Parse the known command-line arguments and persist them.

        Unknown arguments are ignored. ``gpu_ids`` is converted from a
        comma-separated string into a list of the non-negative ids. The
        options are also written to ``<checkpoints_dir>/<name>/opt.txt``.

        Returns:
            The populated options namespace (also stored as ``self.opt``).
        """
        if not self.initialized:
            self.initialize()

        parsed, _unknown = self.parser.parse_known_args()
        self.opt = parsed
        self.opt.isTrain = self.isTrain   # train or test

        # Keep only ids >= 0; a negative id is dropped.
        self.opt.gpu_ids = [
            gpu_id
            for gpu_id in (int(token) for token in self.opt.gpu_ids.split(','))
            if gpu_id >= 0
        ]

        option_items = sorted(vars(self.opt).items())

        # save to the disk
        expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.name)
        util.mkdirs(expr_dir)
        with open(os.path.join(expr_dir, 'opt.txt'), 'wt') as opt_file:
            opt_file.write('------------ Options -------------\n')
            opt_file.writelines(
                '%s: %s\n' % (str(key), str(value)) for key, value in option_items)
            opt_file.write('-------------- End ----------------\n')
        return self.opt


================================================
FILE: MegaDepth/options/test_options.py
================================================
from .base_options import BaseOptions

class TestOptions(BaseOptions):
    """Option set for testing: the common arguments plus test-only ones."""

    def initialize(self):
        """Register the common and test-specific arguments; mark as not training."""
        BaseOptions.initialize(self)

        add = self.parser.add_argument
        add('--ntest', type=int, default=float("inf"), help='# of test examples.')
        add('--results_dir', type=str, default='./results/', help='saves results here.')
        add('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images')
        add('--phase', type=str, default='test', help='train, val, test, etc')
        add('--which_epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')
        add('--how_many', type=int, default=50, help='how many test images to run')

        self.isTrain = False


================================================
FILE: MegaDepth/options/train_options.py
================================================
from .base_options import BaseOptions

class TrainOptions(BaseOptions):
    """Option set for training: the common arguments plus training-only ones."""

    def initialize(self):
        """Register the common and training-specific arguments; mark as training."""
        BaseOptions.initialize(self)

        add = self.parser.add_argument
        add('--display_freq', type=int, default=100, help='frequency of showing training results on screen')
        add('--print_freq', type=int, default=100, help='frequency of showing training results on console')
        add('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results')
        add('--save_epoch_freq', type=int, default=5, help='frequency of saving checkpoints at the end of epochs')
        add('--continue_train', action='store_true', help='continue training: load the latest model')
        add('--phase', type=str, default='train', help='train, val, test, etc')
        add('--which_epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')
        add('--niter', type=int, default=100, help='# of iter at starting learning rate')
        add('--niter_decay', type=int, default=100, help='# of iter to linearly decay learning rate to zero')
        add('--beta1', type=float, default=0.5, help='momentum term of adam')
        add('--lr', type=float, default=0.0002, help='initial learning rate for adam')
        add('--no_lsgan', action='store_true', help='do *not* use least square GAN, if false, use vanilla GAN')
        add('--lambda_A', type=float, default=10.0, help='weight for cycle loss (A -> B -> A)')
        add('--lambda_B', type=float, default=10.0, help='weight for cycle loss (B -> A -> B)')
        add('--pool_size', type=int, default=50, help='the size of image buffer that stores previously generated images')
        add('--no_html', action='store_true', help='do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/')
        add('--no_flip', action='store_true', help='if specified, do not flip the images for data argumentation')

        # A '--preprocessing' option (resizing/cropping strategy) is not implemented.
        self.isTrain = True


================================================
FILE: MegaDepth/pytorch_DIW_scratch.py
================================================

import torch
import torch.nn as nn
from torch.autograd import Variable
from functools import reduce

class LambdaBase(nn.Sequential):
    """``nn.Sequential`` container that also carries a plain callable.

    The child modules are not chained: ``forward_prepare`` applies each of
    them independently to the same input.
    """

    def __init__(self, fn, *args):
        super(LambdaBase, self).__init__(*args)
        self.lambda_func = fn

    def forward_prepare(self, input):
        """Return the list of every child's output, or ``input`` if there are none."""
        branch_outputs = [child(input) for child in self._modules.values()]
        if branch_outputs:
            return branch_outputs
        return input

class Lambda(LambdaBase):
    """Applies the stored callable once to the prepared input."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        return self.lambda_func(prepared)

class LambdaMap(LambdaBase):
    """Applies the stored callable to each element of the prepared input."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        return list(map(self.lambda_func, prepared))

class LambdaReduce(LambdaBase):
    """Fold the result of ``forward_prepare`` with the binary ``lambda_func``."""

    def forward(self, input):
        prepared = self.forward_prepare(input)
        # Left fold: lambda_func(lambda_func(a, b), c) ...
        return reduce(self.lambda_func, prepared)


# Module-level network definition written as one nested nn.Sequential literal.
# The trailing comments (Sequential, ConcatTable, Concat, CAddTable) presumably
# name the containers of an original Torch7 model this was converted from --
# TODO confirm.
#
# Layout as written below:
#   * stem: 7x7 conv 3 -> 128 channels, BatchNorm2d, ReLU;
#   * nested two-branch body: each LambdaMap(lambda x: x, ...) runs its two
#     child branches on the same input -- one branch pools (MaxPool2d or
#     AvgPool2d) and later applies UpsamplingNearest2d(scale_factor=2), the
#     other stays at the input resolution -- and the LambdaReduce(lambda x,y: x+y)
#     that follows adds the two branch outputs;
#   * each LambdaReduce(... torch.cat ...) runs four parallel conv stacks with
#     different kernel sizes on the same input and concatenates them on dim 1;
#   * head: 3x3 conv 64 -> 1 channel.
# BatchNorm2d(C,1e-05,0.1,False) passes eps, momentum and affine=False
# positionally.
pytorch_DIW_scratch = nn.Sequential( # Sequential,
	nn.Conv2d(3,128,(7, 7),(1, 1),(3, 3)),
	nn.BatchNorm2d(128),
	nn.ReLU(),
	nn.Sequential( # Sequential,
		LambdaMap(lambda x: x, # ConcatTable,
			nn.Sequential( # Sequential,
				nn.MaxPool2d((2, 2),(2, 2)),
				LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
				),
				LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
				),
				nn.Sequential( # Sequential,
					LambdaMap(lambda x: x, # ConcatTable,
						nn.Sequential( # Sequential,
							nn.MaxPool2d((2, 2),(2, 2)),
							LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
							),
							LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
								nn.Sequential( # Sequential,
									nn.Conv2d(128,64,(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
							),
							nn.Sequential( # Sequential,
								LambdaMap(lambda x: x, # ConcatTable,
									nn.Sequential( # Sequential,
										LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
										),
										LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
										),
									),
									nn.Sequential( # Sequential,
										nn.AvgPool2d((2, 2),(2, 2)),
										LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
										),
										LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
										),
										nn.Sequential( # Sequential,
											LambdaMap(lambda x: x, # ConcatTable,
												nn.Sequential( # Sequential,
													LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
														nn.Sequential( # Sequential,
															nn.Conv2d(256,64,(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
													),
													LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
														nn.Sequential( # Sequential,
															nn.Conv2d(256,64,(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
													),
												),
												nn.Sequential( # Sequential,
													nn.AvgPool2d((2, 2),(2, 2)),
													LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
														nn.Sequential( # Sequential,
															nn.Conv2d(256,64,(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
													),
													LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
														nn.Sequential( # Sequential,
															nn.Conv2d(256,64,(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
													),
													LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
														nn.Sequential( # Sequential,
															nn.Conv2d(256,64,(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
														nn.Sequential( # Sequential,
															nn.Conv2d(256,32,(1, 1)),
															nn.BatchNorm2d(32,1e-05,0.1,False),
															nn.ReLU(),
															nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
															nn.BatchNorm2d(64,1e-05,0.1,False),
															nn.ReLU(),
														),
													),
													nn.UpsamplingNearest2d(scale_factor=2),
												),
											),
											LambdaReduce(lambda x,y: x+y), # CAddTable,
										),
										LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,32,(1, 1)),
												nn.BatchNorm2d(32,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
										),
										LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
											nn.Sequential( # Sequential,
												nn.Conv2d(256,64,(1, 1)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
												nn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)),
												nn.BatchNorm2d(64,1e-05,0.1,False),
												nn.ReLU(),
											),
										),
										nn.UpsamplingNearest2d(scale_factor=2),
									),
								),
								LambdaReduce(lambda x,y: x+y), # CAddTable,
							),
							LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
								nn.Sequential( # Sequential,
									nn.Conv2d(256,64,(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
								),
							),
							LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(256,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
							),
							nn.UpsamplingNearest2d(scale_factor=2),
						),
						nn.Sequential( # Sequential,
							LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
							),
							LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
								nn.Sequential( # Sequential,
									nn.Conv2d(128,32,(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,64,(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,64,(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
								nn.Sequential( # Sequential,
									nn.Conv2d(128,64,(1, 1)),
									nn.BatchNorm2d(64,1e-05,0.1,False),
									nn.ReLU(),
									nn.Conv2d(64,32,(11, 11),(1, 1),(5, 5)),
									nn.BatchNorm2d(32,1e-05,0.1,False),
									nn.ReLU(),
								),
							),
						),
					),
					LambdaReduce(lambda x,y: x+y), # CAddTable,
				),
				LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,64,(1, 1)),
						nn.BatchNorm2d(64,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,64,(1, 1)),
						nn.BatchNorm2d(64,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(64,32,(5, 5),(1, 1),(2, 2)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,64,(1, 1)),
						nn.BatchNorm2d(64,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
					),
				),
				LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
					nn.Sequential( # Sequential,
						nn.Conv2d(128,16,(1, 1)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,16,(3, 3),(1, 1),(1, 1)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,16,(7, 7),(1, 1),(3, 3)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,32,(1, 1)),
						nn.BatchNorm2d(32,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(32,16,(11, 11),(1, 1),(5, 5)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
				),
				nn.UpsamplingNearest2d(scale_factor=2),
			),
			nn.Sequential( # Sequential,
				LambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,
					nn.Sequential( # Sequential,
						nn.Conv2d(128,16,(1, 1)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,64,(1, 1)),
						nn.BatchNorm2d(64,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(64,16,(3, 3),(1, 1),(1, 1)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,64,(1, 1)),
						nn.BatchNorm2d(64,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(64,16,(7, 7),(1, 1),(3, 3)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
					nn.Sequential( # Sequential,
						nn.Conv2d(128,64,(1, 1)),
						nn.BatchNorm2d(64,1e-05,0.1,False),
						nn.ReLU(),
						nn.Conv2d(64,16,(11, 11),(1, 1),(5, 5)),
						nn.BatchNorm2d(16,1e-05,0.1,False),
						nn.ReLU(),
					),
				),
			),
		),
		LambdaReduce(lambda x,y: x+y), # CAddTable,
	),
	nn.Conv2d(64,1,(3, 3),(1, 1),(1, 1)),
)

================================================
FILE: MegaDepth/rmse_error_main.py
================================================
import time
import torch
import sys

from options.train_options import TrainOptions
opt = TrainOptions().parse()  # set CUDA_VISIBLE_DEVICES before import torch
from data.data_loader import CreateDataLoader
from models.models import create_model

# Hard-coded, machine-specific locations of the dataset and the test lists.
# NOTE(review): the list paths spell "MegaDpeth_code"; presumably this matches
# the directory name on the original machine -- verify before changing.
dataset_root = "/phoenix/S6/zl548/"
test_list_dir_l = '/phoenix/S6/zl548/MegaDpeth_code/test_list/landscape/'
# Landscape list is loaded at height 240 x width 320.
input_height = 240
input_width = 320
is_flipped = False
shuffle = False

test_data_loader_l = CreateDataLoader(dataset_root, test_list_dir_l, input_height, input_width, is_flipped, shuffle)
test_dataset_l = test_data_loader_l.load_data()
test_dataset_size_l = len(test_data_loader_l)
print('========================= test images = %d' % test_dataset_size_l)
test_list_dir_p = '/phoenix/S6/zl548/MegaDpeth_code/test_list/portrait/'
# Portrait list swaps the two dimensions: height 320 x width 240.
input_height = 320
input_width = 240
test_data_loader_p = CreateDataLoader(dataset_root, test_list_dir_p, input_height, input_width, is_flipped, shuffle)
test_dataset_p = test_data_loader_p.load_data()
test_dataset_size_p = len(test_data_loader_p)
print('========================= test images = %d' % test_dataset_size_p)


# Build the model from the command-line options parsed at import time (opt).
model = create_model(opt)


def _running_mean(total, count):
    """Return ``total / count``, or NaN while nothing has been counted."""
    if count == 0:
        return float('nan')
    return total / float(count)


def test(model):
    """Evaluate ``model`` on the landscape and portrait test sets.

    Switches the model to eval mode, calls ``model.evaluate_sc_inv`` on the
    ``'img_1'`` / ``'target_1'`` entries of every batch in ``test_dataset_l``
    and then ``test_dataset_p``, and prints the running and final ratio of
    accumulated loss to accumulated count.

    Args:
        model: object providing ``switch_to_eval()`` and
            ``evaluate_sc_inv(images, targets)`` returning ``(loss, count)``.

    Returns:
        The final accumulated loss divided by the accumulated count, or NaN
        when the count is zero (for example when both datasets are empty),
        instead of dividing by zero.
    """
    total_loss = 0
    total_count = 0
    print("============================= TEST ============================")
    model.switch_to_eval()
    # Both datasets share one running total; landscape is processed first.
    for dataset in (test_dataset_l, test_dataset_p):
        for data in dataset:
            stacked_img = data['img_1']
            targets = data['target_1']

            rmse_loss, count = model.evaluate_sc_inv(stacked_img, targets)

            total_loss += rmse_loss
            total_count += count

            print('RMSE loss is', _running_mean(total_loss, total_count))

    average = _running_mean(total_loss, total_count)
    print('average RMSE loss is', average)
    return average

# Script entry: run the evaluation once on the model built above and bracket
# it with progress messages.
print("WE ARE IN TESTING RMSE!!!!")
test(model)
print("WE ARE DONE TESTING!!!")


print("We are done")


================================================
FILE: MegaDepth/util/__init__.py
================================================


================================================
FILE: MegaDepth/util/html.py
================================================
import dominate
from dominate.tags import *
import os


class HTML:
    """Build a static HTML page of image tables with ``dominate``.

    The page is written to ``<web_dir>/index.html`` and refers to images by
    paths relative to ``<web_dir>/images``.
    """

    def __init__(self, web_dir, title, reflesh=0):
        """Create the output directories and an empty document.

        Args:
            web_dir: directory that will contain ``index.html`` and the
                ``images`` sub-directory; both are created when missing.
            title: document title.
            reflesh: auto-refresh interval passed as the meta tag's
                ``content``; no meta tag is added when it is not positive.
                The misspelled parameter name is kept so existing keyword
                callers keep working.
        """
        self.title = title
        self.web_dir = web_dir
        self.img_dir = os.path.join(self.web_dir, 'images')
        for directory in (self.web_dir, self.img_dir):
            if not os.path.exists(directory):
                os.makedirs(directory)

        self.doc = dominate.document(title=title)
        if reflesh > 0:
            with self.doc.head:
                # The HTML directive is spelled "refresh"; the previous
                # value "reflesh" is ignored by browsers, so the page never
                # reloaded.
                meta(http_equiv="refresh", content=str(reflesh))

    def get_image_dir(self):
        """Return the directory where images for this page are stored."""
        return self.img_dir

    def add_header(self, str):
        """Append an ``<h3>`` heading with the given text to the document."""
        with self.doc:
            h3(str)

    def add_table(self, border=1):
        """Append a new fixed-layout table and remember it as ``self.t``."""
        self.t = table(border=border, style="table-layout: fixed;")
        self.doc.add(self.t)

    def add_images(self, ims, txts, links, width=400):
        """Append one table row of linked images with captions.

        Args:
            ims: image file names, relative to the ``images`` directory.
            txts: caption text for each image.
            links: link targets, relative to the ``images`` directory.
            width: displayed image width in pixels.

        The three sequences are zipped, so extra items in a longer sequence
        are ignored.
        """
        self.add_table()
        with self.t:
            with tr():
                for im, txt, link in zip(ims, txts, links):
                    with td(style="word-wrap: break-word;", halign="center", valign="top"):
                        with p():
                            with a(href=os.path.join('images', link)):
                                img(style="width:%dpx" % width, src=os.path.join('images', im))
                            br()
                            p(txt)

    def save(self):
        """Render the document and write it to ``<web_dir>/index.html``."""
        html_file = '%s/index.html' % self.web_dir
        # The context manager closes the file even if rendering or writing
        # raises.
        with open(html_file, 'wt') as f:
            f.write(self.doc.render())


if __name__ == '__main__':
    # Manual smoke test: write a page with a header and one row of four
    # placeholder images into ./web/.
    page = HTML('web/', 'test_html')
    page.add_header('hello world')

    indices = range(4)
    image_names = ['image_%d.png' % n for n in indices]
    captions = ['text_%d' % n for n in indices]
    # Each image links to itself.
    page.add_images(image_names, captions, list(image_names))
    page.save()


================================================
FILE: MegaDepth/util/image_pool.py
================================================
import random
import numpy as np
import torch
from pdb import set_trace as st
from torch.autograd import Variable
class ImagePool():
    """Bounded buffer of previously queried images.

    With ``pool_size == 0`` the pool is disabled and ``query`` returns its
    argument unchanged.
    """

    def __init__(self, pool_size):
        self.pool_size = pool_size
        if pool_size > 0:
            # Storage is only set up for an enabled pool.
            self.images = []
            self.num_imgs = 0

    def query(self, images):
        """Return a batch built from ``images`` and stored images.

        While the pool is not full, every incoming image is stored and
        returned as is. Once it is full, each incoming image either replaces
        a randomly chosen stored image (and that stored image is returned in
        its place) or is returned directly, decided by a uniform random draw
        against 0.5.
        """
        if self.pool_size == 0:
            return images

        selected = []
        for sample in images.data:
            sample = torch.unsqueeze(sample, 0)
            if self.num_imgs < self.pool_size:
                self.num_imgs += 1
                self.images.append(sample)
                selected.append(sample)
                continue
            if random.uniform(0, 1) > 0.5:
                slot = random.randint(0, self.pool_size - 1)
                stored = self.images[slot].clone()
                self.images[slot] = sample
                selected.append(stored)
            else:
                selected.append(sample)
        return Variable(torch.cat(selected, 0))


================================================
FILE: MegaDepth/util/png.py
================================================
import struct
import zlib

def encode(buf, width, height):
  """Encode a raw 8-bit RGB buffer as PNG file bytes.

  buf must be bytes or a bytearray in py3 (a regular string in py2), laid out
  as RGBRGB... with exactly width * height * 3 bytes. Rows are written in
  reverse vertical order, i.e. the last row of buf becomes the first
  scanline of the image.
  """
  assert (width * height * 3 == len(buf))
  bytes_per_pixel = 3
  stride = width * bytes_per_pixel

  def make_chunk(tag, payload):
    # PNG chunk: big-endian length, tag, payload, CRC over tag + payload.
    checksum = 0xFFFFFFFF & zlib.crc32(payload, zlib.crc32(tag))
    return [struct.pack("!I", len(payload)), tag, payload, struct.pack("!I", checksum)]

  png_signature = b'\x89PNG\r\n\x1a\n'
  color_type_rgb = 2
  depth = 8
  parts = [png_signature]
  parts += make_chunk(
      b'IHDR', struct.pack("!2I5B", width, height, depth, color_type_rgb, 0, 0, 0))

  # Walk the rows from last to first; each scanline is prefixed with a null
  # filter-type byte.
  scanlines = []
  for offset in range((height - 1) * stride, -1, -stride):
    scanlines.append(b'\x00')
    scanlines.append(buf[offset:offset + stride])
  parts += make_chunk(b'IDAT', zlib.compress(b''.join(scanlines), 9))

  parts += make_chunk(b'IEND', b'')
  return b''.join(parts)


================================================
FILE: MegaDepth/util/util.py
================================================
from __future__ import print_function
import torch
import numpy as np
from PIL import Image
import inspect, re
import numpy as np
import os
import collections

# Converts a Tensor into a Numpy array
# |imtype|: the desired type of the converted numpy array
def tensor2im(image_tensor, imtype=np.uint8):
    """Turn the first entry of a tensor batch into a channel-last numpy image.

    The first element of image_tensor is moved to the CPU, converted to float,
    transposed from (C, H, W) to (H, W, C), mapped with (v + 1) / 2 * 255 and
    finally cast to imtype.
    """
    first = image_tensor[0]
    channel_first = first.cpu().float().numpy()
    channel_last = np.transpose(channel_first, (1, 2, 0))
    scaled = (channel_last + 1) / 2.0 * 255.0
    return scaled.astype(imtype)


def diagnose_network(net, name='network'):
    """Print name, then the mean absolute gradient of net.

    For every parameter that has a gradient, the mean of |grad| is taken; the
    printed value is the average of those per-parameter means (0.0 when no
    parameter has a gradient).
    """
    magnitudes = [torch.mean(torch.abs(p.grad.data))
                  for p in net.parameters()
                  if p.grad is not None]
    mean = sum(magnitudes, 0.0)
    if magnitudes:
        mean = mean / len(magnitudes)
    print(name)
    print(mean)


def save_image(image_numpy, image_path):
    """Convert image_numpy to a PIL image and write it to image_path."""
    Image.fromarray(image_numpy).save(image_path)

def info(object, spacing=10, collapse=1):
    """Print methods and doc strings.
    Takes module, class, list, dictionary, or string.

    object:   the object whose callable attributes are listed.
    spacing:  column width the attribute name is left-justified to.
    collapse: when truthy, runs of whitespace in each doc string are
              collapsed into single spaces.
    """
    # The builtin callable() is used instead of collections.Callable, which
    # was removed from the collections namespace in Python 3.10.
    methodList = [e for e in dir(object) if callable(getattr(object, e))]

    if collapse:
        def processFunc(s):
            return " ".join(s.split())
    else:
        def processFunc(s):
            return s

    print( "\n".join(["%s %s" %
                     (method.ljust(spacing),
                      processFunc(str(getattr(object, method).__doc__)))
                     for method in methodList]) )

def varname(p):
    """Return the identifier that was passed to varname() at the call site.

    The source context of the calling frame is searched for a call of the form
    varname(<identifier>); the first identifier found is returned. Falls
    through (returning None) when no context line matches.
    """
    caller = inspect.currentframe().f_back
    context_lines = inspect.getframeinfo(caller)[3]
    call_pattern = r'\bvarname\s*\(\s*([A-Za-z_][A-Za-z0-9_]*)\s*\)'
    for text in context_lines:
        found = re.search(call_pattern, text)
        if found:
            return found.group(1)

def print_numpy(x, val=True, shp=False):
    """Print diagnostics for array x after converting it to float64.

    shp: when true, print the array shape.
    val: when true, print mean, min, max, median and standard deviation of
         the flattened array.
    """
    arr = x.astype(np.float64)
    if shp:
        print('shape,', arr.shape)
    if not val:
        return
    flat = arr.flatten()
    stats = (np.mean(flat), np.min(flat), np.max(flat), np.median(flat), np.std(flat))
    print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % stats)


def mkdirs(paths):
    """Create one directory, or every directory in a list of paths.

    A list is iterated and each entry is handed to mkdir; any other value is
    passed to mkdir as a single path.
    """
    # a list can never also be a str, so checking for list alone is sufficient
    if not isinstance(paths, list):
        mkdir(paths)
        return
    for entry in paths:
        mkdir(entry)


def mkdir(path):
    """Create directory path (with missing parents) unless it already exists.

    Safe to call concurrently: if something else creates the path between the
    existence check and the os.makedirs call, the resulting OSError is
    swallowed. Any other failure (e.g. permission denied) is re-raised.
    """
    if os.path.exists(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Lost a race with another creator: fine as long as the path is there now.
        if not os.path.exists(path):
            raise


================================================
FILE: MegaDepth/util/visualizer.py
================================================
import numpy as np
import os
import ntpath
import time
from . import util
from . import html

class Visualizer():
    """Shows and stores results: visdom windows, an HTML page and stdout logs."""

    def __init__(self, opt):
        """Read display settings from opt and prepare the visdom / HTML outputs."""
        self.display_id = opt.display_id
        self.use_html = opt.isTrain and not opt.no_html
        self.win_size = opt.display_winsize
        self.name = opt.name

        # visdom is only imported when a positive display id asks for it
        if self.display_id > 0:
            import visdom
            self.vis = visdom.Visdom()

        if self.use_html:
            web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web')
            self.web_dir = web_dir
            self.img_dir = os.path.join(web_dir, 'images')
            print('create web directory %s...' % self.web_dir)
            util.mkdirs([self.web_dir, self.img_dir])

    # |visuals|: dictionary of images to display or save
    def display_current_results(self, visuals, epoch):
        """Push visuals to visdom and/or save them and rebuild the HTML page."""
        if self.display_id > 0:
            # one visdom window per image, numbered upwards from display_id + 1
            for offset, (label, image_numpy) in enumerate(visuals.items(), 1):
                self.vis.image(image_numpy.transpose([2,0,1]),
                               opts=dict(title=label),
                               win=self.display_id + offset)

        if not self.use_html:
            return

        # write this epoch's images to disk
        for label, image_numpy in visuals.items():
            file_name = 'epoch%.3d_%s.png' % (epoch, label)
            util.save_image(image_numpy, os.path.join(self.img_dir, file_name))

        # regenerate the page, newest epoch first
        webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, reflesh=1)
        for n in range(epoch, 0, -1):
            webpage.add_header('epoch [%d]' % n)
            ims = []
            txts = []
            for label, _ in visuals.items():
                ims.append('epoch%.3d_%s.png' % (n, label))
                txts.append(label)
            links = list(ims)
            webpage.add_images(ims, txts, links, width=self.win_size)
        webpage.save()

    # errors: dictionary of error labels and values
    def plot_current_errors(self, epoch, counter_ratio, opt, errors):
        """Append the current errors to the running history and redraw the visdom line plot."""
        if not hasattr(self, 'plot_data'):
            self.plot_data = {'X':[],'Y':[], 'legend':list(errors.keys())}
        history = self.plot_data
        legend = history['legend']
        history['X'].append(epoch + counter_ratio)
        history['Y'].append([errors[key] for key in legend])

        # X gets one identical column per legend entry so it lines up with Y
        xs = np.stack([np.array(history['X'])]*len(legend),1)
        ys = np.array(history['Y'])
        plot_opts = {
            'title': self.name + ' loss over time',
            'legend': legend,
            'xlabel': 'epoch',
            'ylabel': 'loss'}
        self.vis.line(X=xs, Y=ys, opts=plot_opts, win=self.display_id)

    # errors: same format as |errors| of plotCurrentErrors
    def print_current_errors(self, epoch, i, errors, t):
        """Print one line with epoch, iteration, time and every error value."""
        prefix = '(epoch: %d, iters: %d, time: %.3f) ' % (epoch, i, t)
        body = ''.join('%s: %.3f ' % (k, v) for k, v in errors.items())
        print(prefix + body)

    # save image to the disk
    def save_images(self, webpage, visuals, image_path):
        """Save every visual into the webpage's image directory and list them on the page."""
        image_dir = webpage.get_image_dir()
        # the file stem of the first path names the header and the saved files
        name = os.path.splitext(ntpath.basename(image_path[0]))[0]

        webpage.add_header(name)
        ims, txts, links = [], [], []
        for label, image_numpy in visuals.items():
            image_name = '%s_%s.png' % (name, label)
            util.save_image(image_numpy, os.path.join(image_dir, image_name))
            ims.append(image_name)
            txts.append(label)
            links.append(image_name)
        webpage.add_images(ims, txts, links, width=self.win_size)


================================================
FILE: PWCNet/PWCNet.py
================================================
"""
implementation of the PWC-DC network for optical flow estimation by Sun et al., 2018

Jinwei Gu and Zhile Ren

"""

import torch
import torch.nn as nn
from torch.autograd import Variable
import os
# Set before the compiled correlation extension is imported below.
os.environ['PYTHON_EGG_CACHE'] = 'tmp/' # a writable directory 
#from .correlation_package.modules.corr import Correlation
# from PWCNet.correlation_package_pytorch0_4.correlation import Correlation #pytorch0.4 version
from PWCNet.correlation_package_pytorch1_0.correlation import Correlation # pytorch 1.0 version

import numpy as np


# Names exported by a star-import of this module: the two model factory functions.
__all__ = [
    'pwc_dc_net', 'pwc_dc_net_old'
    ]

def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Build a biased 2-D convolution followed by LeakyReLU(0.1) as one nn.Sequential."""
    convolution = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=True,
    )
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(convolution, activation)

def predict_flow(in_planes):
    """Build the flow head: a biased 3x3, stride-1 convolution with 2 output channels."""
    return nn.Conv2d(
        in_planes,
        2,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=True,
    )

def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
    """Build a biased transposed 2-D convolution (with the defaults it doubles H and W)."""
    return nn.ConvTranspose2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=True,
    )

import time

class PWCDCNet(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    The constructor allocates the warping grid with .cuda() and warp() builds
    its mask with torch.cuda.FloatTensor, so a CUDA device is required.
    """
    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warping

        """
        super(PWCDCNet,self).__init__()

        # feature pyramid: six levels, each opening with a stride-2 convolution
        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)
        self.conv1aa = conv(16,  16, kernel_size=3, stride=1)
        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)
        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)
        self.conv2aa = conv(32,  32, kernel_size=3, stride=1)
        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)
        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)
        self.conv3aa = conv(64,  64, kernel_size=3, stride=1)
        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)
        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)
        self.conv4aa = conv(96,  96, kernel_size=3, stride=1)
        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)
        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)
        self.conv5aa = conv(128,128, kernel_size=3, stride=1)
        self.conv5b  = conv(128,128, kernel_size=3, stride=1)
        self.conv6aa = conv(128,196, kernel_size=3, stride=2)
        self.conv6a  = conv(196,196, kernel_size=3, stride=1)
        self.conv6b  = conv(196,196, kernel_size=3, stride=1)

        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # number of channels produced by the correlation layer
        nd = (2*md+1)**2
        # Cumulative number of channels added by the five densely connected
        # convs of a level. Converted with int() only: the former
        # .astype(np.int) was redundant and np.int no longer exists in
        # NumPy >= 1.24.
        dd = [int(d) for d in np.cumsum([128,128,96,64,32])]

        # level 6: the decoder input is the correlation volume alone
        od = nd
        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # levels 5..2: correlation (nd) + level features + upsampled flow (2)
        # + upsampled decoder features (2)
        od = nd+128+4
        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # dilated-convolution refinement applied to the level-2 decoder features
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv7 = predict_flow(32)

        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()

        # Pre-compute a pixel-coordinate grid (channel 0 = x, channel 1 = y)
        # of the largest supported size; warp() slices it instead of
        # rebuilding it on every call.
        W_MAX = 2048
        H_MAX = 1024
        B_MAX = 3
        xx = torch.arange(0, W_MAX).view(1,-1).cuda().repeat(H_MAX,1)
        yy = torch.arange(0, H_MAX).view(-1,1).cuda().repeat(1,W_MAX)
        xx = xx.view(1,1,H_MAX,W_MAX).repeat(B_MAX,1,1,1)
        yy = yy.view(1,1,H_MAX,W_MAX).repeat(B_MAX,1,1,1)
        grid = torch.cat((xx,yy),1).float()

        ## for saving time on allocating a grid in forward
        self.W_MAX = W_MAX
        self.H_MAX = H_MAX
        self.B_MAX = B_MAX
        self.grid = Variable(grid, requires_grad=False)


    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        B, H and W must not exceed B_MAX, H_MAX and W_MAX of the pre-computed
        grid. Output positions whose bilinear sample is not (almost) fully
        covered by x are zeroed.

        """
        B, C, H, W = x.size()
        assert(B <= self.B_MAX and H <= self.H_MAX and W <= self.W_MAX)
        # absolute sampling positions = pixel coordinates + flow
        vgrid = self.grid[:B,:,:H,:W] +flo

        # scale grid to [-1,1]
        # NOTE(review): dividing by (W-1)/(H-1) matches grid_sample's
        # align_corners=True convention, while newer PyTorch releases default
        # to align_corners=False -- confirm against the PyTorch version in use.
        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:].clone()/max(W-1,1)-1.0
        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:].clone()/max(H-1,1)-1.0

        vgrid = vgrid.permute(0,2,3,1)
        output = nn.functional.grid_sample(x, vgrid)
        # warping an all-ones tensor yields the coverage of every output pixel
        mask = torch.autograd.Variable(torch.cuda.FloatTensor().resize_(x.size()).zero_() + 1, requires_grad = False)
        mask = nn.functional.grid_sample(mask, vgrid)

        # binarise the coverage
        mask[mask<0.9999] = 0
        mask[mask>0] = 1

        return output*mask


    def _dense_block(self, x, layers):
        """Run x through layers, prepending each layer's output to its input along channels."""
        for layer in layers:
            x = torch.cat((layer(x), x),1)
        return x


    def forward(self,x, output_more = False):
        """
        Estimate the flow between the two images stacked in x.

        x: [B, 6, H, W]; channels 0-2 are the first image, channels 3-5 the second
        output_more: when False return only flow2 (the finest estimate,
                     predicted from the level-2 features); when True return
                     the list [flow2, flow3, flow4, flow5, flow6]

        """
        im1 = x[:,:3,:,:]
        im2 = x[:,3:,:,:]

        # feature pyramids of both images (shared weights)
        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))
        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))
        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))
        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))
        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))
        c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))
        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))
        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))
        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))
        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))
        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))
        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))

        # level 6: correlate the coarsest features directly
        corr6 = self.corr(c16, c26)
        corr6 = self.leakyRELU(corr6)
        x = self._dense_block(corr6, (self.conv6_0, self.conv6_1, self.conv6_2, self.conv6_3, self.conv6_4))
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        # levels 5..2: warp the second image's features with the upsampled
        # flow (scaled by a per-level constant), correlate, decode
        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.corr(c15, warp5)
        corr5 = self.leakyRELU(corr5)
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = self._dense_block(x, (self.conv5_0, self.conv5_1, self.conv5_2, self.conv5_3, self.conv5_4))
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.corr(c14, warp4)
        corr4 = self.leakyRELU(corr4)
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = self._dense_block(x, (self.conv4_0, self.conv4_1, self.conv4_2, self.conv4_3, self.conv4_4))
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.corr(c13, warp3)
        corr3 = self.leakyRELU(corr3)
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = self._dense_block(x, (self.conv3_0, self.conv3_1, self.conv3_2, self.conv3_3, self.conv3_4))
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.corr(c12, warp2)
        corr2 = self.leakyRELU(corr2)
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = self._dense_block(x, (self.conv2_0, self.conv2_1, self.conv2_2, self.conv2_3, self.conv2_4))
        flow2 = self.predict_flow2(x)

        # dilated-convolution refinement adds a residual to flow2
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        # we don't have the gt for flow, we just fine tune it on flownets
        if not output_more:
            return flow2
        else:
            return [flow2,flow3,flow4,flow5,flow6]


class PWCDCNet_old(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    Older variant: two convolutions per pyramid level, and the warping grid is
    rebuilt on every warp() call.
    """
    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warping

        """
        super(PWCDCNet_old,self).__init__()

        # feature pyramid: six levels, each opening with a stride-2 convolution
        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)
        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)
        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)
        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)
        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)
        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)
        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)
        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)
        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)
        self.conv5b  = conv(128,128, kernel_size=3, stride=1)
        self.conv6a  = conv(128,196, kernel_size=3, stride=2)
        self.conv6b  = conv(196,196, kernel_size=3, stride=1)

        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # number of channels produced by the correlation layer
        nd = (2*md+1)**2
        # cumulative channel counts of the dense connections, as plain Python
        # ints so the layer constructors never receive numpy integer types
        dd = [int(d) for d in np.cumsum([128,128,96,64,32])]

        # level 6: the decoder input is the correlation volume alone
        od = nd
        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # levels 5..2: correlation (nd) + level features + upsampled flow (2)
        # + upsampled decoder features (2)
        od = nd+128+4
        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # dilated-convolution refinement applied to the level-2 decoder features
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv7 = predict_flow(32)

        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                # in-place initialiser; the un-suffixed kaiming_normal is deprecated
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()


    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        Output positions whose bilinear sample is not (almost) fully covered
        by x are zeroed.

        """
        B, C, H, W = x.size()
        # mesh grid of pixel coordinates (channel 0 = x, channel 1 = y)
        xx = torch.arange(0, W).view(1,-1).repeat(H,1)
        yy = torch.arange(0, H).view(-1,1).repeat(1,W)
        xx = xx.view(1,1,H,W).repeat(B,1,1,1)
        yy = yy.view(1,1,H,W).repeat(B,1,1,1)
        grid = torch.cat((xx,yy),1).float()

        if x.is_cuda:
            grid = grid.cuda()
        vgrid = Variable(grid) + flo

        # scale grid to [-1,1]
        # NOTE(review): dividing by (W-1)/(H-1) matches grid_sample's
        # align_corners=True convention, while newer PyTorch releases default
        # to align_corners=False -- confirm against the PyTorch version in use.
        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0
        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0

        vgrid = vgrid.permute(0,2,3,1)
        output = nn.functional.grid_sample(x, vgrid)
        # The mask follows x's device the same way the grid does; it used to
        # be moved to CUDA unconditionally, which broke CPU inputs.
        mask = torch.autograd.Variable(torch.ones(x.size()))
        if x.is_cuda:
            mask = mask.cuda()
        mask = nn.functional.grid_sample(mask, vgrid)

        # binarise the coverage
        mask[mask<0.999] = 0
        mask[mask>0] = 1

        return output*mask


    def forward(self,x):
        """
        Estimate the flow between the two images stacked in x.

        x: [B, 6, H, W]; channels 0-2 are the first image, channels 3-5 the second

        Returns the tuple (flow2, flow3, flow4, flow5, flow6) in training mode
        and only flow2 otherwise.

        """
        im1 = x[:,:3,:,:]
        im2 = x[:,3:,:,:]

        # feature pyramids of both images (shared weights)
        c11 = self.conv1b(self.conv1a(im1))
        c21 = self.conv1b(self.conv1a(im2))
        c12 = self.conv2b(self.conv2a(c11))
        c22 = self.conv2b(self.conv2a(c21))
        c13 = self.conv3b(self.conv3a(c12))
        c23 = self.conv3b(self.conv3a(c22))
        c14 = self.conv4b(self.conv4a(c13))
        c24 = self.conv4b(self.conv4a(c23))
        c15 = self.conv5b(self.conv5a(c14))
        c25 = self.conv5b(self.conv5a(c24))
        c16 = self.conv6b(self.conv6a(c15))
        c26 = self.conv6b(self.conv6a(c25))

        # NOTE: the concatenation order below is not uniform across the dense
        # steps; it fixes the channel layout the following layers' weights
        # expect, so it must stay exactly as is.
        corr6 = self.corr(c16, c26)
        corr6 = self.leakyRELU(corr6)
        x = torch.cat((corr6, self.conv6_0(corr6)),1)
        x = torch.cat((self.conv6_1(x), x),1)
        x = torch.cat((x, self.conv6_2(x)),1)
        x = torch.cat((x, self.conv6_3(x)),1)
        x = torch.cat((x, self.conv6_4(x)),1)
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.corr(c15, warp5)
        corr5 = self.leakyRELU(corr5)
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = torch.cat((x, self.conv5_0(x)),1)
        x = torch.cat((self.conv5_1(x), x),1)
        x = torch.cat((x, self.conv5_2(x)),1)
        x = torch.cat((x, self.conv5_3(x)),1)
        x = torch.cat((x, self.conv5_4(x)),1)
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.corr(c14, warp4)
        corr4 = self.leakyRELU(corr4)
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = torch.cat((x, self.conv4_0(x)),1)
        x = torch.cat((self.conv4_1(x), x),1)
        x = torch.cat((x, self.conv4_2(x)),1)
        x = torch.cat((x, self.conv4_3(x)),1)
        x = torch.cat((x, self.conv4_4(x)),1)
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.corr(c13, warp3)
        corr3 = self.leakyRELU(corr3)
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = torch.cat((x, self.conv3_0(x)),1)
        x = torch.cat((self.conv3_1(x), x),1)
        x = torch.cat((x, self.conv3_2(x)),1)
        x = torch.cat((x, self.conv3_3(x)),1)
        x = torch.cat((x, self.conv3_4(x)),1)
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.corr(c12, warp2)
        corr2 = self.leakyRELU(corr2)
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = torch.cat((x, self.conv2_0(x)),1)
        x = torch.cat((self.conv2_1(x), x),1)
        x = torch.cat((x, self.conv2_2(x)),1)
        x = torch.cat((x, self.conv2_3(x)),1)
        x = torch.cat((x, self.conv2_4(x)),1)
        flow2 = self.predict_flow2(x)

        # dilated-convolution refinement adds a residual to flow2
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        if self.training:
            return flow2,flow3,flow4,flow5,flow6
        else:
            return flow2


def pwc_dc_net(path=None):
    """Create a PWCDCNet and, when path is given, load weights from that checkpoint.

    The checkpoint may be a bare state dict or a dict holding one under the
    'state_dict' key.
    """
    net = PWCDCNet()
    if path is None:
        return net

    checkpoint = torch.load(path)
    weights = checkpoint['state_dict'] if 'state_dict' in checkpoint.keys() else checkpoint
    net.load_state_dict(weights)
    return net


def pwc_dc_net_old(path=None):
    """Create a PWCDCNet_old and, when path is given, load weights from that checkpoint.

    The checkpoint may be a bare state dict or a dict holding one under the
    'state_dict' key.
    """
    net = PWCDCNet_old()
    if path is None:
        return net

    checkpoint = torch.load(path)
    weights = checkpoint['state_dict'] if 'state_dict' in checkpoint.keys() else checkpoint
    net.load_state_dict(weights)
    return net


================================================
FILE: PWCNet/__init__.py
================================================
from .PWCNet import *

================================================
FILE: PWCNet/correlation_package_pytorch1_0/__init__.py
================================================


================================================
FILE: PWCNet/correlation_package_pytorch1_0/build.sh
================================================
#!/usr/bin/env bash
# Rebuild and install the package described by setup.py in the current directory.

echo "Need pytorch>=1.0.0"
# Activate the environment named pytorch1.0.0 (presumably a conda env; adjust to your setup).
source activate pytorch1.0.0

# Append ../../my_package (relative to the current directory) to PYTHONPATH.
export PYTHONPATH=$PYTHONPATH:$(pwd)/../../my_package

# Drop artifacts of earlier builds, then install.
rm -rf build *.egg-info dist
python setup.py install


================================================
FILE: PWCNet/correlation_package_pytorch1_0/clean.sh
================================================
#!/usr/bin/env bash
# Remove build artifacts (build/, *.egg-info, dist/) from the current directory.

echo "Need pytorch>=1.0.0"
# Activate the environment named pytorch1.0.0 (presumably a conda env; adjust to your setup).
source activate pytorch1.0.0


# Delete the outputs of earlier builds; nothing is reinstalled here.
rm -rf build *.egg-info dist
#python setup.py install


================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation.py
================================================
import torch
from torch.nn.modules.module import Module
from torch.autograd import Function
import correlation_cuda

class CorrelationFunction(Function):
    """Autograd function that delegates forward and backward to the correlation_cuda extension."""

    def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        """Store the correlation settings that are forwarded to the extension calls."""
        super(CorrelationFunction, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        """Compute the correlation of input1 and input2 on input1's CUDA device."""
        self.save_for_backward(input1, input2)
        settings = (self.pad_size, self.kernel_size, self.max_displacement,
                    self.stride1, self.stride2, self.corr_multiply)

        with torch.cuda.device_of(input1):
            # empty tensors handed to the extension -- presumably it resizes
            # and fills them (scratch buffers and the result); confirm in the
            # C++ source
            scratch1 = input1.new()
            scratch2 = input2.new()
            output = input1.new()

            correlation_cuda.forward(input1, input2, scratch1, scratch2, output, *settings)

        return output

    def backward(self, grad_output):
        """Return the gradients with respect to both inputs saved in forward."""
        input1, input2 = self.saved_tensors
        settings = (self.pad_size, self.kernel_size, self.max_displacement,
                    self.stride1, self.stride2, self.corr_multiply)

        with torch.cuda.device_of(input1):
            scratch1 = input1.new()
            scratch2 = input2.new()

            # empty tensors handed to the extension to receive the gradients
            grad_input1 = input1.new()
            grad_input2 = input2.new()

            correlation_cuda.backward(input1, input2, scratch1, scratch2, grad_output,
                                      grad_input1, grad_input2, *settings)

        return grad_input1, grad_input2


class Correlation(Module):
    """``nn.Module`` wrapper that applies ``CorrelationFunction`` with fixed settings.

    The module has no parameters of its own; it only remembers the
    configuration passed to the constructor.
    """

    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):
        super(Correlation, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        """Correlate ``input1`` with ``input2`` and return the result."""
        # A new function object is built for every call because it carries
        # the per-call saved tensors.
        corr_fn = CorrelationFunction(
            self.pad_size,
            self.kernel_size,
            self.max_displacement,
            self.stride1,
            self.stride2,
            self.corr_multiply,
        )
        return corr_fn(input1, input2)


================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0
#include "correlation_cuda_kernel.cuh"

// Forward pass entry point (registered with pybind as `forward`).
//
// Resizes and zero-fills rInput1, rInput2 and output, then launches the CUDA
// forward kernels on the current CUDA stream.
//   input1, input2 : 4-D tensors of identical shape, indexed as
//                    (batch, channels, height, width)
//   rInput1/2      : scratch tensors, resized to
//                    (batch, height + 2*pad_size, width + 2*pad_size, channels)
//   output         : result, resized to
//                    (batch, nOutputChannels, outputHeight, outputwidth)
// Returns 1 on success.  Raises through AT_ERROR when the arguments are
// invalid or when the kernel launcher reports a CUDA error.
//
// The kernel launcher accepts tensor strides but addresses memory with dense
// offsets, so the inputs are made contiguous here; shapes and strides are
// validated up front so that bad arguments fail with a clear message instead
// of reading out of bounds or dividing by zero.
int correlation_forward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& output,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  if (input1.dim() != 4 || input2.dim() != 4) {
    AT_ERROR("correlation forward: input1 and input2 must be 4-D tensors");
  }
  for (int d = 0; d < 4; ++d) {
    if (input1.size(d) != input2.size(d)) {
      AT_ERROR("correlation forward: input1 and input2 must have the same shape");
    }
  }
  // stride2 is an integer divisor below; stride1 divides the output extent.
  if (stride1 <= 0 || stride2 <= 0) {
    AT_ERROR("correlation forward: stride1 and stride2 must be positive");
  }

  // contiguous() returns the same tensor when it is already dense.
  at::Tensor input1_c = input1.contiguous();
  at::Tensor input2_c = input2.contiguous();

  int batchSize = input1_c.size(0);

  int nInputChannels = input1_c.size(1);
  int inputHeight = input1_c.size(2);
  int inputWidth = input1_c.size(3);

  int kernel_radius = (kernel_size - 1) / 2;
  int border_radius = kernel_radius + max_displacement;

  int paddedInputHeight = inputHeight + 2 * pad_size;
  int paddedInputWidth = inputWidth + 2 * pad_size;

  // One output channel per searched displacement in a square grid.
  int nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1);

  int outputHeight = ceil(static_cast<float>(paddedInputHeight - 2 * border_radius) / static_cast<float>(stride1));
  int outputwidth = ceil(static_cast<float>(paddedInputWidth - 2 * border_radius) / static_cast<float>(stride1));

  if (outputHeight <= 0 || outputwidth <= 0) {
    AT_ERROR("correlation forward: padded input is too small for the requested kernel_size and max_displacement");
  }

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth});

  // The padded copies rely on zeros in the border region.
  rInput1.fill_(0);
  rInput2.fill_(0);
  output.fill_(0);

  int success = correlation_forward_cuda_kernel(
    output,
    output.size(0),
    output.size(1),
    output.size(2),
    output.size(3),
    output.stride(0),
    output.stride(1),
    output.stride(2),
    output.stride(3),
    input1_c,
    input1_c.size(1),
    input1_c.size(2),
    input1_c.size(3),
    input1_c.stride(0),
    input1_c.stride(1),
    input1_c.stride(2),
    input1_c.stride(3),
    input2_c,
    input2_c.size(1),
    input2_c.stride(0),
    input2_c.stride(1),
    input2_c.stride(2),
    input2_c.stride(3),
    rInput1,
    rInput2,
    pad_size,
    kernel_size,
    max_displacement,
    stride1,
    stride2,
    corr_type_multiply,
//			at::globalContext().getCurrentCUDAStream() //works for 0.4.1
           at::cuda::getCurrentCUDAStream() //works for 1.0.0
  );

  //check for errors
  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;

}

// Backward pass entry point (registered with pybind as `backward`).
//
// Resizes and zero-fills rInput1, rInput2, gradInput1 and gradInput2, then
// launches the CUDA backward kernels on the current CUDA stream.
//   input1, input2 : the forward inputs, 4-D tensors of identical shape,
//                    indexed as (batch, channels, height, width)
//   gradOutput     : gradient with respect to the forward output (4-D)
//   gradInput1/2   : results, resized to the shape of input1
// Returns 1 on success.  Raises through AT_ERROR when the arguments are
// invalid or when the kernel launcher reports a CUDA error.
//
// The kernel launcher accepts tensor strides but addresses memory with dense
// offsets, so input1, input2 and gradOutput are made contiguous here.  The
// launcher also takes the batch count from gradOutput, so it must agree with
// the inputs.
int correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput, 
                       at::Tensor& gradInput1, at::Tensor& gradInput2,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  if (input1.dim() != 4 || input2.dim() != 4 || gradOutput.dim() != 4) {
    AT_ERROR("correlation backward: input1, input2 and gradOutput must be 4-D tensors");
  }
  for (int d = 0; d < 4; ++d) {
    if (input1.size(d) != input2.size(d)) {
      AT_ERROR("correlation backward: input1 and input2 must have the same shape");
    }
  }
  if (gradOutput.size(0) != input1.size(0)) {
    AT_ERROR("correlation backward: gradOutput and input1 must have the same batch size");
  }
  // The kernels divide by both strides.
  if (stride1 <= 0 || stride2 <= 0) {
    AT_ERROR("correlation backward: stride1 and stride2 must be positive");
  }

  // contiguous() returns the same tensor when it is already dense.
  at::Tensor input1_c = input1.contiguous();
  at::Tensor input2_c = input2.contiguous();
  at::Tensor gradOutput_c = gradOutput.contiguous();

  int batchSize = input1_c.size(0);
  int nInputChannels = input1_c.size(1);
  int paddedInputHeight = input1_c.size(2)+ 2 * pad_size;
  int paddedInputWidth = input1_c.size(3)+ 2 * pad_size;

  int height = input1_c.size(2);
  int width = input1_c.size(3);

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  gradInput1.resize_({batchSize, nInputChannels, height, width});
  gradInput2.resize_({batchSize, nInputChannels, height, width});

  // The kernels skip some positions and rely on the gradients and the
  // padding border already being zero.
  rInput1.fill_(0);
  rInput2.fill_(0);
  gradInput1.fill_(0);
  gradInput2.fill_(0);

  int success = correlation_backward_cuda_kernel(gradOutput_c,
                                                gradOutput_c.size(0),
                                                gradOutput_c.size(1),
                                                gradOutput_c.size(2),
                                                gradOutput_c.size(3),
                                                gradOutput_c.stride(0),
                                                gradOutput_c.stride(1),
                                                gradOutput_c.stride(2),
                                                gradOutput_c.stride(3),
                                                input1_c,
                                                input1_c.size(1),
                                                input1_c.size(2),
                                                input1_c.size(3),
                                                input1_c.stride(0),
                                                input1_c.stride(1),
                                                input1_c.stride(2),
                                                input1_c.stride(3),
                                                input2_c,
                                                input2_c.stride(0),
                                                input2_c.stride(1),
                                                input2_c.stride(2),
                                                input2_c.stride(3),
                                                gradInput1,
                                                gradInput1.stride(0),
                                                gradInput1.stride(1),
                                                gradInput1.stride(2),
                                                gradInput1.stride(3),
                                                gradInput2,
                                                gradInput2.size(1),
                                                gradInput2.stride(0),
                                                gradInput2.stride(1),
                                                gradInput2.stride(2),
                                                gradInput2.stride(3),
                                                rInput1,
                                                rInput2,
                                                pad_size,
                                                kernel_size,
                                                max_displacement,
                                                stride1,
                                                stride2,
                                                corr_type_multiply,
//			at::globalContext().getCurrentCUDAStream() //works for 0.4.1
           at::cuda::getCurrentCUDAStream() //works for 1.0.0
                                               );

  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;
}

// Python bindings: expose the two host functions above as `forward` and
// `backward` of the extension module (its name comes from
// TORCH_EXTENSION_NAME, which is supplied at build time).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &correlation_forward_cuda, "Correlation forward (CUDA)");
  m.def("backward", &correlation_backward_cuda, "Correlation backward (CUDA)");
}


================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cu
================================================
#include <stdio.h>

#include "correlation_cuda_kernel.cuh"

#define CUDA_NUM_THREADS 1024
#define THREADS_PER_BLOCK 32
#define FULL_MASK 0xffffffff

#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

using at::Half;

// Adds up `val` across the lanes of a warp with shuffle-down exchanges at
// distances 16, 8, 4, 2 and 1.  All 32 lanes must take part (FULL_MASK).
// Lane 0 ends up with the complete sum; the other lanes hold partial sums.
template<typename scalar_t>
__forceinline__ __device__ scalar_t warpReduceSum(scalar_t val) {
        int offset = 16;
        while (offset > 0) {
                val += __shfl_down_sync(FULL_MASK, val, offset);
                offset /= 2;
        }
        return val;
}

// Block-wide sum built on warpReduceSum.
//
// Each warp first reduces its own values, lane 0 of every warp stores that
// partial sum in shared memory, and after a barrier the first warp reduces
// the stored partial sums.  The complete total is available in thread 0
// (the only place the caller in this file reads it).
template<typename scalar_t>
__forceinline__ __device__ scalar_t blockReduceSum(scalar_t val) {

        // One slot per warp of the block (room for 32 warps).
        static __shared__ scalar_t shared[32];
        int lane = threadIdx.x % warpSize;  // index of this thread within its warp
        int wid = threadIdx.x / warpSize;   // index of this thread's warp within the block

        // Stage 1: reduce inside each warp.
        val = warpReduceSum(val);

        if (lane == 0)
                shared[wid] = val;

        // All partial sums must be written before any thread reads them.
        __syncthreads();

        // Threads with an index below the number of warps pick up one partial
        // sum each; every other thread contributes zero.
        val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

        // Stage 2: the first warp adds up the per-warp partial sums.
        if (wid == 0)
                val = warpReduceSum(val);

        return val;
}


// Copies `input`, addressed as dense [n][c][y][x], into the padded buffer
// `rinput`, addressed as [n][y + pad_size][x + pad_size][c].  Despite the
// kernel's name, the channel index is the innermost one in the destination.
//
// Launch layout: one block per (n, y, x) position taken from blockIdx.x/y/z;
// the threads of a block walk the channels in steps of THREADS_PER_BLOCK.
// Border elements of `rinput` are not written here.
template <typename scalar_t>
__global__ void channels_first(const scalar_t* __restrict__ input, scalar_t* rinput, int channels, int height, int width, int pad_size)
{
    const int batch = blockIdx.x;
    const int row = blockIdx.y;
    const int col = blockIdx.z;

    // Source extents: elements per channel plane and per batch item.
    const int plane = height * width;
    const int src_item = channels * plane;

    // Destination extents: elements per padded row and per batch item.
    const int padded_w = width + 2 * pad_size;
    const int padded_h = height + 2 * pad_size;
    const int dst_row = padded_w * channels;
    const int dst_item = channels * padded_h * padded_w;

    // Offsets that do not depend on the channel.
    const int src_base = batch * src_item + row * width + col;
    const int dst_base = batch * dst_item + (row + pad_size) * dst_row + (col + pad_size) * channels;

    for (int ch = threadIdx.x; ch < channels; ch += THREADS_PER_BLOCK) {
      rinput[dst_base + ch] = input[src_base + ch * plane];
    }
}


// Forward correlation kernel.
//
// Launch layout: blockIdx.x selects the batch item, blockIdx.y / blockIdx.z
// select the output row / column; the threads of a block split the input
// channels between them.  rInput1 and rInput2 are the padded buffers,
// addressed as [n][y][x][c].
//
// For every displacement (tj, ti) the block accumulates, in float, the
// products of rInput1 around (y1, x1) and rInput2 around the shifted position
// (y2, x2) over the kernel window and all channels, reduces the per-thread
// sums, and thread 0 writes the sum divided by nelems to
// output[n][tc][blockIdx.y][blockIdx.z].
//
// NOTE(review): the buffer reads are not bounds-checked; this appears to rely
// on the padding being large enough for max_displacement plus the kernel
// radius -- confirm against the callers.
template<typename scalar_t>
__global__ void correlation_forward(scalar_t* __restrict__ output, const int nOutputChannels,
                const int outputHeight, const int outputWidth, const scalar_t* __restrict__ rInput1,
                const int nInputChannels, const int inputHeight, const int inputWidth,
                const scalar_t* __restrict__ rInput2, const int pad_size, const int kernel_size,
                const int max_displacement, const int stride1, const int stride2) {

        // Extents of the padded buffers.
        int32_t pInputWidth = inputWidth + 2 * pad_size;
        int32_t pInputHeight = inputHeight + 2 * pad_size;

        int32_t kernel_rad = (kernel_size - 1) / 2;

        // Number of displacement steps searched on each side of the centre.
        int32_t displacement_rad = max_displacement / stride2;

        int32_t displacement_size = 2 * displacement_rad + 1;

        // Position of this block in the padded first input.
        int32_t n = blockIdx.x;
        int32_t y1 = blockIdx.y * stride1 + max_displacement;
        int32_t x1 = blockIdx.z * stride1 + max_displacement;
        int32_t c = threadIdx.x;

        // Element counts per batch item / row / pixel of the padded buffers.
        int32_t pdimyxc = pInputHeight * pInputWidth * nInputChannels;

        int32_t pdimxc = pInputWidth * nInputChannels;

        int32_t pdimc = nInputChannels;

        // Element counts per batch item / channel / row of the output.
        int32_t tdimcyx = nOutputChannels * outputHeight * outputWidth;
        int32_t tdimyx = outputHeight * outputWidth;
        int32_t tdimx = outputWidth;

        // Normalisation factor: number of products summed per output value.
        int32_t nelems = kernel_size * kernel_size * pdimc;

        // element-wise product along channel axis
        for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
                for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
                        // Shifted position in the padded second input.
                        int x2 = x1 + ti * stride2;
                        int y2 = y1 + tj * stride2;

                        // Per-thread partial sum, kept in float for every scalar_t.
                        float acc0 = 0.0f;

                        for (int j = -kernel_rad; j <= kernel_rad; ++j) {
                                for (int i = -kernel_rad; i <= kernel_rad; ++i) {
                                        // THREADS_PER_BLOCK
                                        #pragma unroll
                                        for (int ch = c; ch < pdimc; ch += blockDim.x) {

                                                int indx1 = n * pdimyxc + (y1 + j) * pdimxc
                                                                + (x1 + i) * pdimc + ch;
                                                int indx2 = n * pdimyxc + (y2 + j) * pdimxc
                                                                + (x2 + i) * pdimc + ch;
                                                acc0 += static_cast<float>(rInput1[indx1] * rInput2[indx2]);
                                        }
                                }
                        }

                        // Combine the per-thread sums: a single warp needs only the
                        // shuffle reduction, larger blocks go through shared memory.
                        if (blockDim.x == warpSize) {
                            __syncwarp();
                            acc0 = warpReduceSum(acc0);
                        } else {
                            __syncthreads();
                            acc0 = blockReduceSum(acc0);
                        }

                        // Thread 0 holds the total and writes the normalised result.
                        if (threadIdx.x == 0) {

                                // Output channel that corresponds to displacement (tj, ti).
                                int tc = (tj + displacement_rad) * displacement_size
                                                + (ti + displacement_rad);
                                const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx
                                                + blockIdx.z;
                                output[tindx] = static_cast<scalar_t>(acc0 / nelems);
                        }
            }
        }
}


// Backward kernel for the gradient with respect to the first input.
//
// Handles one batch item (`item`) per launch.  blockIdx.x / blockIdx.y select
// the input row / column and blockIdx.z the input channel; the threads of a
// block split the output channels (displacements) between them.  Each thread
// accumulates gradOutput * rInput2 into its slot of a shared array, and
// thread 0 adds the slots and writes the sum divided by nelems to
// gradInput1[n][c][y - pad_size][x - pad_size].
//
// gradInput1 must be zero-filled beforehand because blocks that return early
// write nothing.
//
// NOTE(review): prod_sum is indexed by threadIdx.x and summed over
// THREADS_PER_BLOCK slots, so this assumes a launch with exactly
// THREADS_PER_BLOCK threads; the rInput2 read is not bounds-checked.
template <typename scalar_t>
__global__ void correlation_backward_input1(int item, scalar_t* gradInput1, int nInputChannels, int inputHeight, int inputWidth, 
                                            const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth, 
                                            const scalar_t* __restrict__ rInput2, 
                                            int pad_size,
                                            int kernel_size,
                                            int max_displacement,
                                            int stride1,
                                            int stride2)
  {
    // n (batch size), c (num of channels), y (height), x (width)

    // y and x are coordinates in the padded buffer.
    int n = item; 
    int y = blockIdx.x * stride1 + pad_size;
    int x = blockIdx.y * stride1 + pad_size;
    int c = blockIdx.z;
    int tch_off = threadIdx.x;

    int kernel_rad = (kernel_size - 1) / 2;
    int displacement_rad = max_displacement / stride2;
    int displacement_size = 2 * displacement_rad + 1;

    // Range of output columns / rows that receive a contribution from this
    // input position (presumably those whose kernel window covers it).
    int xmin = (x - kernel_rad - max_displacement) / stride1;
    int ymin = (y - kernel_rad - max_displacement) / stride1;

    int xmax = (x + kernel_rad - max_displacement) / stride1;
    int ymax = (y + kernel_rad - max_displacement) / stride1;

    // Both early exits depend only on blockIdx and the kernel arguments, so
    // the whole block leaves together before the barrier further down.
    if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
        // assumes gradInput1 is pre-allocated and zero filled
      return;
    }

    if (xmin > xmax || ymin > ymax) {
        // assumes gradInput1 is pre-allocated and zero filled
        return;
    }

    // Clamp the range to the valid output area.
    xmin = max(0,xmin);
    xmax = min(outputWidth-1,xmax);

    ymin = max(0,ymin);
    ymax = min(outputHeight-1,ymax);

    int pInputWidth = inputWidth + 2 * pad_size;
    int pInputHeight = inputHeight + 2 * pad_size;

    // Element counts for the padded buffer ([n][y][x][c]) ...
    int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
    int pdimxc = pInputWidth * nInputChannels;
    int pdimc = nInputChannels;

    // ... for gradOutput ([n][tc][y][x]) ...
    int tdimcyx = nOutputChannels * outputHeight * outputWidth;
    int tdimyx = outputHeight * outputWidth;
    int tdimx = outputWidth;

    // ... and for gradInput1 ([n][c][y][x]).
    int odimcyx = nInputChannels * inputHeight* inputWidth;
    int odimyx = inputHeight * inputWidth;
    int odimx = inputWidth;

    // Same normalisation factor as in the forward kernel.
    scalar_t nelems = kernel_size * kernel_size * nInputChannels;

    // One partial sum per thread.
    __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
    prod_sum[tch_off] = 0;

    for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {

      // Displacement encoded by output channel tc.
      int i2 = (tc % displacement_size - displacement_rad) * stride2;
      int j2 = (tc / displacement_size - displacement_rad) * stride2;

      // Value of the second input at the displaced position.
      int indx2 = n * pdimyxc + (y + j2)* pdimxc + (x + i2) * pdimc + c;
      
      scalar_t val2 = rInput2[indx2];

      for (int j = ymin; j <= ymax; ++j) {
        for (int i = xmin; i <= xmax; ++i) {
          int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
          prod_sum[tch_off] += gradOutput[tindx] * val2;
        }
      }
    }
    // Every partial sum must be complete before thread 0 adds them up.
    __syncthreads();

    if(tch_off == 0) {
      scalar_t reduce_sum = 0;
      for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
          reduce_sum += prod_sum[idx];
      }
      // Convert back from padded to unpadded coordinates for the write.
      const int indx1 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
      gradInput1[indx1] = reduce_sum / nelems;
    }

}

// Backward kernel for the gradient with respect to the second input.
//
// Handles one batch item (`item`) per launch.  blockIdx.x / blockIdx.y select
// the input row / column and blockIdx.z the input channel; the threads of a
// block split the output channels (displacements) between them.  Unlike the
// input1 kernel, the range of contributing output positions depends on the
// displacement, so it is recomputed for every output channel.  Each thread
// accumulates gradOutput * rInput1 into its slot of a shared array, and
// thread 0 adds the slots and writes the sum divided by nelems to
// gradInput2[n][c][y - pad_size][x - pad_size].
//
// NOTE(review): prod_sum is indexed by threadIdx.x and summed over
// THREADS_PER_BLOCK slots, so this assumes a launch with exactly
// THREADS_PER_BLOCK threads; the rInput1 read is not bounds-checked.
template <typename scalar_t>
__global__ void correlation_backward_input2(int item, scalar_t*  gradInput2, int nInputChannels, int inputHeight, int inputWidth,
                                            const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
                                            const scalar_t* __restrict__ rInput1,
                                            int pad_size,
                                            int kernel_size,
                                            int max_displacement,
                                            int stride1,
                                            int stride2)
{
    // n (batch size), c (num of channels), y (height), x (width)

    // y and x are coordinates in the padded buffer.
    int n = item;
    int y = blockIdx.x * stride1 + pad_size;
    int x = blockIdx.y * stride1 + pad_size;
    int c = blockIdx.z;

    int tch_off = threadIdx.x;

    int kernel_rad = (kernel_size - 1) / 2;
    int displacement_rad = max_displacement / stride2;
    int displacement_size = 2 * displacement_rad + 1;

    int pInputWidth = inputWidth + 2 * pad_size;
    int pInputHeight = inputHeight + 2 * pad_size;

    // Element counts for the padded buffer ([n][y][x][c]) ...
    int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
    int pdimxc = pInputWidth * nInputChannels;
    int pdimc = nInputChannels;

    // ... for gradOutput ([n][tc][y][x]) ...
    int tdimcyx = nOutputChannels * outputHeight * outputWidth;
    int tdimyx = outputHeight * outputWidth;
    int tdimx = outputWidth;

    // ... and for gradInput2 ([n][c][y][x]).
    int odimcyx = nInputChannels * inputHeight* inputWidth;
    int odimyx = inputHeight * inputWidth;
    int odimx = inputWidth;

    // Same normalisation factor as in the forward kernel.
    scalar_t nelems = kernel_size * kernel_size * nInputChannels;

    // One partial sum per thread.
    __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
    prod_sum[tch_off] = 0;

    for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
      // Displacement encoded by output channel tc.
      int i2 = (tc % displacement_size - displacement_rad) * stride2;
      int j2 = (tc / displacement_size - displacement_rad) * stride2;

      // Range of output columns / rows for this displacement.
      int xmin = (x - kernel_rad - max_displacement - i2) / stride1;
      int ymin = (y - kernel_rad - max_displacement - j2) / stride1;

      int xmax = (x + kernel_rad - max_displacement - i2) / stride1;
      int ymax = (y + kernel_rad - max_displacement - j2) / stride1;

      // Skip displacements without any contributing output position; the
      // thread still reaches the barrier after the loop.
      if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
          // assumes gradInput2 is pre-allocated and zero filled
        continue;
      }

      if (xmin > xmax || ymin > ymax) {
          // assumes gradInput2 is pre-allocated and zero filled
          continue;
      }

      // Clamp the range to the valid output area.
      xmin = max(0,xmin);
      xmax = min(outputWidth-1,xmax);

      ymin = max(0,ymin);
      ymax = min(outputHeight-1,ymax);
      
      // Value of the first input at the position shifted back by the displacement.
      int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;
      scalar_t val1 = rInput1[indx1];

      for (int j = ymin; j <= ymax; ++j) {
        for (int i = xmin; i <= xmax; ++i) {
          int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
          prod_sum[tch_off] += gradOutput[tindx] * val1;
        }
      }
    }

    // Every partial sum must be complete before thread 0 adds them up.
    __syncthreads();

    if(tch_off == 0) {
      scalar_t reduce_sum = 0;
      for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
          reduce_sum += prod_sum[idx];
      }
      // Convert back from padded to unpadded coordinates for the write.
      const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
      gradInput2[indx2] = reduce_sum / nelems;
    }

}

// Host-side launcher for the forward pass.
//
// Enqueues three kernels on `stream`, in this order: the padded copy of
// input1 into rInput1, the padded copy of input2 into rInput2, and the
// correlation itself.  Only the size arguments (ob, oc, oh, ow, ic, ih, iw)
// are used; the stride arguments, gc and corr_type_multiply are accepted but
// not read.
// Returns 1 when no CUDA error is pending after the launches, otherwise
// prints the error and returns 0.
int correlation_forward_cuda_kernel(at::Tensor& output,
                                    int ob,
                                    int oc,
                                    int oh,
                                    int ow,
                                    int osb,
                                    int osc,
                                    int osh,
                                    int osw,

                                    at::Tensor& input1,
                                    int ic,
                                    int ih,
                                    int iw,
                                    int isb,
                                    int isc,
                                    int ish,
                                    int isw,

                                    at::Tensor& input2,
                                    int gc,
                                    int gsb,
                                    int gsc,
                                    int gsh,
                                    int gsw,

                                    at::Tensor& rInput1,
                                    at::Tensor& rInput2,
                                    int pad_size,
                                    int kernel_size,
                                    int max_displacement,
                                    int stride1,
                                    int stride2,
                                    int corr_type_multiply,
                                    cudaStream_t stream) 
{
  // Copy kernels: one block per (batch, input row, input column).
  dim3 copyGrid(ob, ih, iw);
  dim3 copyThreads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_fwd_1", ([&] {
    channels_first<scalar_t><<<copyGrid, copyThreads, 0, stream>>>(
        input1.data<scalar_t>(), rInput1.data<scalar_t>(), ic, ih, iw, pad_size);
  }));

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_fwd_2", ([&] {
    channels_first<scalar_t><<<copyGrid, copyThreads, 0, stream>>>(
        input2.data<scalar_t>(), rInput2.data<scalar_t>(), ic, ih, iw, pad_size);
  }));

  // Correlation kernel: one block per (batch, output row, output column).
  dim3 corrGrid(ob, oh, ow);
  dim3 corrThreads(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "correlation_forward", ([&] {
    correlation_forward<scalar_t><<<corrGrid, corrThreads, 0, stream>>>(
        output.data<scalar_t>(), oc, oh, ow,
        rInput1.data<scalar_t>(), ic, ih, iw,
        rInput2.data<scalar_t>(),
        pad_size, kernel_size, max_displacement, stride1, stride2);
  }));

  // Report any error left pending by the launches above.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    return 1;
  }

  printf("error in correlation_forward_cuda_kernel: %s\n", cudaGetErrorString(status));
  return 0;
}


// Host-side launcher for the backward pass.
//
// Enqueues on `stream`, in this order: the padded copies of input1 and input2
// into rInput1 and rInput2, then one correlation_backward_input1 launch per
// batch item, then one correlation_backward_input2 launch per batch item.
// The batch count is taken from gradOutput (gob).  Only the size arguments
// are used; the stride arguments, ggc and corr_type_multiply are accepted but
// not read.
// Returns 1 when no CUDA error is pending after the launches, otherwise
// prints the error and returns 0.
//
// Each dispatch carries its own label so that an unsupported-dtype error
// names the kernel that actually failed.
int correlation_backward_cuda_kernel(
                                    at::Tensor& gradOutput,
                                    int gob,
                                    int goc,
                                    int goh,
                                    int gow,
                                    int gosb,
                                    int gosc,
                                    int gosh,
                                    int gosw,

                                    at::Tensor& input1,
                                    int ic,
                                    int ih,
                                    int iw,
                                    int isb,
                                    int isc,
                                    int ish,
                                    int isw,

                                    at::Tensor& input2,
                                    int gsb,
                                    int gsc,
                                    int gsh,
                                    int gsw,

                                    at::Tensor& gradInput1,
                                    int gisb,
                                    int gisc,
                                    int gish,
                                    int gisw,

                                    at::Tensor& gradInput2,
                                    int ggc,
                                    int ggsb,
                                    int ggsc,
                                    int ggsh,
                                    int ggsw,

                                    at::Tensor& rInput1,
                                    at::Tensor& rInput2,
                                    int pad_size,
                                    int kernel_size,
                                    int max_displacement,
                                    int stride1,
                                    int stride2,
                                    int corr_type_multiply,
                                    cudaStream_t stream)
{

    int batchSize = gob;

    int nInputChannels = ic;
    int inputWidth = iw;
    int inputHeight = ih;

    int nOutputChannels = goc;
    int outputWidth = gow;
    int outputHeight = goh;

    // Copy kernels: one block per (batch, input row, input column).
    dim3 blocks_grid(batchSize, inputHeight, inputWidth);
    dim3 threads_block(THREADS_PER_BLOCK);


    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_bwd_1", ([&] {

        channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(
            input1.data<scalar_t>(),
            rInput1.data<scalar_t>(),
            nInputChannels,
            inputHeight,
            inputWidth,
            pad_size
        );
    }));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_bwd_2", ([&] {

        channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(
            input2.data<scalar_t>(),
            rInput2.data<scalar_t>(),
            nInputChannels,
            inputHeight,
            inputWidth,
            pad_size
        );
    }));

    // Gradient kernels: one block per (input row, input column, channel);
    // the batch item is passed as a kernel argument, one launch per item.
    dim3 threadsPerBlock(THREADS_PER_BLOCK);
    dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);

    for (int n = 0; n < batchSize; ++n) {

      AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "correlation_backward_input1", ([&] {


          correlation_backward_input1<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>> (
              n, gradInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
              gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
              rInput2.data<scalar_t>(),
              pad_size,
              kernel_size,
              max_displacement,
              stride1,
              stride2);
      }));
    }

    for(int n = 0; n < batchSize; n++) {

      AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), "correlation_backward_input2", ([&] {

        correlation_backward_input2<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>>(
            n, gradInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
            gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
            rInput1.data<scalar_t>(),
            pad_size,
            kernel_size,
            max_displacement,
            stride1,
            stride2);

        }));
    }

  // check for errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in correlation_backward_cuda_kernel: %s\n", cudaGetErrorString(err));
    return 0;
  }

  return 1;
}


================================================
FILE: PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Launches the forward correlation kernels on `stream`.
// Returns 1 on success and 0 when a CUDA error was reported.
//
// The caller in correlation_cuda.cc passes, for each tensor, its sizes
// followed by its strides:
//   output : ob, oc, oh, ow = size(0..3);  osb, osc, osh, osw = stride(0..3)
//   input1 : ic, ih, iw     = size(1..3);  isb, isc, ish, isw = stride(0..3)
//   input2 : gc             = size(1);     gsb, gsc, gsh, gsw = stride(0..3)
// rInput1 / rInput2 are the scratch tensors that receive the padded copies
// of the inputs.
int correlation_forward_cuda_kernel(at::Tensor& output,
    int ob,
    int oc,
    int oh,
    int ow,
    int osb,
    int osc,
    int osh,
    int osw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gc,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


// Launches the backward correlation kernels on `stream`.
// Returns 1 on success and 0 when a CUDA error was reported.
//
// The caller in correlation_cuda.cc passes, for each tensor, its sizes
// followed by its strides:
//   gradOutput : gob, goc, goh, gow = size(0..3);  gosb..gosw = stride(0..3)
//   input1     : ic, ih, iw         = size(1..3);  isb..isw   = stride(0..3)
//   input2     :                                   gsb..gsw   = stride(0..3)
//   gradInput1 :                                   gisb..gisw = stride(0..3)
//   gradInput2 : ggc                = size(1);     ggsb..ggsw = stride(0..3)
// rInput1 / rInput2 are the scratch tensors that receive the padded copies
// of the inputs.
int correlation_backward_cuda_kernel(   
    at::Tensor& gradOutput,
    int gob,
    int goc,
    int goh,
    int gow,
    int gosb,
    int gosc,
    int gosh,
    int gosw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& gradInput1, 
    int gisb,
    int gisc,
    int gish,
    int gisw,

    at::Tensor& gradInput2,
    int ggc,
    int ggsb,
    int ggsc,
    int ggsh,
    int ggsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


================================================
FILE: PWCNet/correlation_package_pytorch1_0/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# The extension is built from the C++ binding file and the CUDA kernel file;
# the compiler flags for each toolchain come from compiler_args.
correlation_extension = CUDAExtension(
    name='correlation_cuda',
    sources=['correlation_cuda.cc', 'correlation_cuda_kernel.cu'],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='correlation_cuda',
    ext_modules=[correlation_extension],
    # BuildExtension handles the mixed C++/CUDA compilation.
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: PWCNet/models/PWCNet.py
================================================
"""
implementation of the PWC-DC network for optical flow estimation by Sun et al., 2018

Jinwei Gu and Zhile Ren

"""

import torch
import torch.nn as nn
from torch.autograd import Variable
import os
os.environ['PYTHON_EGG_CACHE'] = 'tmp/' # a writable directory 
from correlation_package.modules.corr import Correlation 
import numpy as np


__all__ = [
    'pwc_dc_net', 'pwc_dc_net_old'
    ]

def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Return a biased 2-D convolution followed by ``LeakyReLU(0.1)``.

    The two layers are wrapped in an ``nn.Sequential``; all arguments are
    forwarded to ``nn.Conv2d``.
    """
    convolution = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=True,
    )
    activation = nn.LeakyReLU(0.1)
    return nn.Sequential(convolution, activation)

def predict_flow(in_planes):
    """Return a biased 3x3 convolution mapping ``in_planes`` channels to 2.

    Stride 1 with padding 1 keeps the spatial size unchanged.
    """
    flow_head = nn.Conv2d(
        in_channels=in_planes,
        out_channels=2,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=True,
    )
    return flow_head

def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
    """Return a biased 2-D transposed convolution.

    With the default settings (kernel 4, stride 2, padding 1) the layer
    doubles the spatial size of its input.
    """
    upsampler = nn.ConvTranspose2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=True,
    )
    return upsampler


class PWCDCNet(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    The network builds a 6-level feature pyramid for each of the two input
    images, then estimates flow coarse-to-fine: at every level the second
    image's features are warped with the upsampled flow from the coarser
    level, correlated with the first image's features, and passed through a
    densely connected estimator. A dilated-convolution context network
    refines the finest flow.
    """
    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warping

        """
        super(PWCDCNet,self).__init__()

        # Feature pyramid extractor: each level starts with a stride-2 conv.
        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)
        self.conv1aa = conv(16,  16, kernel_size=3, stride=1)
        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)
        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)
        self.conv2aa = conv(32,  32, kernel_size=3, stride=1)
        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)
        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)
        self.conv3aa = conv(64,  64, kernel_size=3, stride=1)
        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)
        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)
        self.conv4aa = conv(96,  96, kernel_size=3, stride=1)
        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)
        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)
        self.conv5aa = conv(128,128, kernel_size=3, stride=1)
        self.conv5b  = conv(128,128, kernel_size=3, stride=1)
        self.conv6aa = conv(128,196, kernel_size=3, stride=2)
        self.conv6a  = conv(196,196, kernel_size=3, stride=1)
        self.conv6b  = conv(196,196, kernel_size=3, stride=1)

        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # Number of correlation channels: one per displacement in a
        # (2*md+1) x (2*md+1) search window.
        nd = (2*md+1)**2
        # Cumulative channel counts added by the dense connections, as plain
        # Python ints (the `np.int` alias no longer exists in NumPy >= 1.24).
        dd = [int(d) for d in np.cumsum([128,128,96,64,32])]

        # Level 6 estimator: input is the correlation volume only.
        od = nd
        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # Levels 5..2: correlation + pyramid features + upsampled flow (2)
        # + upsampled estimator features (2), hence the "+4".
        od = nd+128+4
        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # Context network: dilated convolutions applied to the level-2
        # estimator features to refine flow2.
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv7 = predict_flow(32)

        # Kaiming-normal weights and zero biases for every (transposed) conv.
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                # `kaiming_normal_` is the in-place initialiser; the
                # un-suffixed name is a deprecated alias.
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()


    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        Returns a tensor shaped like ``x``. Positions whose sampling
        footprint is not (almost) fully inside ``x`` are set to zero.
        """
        B, C, H, W = x.size()
        # mesh grid
        xx = torch.arange(0, W).view(1,-1).repeat(H,1)
        yy = torch.arange(0, H).view(-1,1).repeat(1,W)
        xx = xx.view(1,1,H,W).repeat(B,1,1,1)
        yy = yy.view(1,1,H,W).repeat(B,1,1,1)
        # Place the grid on the same device as the input (CPU or any GPU).
        grid = torch.cat((xx,yy),1).float().to(x.device)

        vgrid = grid + flo

        # scale grid to [-1,1]
        # NOTE(review): dividing by (W-1)/(H-1) corresponds to grid_sample's
        # align_corners=True convention; newer PyTorch releases default to
        # align_corners=False -- confirm before relying on exact values.
        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0
        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0

        vgrid = vgrid.permute(0,2,3,1)
        output = nn.functional.grid_sample(x, vgrid)
        # Sampling an all-ones tensor gives the in-bounds coverage per pixel;
        # ones_like keeps the mask on x's device and dtype.
        mask = nn.functional.grid_sample(torch.ones_like(x), vgrid)

        # Binarise the coverage mask.
        mask[mask<0.9999] = 0
        mask[mask>0] = 1

        return output*mask


    def forward(self,x):
        """
        Estimate optical flow from the first to the second image.

        x: [B, 6, H, W] -- channels 0..2 are image 1, channels 3..5 image 2.
           (presumably H and W should be multiples of 64 so that upsampled
           tensors match the pyramid sizes -- TODO confirm)

        Returns (flow2, flow3, flow4, flow5, flow6) in training mode and only
        flow2 (the finest level) otherwise.
        """
        im1 = x[:,:3,:,:]
        im2 = x[:,3:,:,:]

        # Shared-weight feature pyramids; cNk = level-k features of image N.
        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))
        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))
        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))
        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))
        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))
        c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))
        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))
        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))
        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))
        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))
        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))
        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))

        # Level 6 (coarsest): no warping, correlate the raw features.
        corr6 = self.corr(c16, c26)
        corr6 = self.leakyRELU(corr6)

        # Dense connections: every conv output is prepended to its input.
        # The concatenation order fixes the weight layout; do not change it.
        x = torch.cat((self.conv6_0(corr6), corr6),1)
        x = torch.cat((self.conv6_1(x), x),1)
        x = torch.cat((self.conv6_2(x), x),1)
        x = torch.cat((self.conv6_3(x), x),1)
        x = torch.cat((self.conv6_4(x), x),1)
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        # Level 5. The flow is rescaled before warping; the factor doubles at
        # each finer level (0.625, 1.25, 2.5, 5.0).
        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.corr(c15, warp5)
        corr5 = self.leakyRELU(corr5)
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = torch.cat((self.conv5_0(x), x),1)
        x = torch.cat((self.conv5_1(x), x),1)
        x = torch.cat((self.conv5_2(x), x),1)
        x = torch.cat((self.conv5_3(x), x),1)
        x = torch.cat((self.conv5_4(x), x),1)
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        # Level 4.
        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.corr(c14, warp4)
        corr4 = self.leakyRELU(corr4)
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = torch.cat((self.conv4_0(x), x),1)
        x = torch.cat((self.conv4_1(x), x),1)
        x = torch.cat((self.conv4_2(x), x),1)
        x = torch.cat((self.conv4_3(x), x),1)
        x = torch.cat((self.conv4_4(x), x),1)
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        # Level 3.
        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.corr(c13, warp3)
        corr3 = self.leakyRELU(corr3)
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = torch.cat((self.conv3_0(x), x),1)
        x = torch.cat((self.conv3_1(x), x),1)
        x = torch.cat((self.conv3_2(x), x),1)
        x = torch.cat((self.conv3_3(x), x),1)
        x = torch.cat((self.conv3_4(x), x),1)
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        # Level 2 (finest).
        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.corr(c12, warp2)
        corr2 = self.leakyRELU(corr2)
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = torch.cat((self.conv2_0(x), x),1)
        x = torch.cat((self.conv2_1(x), x),1)
        x = torch.cat((self.conv2_2(x), x),1)
        x = torch.cat((self.conv2_3(x), x),1)
        x = torch.cat((self.conv2_4(x), x),1)
        flow2 = self.predict_flow2(x)

        # Context network adds a residual refinement to flow2.
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        if self.training:
            return flow2,flow3,flow4,flow5,flow6
        else:
            return flow2


class PWCDCNet_old(nn.Module):
    """
    PWC-DC net. add dilation convolution and densenet connections

    Older variant: the feature pyramid uses two convolutions per level, and
    the dense connections in ``forward`` use a different (mixed)
    concatenation order than ``PWCDCNet``.
    """
    def __init__(self, md=4):
        """
        input: md --- maximum displacement (for correlation. default: 4), after warping

        """
        super(PWCDCNet_old,self).__init__()

        # Feature pyramid extractor: each level starts with a stride-2 conv.
        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)
        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)
        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)
        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)
        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)
        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)
        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)
        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)
        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)
        self.conv5b  = conv(128,128, kernel_size=3, stride=1)
        self.conv6a  = conv(128,196, kernel_size=3, stride=2)
        self.conv6b  = conv(196,196, kernel_size=3, stride=1)

        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)
        self.leakyRELU = nn.LeakyReLU(0.1)

        # Number of correlation channels: one per displacement in a
        # (2*md+1) x (2*md+1) search window.
        nd = (2*md+1)**2
        # Cumulative channel counts added by the dense connections, converted
        # to plain Python ints so layer sizes are not NumPy scalars.
        dd = [int(d) for d in np.cumsum([128,128,96,64,32])]

        # Level 6 estimator: input is the correlation volume only.
        od = nd
        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow6 = predict_flow(od+dd[4])
        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        # Levels 5..2: correlation + pyramid features + upsampled flow (2)
        # + upsampled estimator features (2), hence the "+4".
        od = nd+128+4
        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow5 = predict_flow(od+dd[4])
        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+96+4
        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow4 = predict_flow(od+dd[4])
        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+64+4
        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow3 = predict_flow(od+dd[4])
        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1)
        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1)

        od = nd+32+4
        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)
        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)
        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)
        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)
        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)
        self.predict_flow2 = predict_flow(od+dd[4])
        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1)

        # Context network: dilated convolutions applied to the level-2
        # estimator features to refine flow2.
        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)
        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)
        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)
        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)
        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)
        self.dc_conv7 = predict_flow(32)

        # Kaiming-normal weights and zero biases for every (transposed) conv.
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                # `kaiming_normal_` is the in-place initialiser; the
                # un-suffixed name is a deprecated alias.
                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()


    def warp(self, x, flo):
        """
        warp an image/tensor (im2) back to im1, according to the optical flow

        x: [B, C, H, W] (im2)
        flo: [B, 2, H, W] flow

        Returns a tensor shaped like ``x``. Positions whose sampling
        footprint is not (almost) fully inside ``x`` are set to zero.
        """
        B, C, H, W = x.size()
        # mesh grid
        xx = torch.arange(0, W).view(1,-1).repeat(H,1)
        yy = torch.arange(0, H).view(-1,1).repeat(1,W)
        xx = xx.view(1,1,H,W).repeat(B,1,1,1)
        yy = yy.view(1,1,H,W).repeat(B,1,1,1)
        # Place the grid on the same device as the input (CPU or any GPU).
        grid = torch.cat((xx,yy),1).float().to(x.device)

        vgrid = grid + flo

        # scale grid to [-1,1]
        # NOTE(review): dividing by (W-1)/(H-1) corresponds to grid_sample's
        # align_corners=True convention; newer PyTorch releases default to
        # align_corners=False -- confirm before relying on exact values.
        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0
        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0

        vgrid = vgrid.permute(0,2,3,1)
        output = nn.functional.grid_sample(x, vgrid)
        # Sampling an all-ones tensor gives the in-bounds coverage per pixel;
        # ones_like keeps the mask on x's device and dtype.
        mask = nn.functional.grid_sample(torch.ones_like(x), vgrid)

        # Binarise the coverage mask (this variant uses a 0.999 threshold).
        mask[mask<0.999] = 0
        mask[mask>0] = 1

        return output*mask


    def forward(self,x):
        """
        Estimate optical flow from the first to the second image.

        x: [B, 6, H, W] -- channels 0..2 are image 1, channels 3..5 image 2.
           (presumably H and W should be multiples of 64 so that upsampled
           tensors match the pyramid sizes -- TODO confirm)

        Returns (flow2, flow3, flow4, flow5, flow6) in training mode and only
        flow2 (the finest level) otherwise.
        """
        im1 = x[:,:3,:,:]
        im2 = x[:,3:,:,:]

        # Shared-weight feature pyramids; cNk = level-k features of image N.
        c11 = self.conv1b(self.conv1a(im1))
        c21 = self.conv1b(self.conv1a(im2))
        c12 = self.conv2b(self.conv2a(c11))
        c22 = self.conv2b(self.conv2a(c21))
        c13 = self.conv3b(self.conv3a(c12))
        c23 = self.conv3b(self.conv3a(c22))
        c14 = self.conv4b(self.conv4a(c13))
        c24 = self.conv4b(self.conv4a(c23))
        c15 = self.conv5b(self.conv5a(c14))
        c25 = self.conv5b(self.conv5a(c24))
        c16 = self.conv6b(self.conv6a(c15))
        c26 = self.conv6b(self.conv6a(c25))

        # Level 6 (coarsest): no warping, correlate the raw features.
        # NOTE: the concatenation order below is irregular (the *_1 output is
        # prepended, all others appended). It fixes the weight layout and
        # must be kept exactly as is.
        corr6 = self.corr(c16, c26)
        corr6 = self.leakyRELU(corr6)
        x = torch.cat((corr6, self.conv6_0(corr6)),1)
        x = torch.cat((self.conv6_1(x), x),1)
        x = torch.cat((x, self.conv6_2(x)),1)
        x = torch.cat((x, self.conv6_3(x)),1)
        x = torch.cat((x, self.conv6_4(x)),1)
        flow6 = self.predict_flow6(x)
        up_flow6 = self.deconv6(flow6)
        up_feat6 = self.upfeat6(x)

        # Level 5. The flow is rescaled before warping; the factor doubles at
        # each finer level (0.625, 1.25, 2.5, 5.0).
        warp5 = self.warp(c25, up_flow6*0.625)
        corr5 = self.corr(c15, warp5)
        corr5 = self.leakyRELU(corr5)
        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)
        x = torch.cat((x, self.conv5_0(x)),1)
        x = torch.cat((self.conv5_1(x), x),1)
        x = torch.cat((x, self.conv5_2(x)),1)
        x = torch.cat((x, self.conv5_3(x)),1)
        x = torch.cat((x, self.conv5_4(x)),1)
        flow5 = self.predict_flow5(x)
        up_flow5 = self.deconv5(flow5)
        up_feat5 = self.upfeat5(x)

        # Level 4.
        warp4 = self.warp(c24, up_flow5*1.25)
        corr4 = self.corr(c14, warp4)
        corr4 = self.leakyRELU(corr4)
        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)
        x = torch.cat((x, self.conv4_0(x)),1)
        x = torch.cat((self.conv4_1(x), x),1)
        x = torch.cat((x, self.conv4_2(x)),1)
        x = torch.cat((x, self.conv4_3(x)),1)
        x = torch.cat((x, self.conv4_4(x)),1)
        flow4 = self.predict_flow4(x)
        up_flow4 = self.deconv4(flow4)
        up_feat4 = self.upfeat4(x)

        # Level 3.
        warp3 = self.warp(c23, up_flow4*2.5)
        corr3 = self.corr(c13, warp3)
        corr3 = self.leakyRELU(corr3)
        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)
        x = torch.cat((x, self.conv3_0(x)),1)
        x = torch.cat((self.conv3_1(x), x),1)
        x = torch.cat((x, self.conv3_2(x)),1)
        x = torch.cat((x, self.conv3_3(x)),1)
        x = torch.cat((x, self.conv3_4(x)),1)
        flow3 = self.predict_flow3(x)
        up_flow3 = self.deconv3(flow3)
        up_feat3 = self.upfeat3(x)

        # Level 2 (finest).
        warp2 = self.warp(c22, up_flow3*5.0)
        corr2 = self.corr(c12, warp2)
        corr2 = self.leakyRELU(corr2)
        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)
        x = torch.cat((x, self.conv2_0(x)),1)
        x = torch.cat((self.conv2_1(x), x),1)
        x = torch.cat((x, self.conv2_2(x)),1)
        x = torch.cat((x, self.conv2_3(x)),1)
        x = torch.cat((x, self.conv2_4(x)),1)
        flow2 = self.predict_flow2(x)

        # Context network adds a residual refinement to flow2.
        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))
        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))

        if self.training:
            return flow2,flow3,flow4,flow5,flow6
        else:
            return flow2


def pwc_dc_net(path=None):
    """
    Create a ``PWCDCNet`` and optionally load weights from a checkpoint.

    path: checkpoint file to load, or None to keep the random initialisation.
          The file may hold either the state dict itself or a dict with the
          state dict stored under the key 'state_dict'.

    Returns the model (not moved to any device by this function).
    """
    model = PWCDCNet()
    if path is not None:
        # map_location='cpu' lets checkpoints saved on a GPU be opened on
        # machines without that device; load_state_dict copies the values
        # into the model's own parameters afterwards.
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        data = torch.load(path, map_location='cpu')
        state_dict = data['state_dict'] if 'state_dict' in data else data
        model.load_state_dict(state_dict)
    return model


def pwc_dc_net_old(path=None):
    """
    Create a ``PWCDCNet_old`` and optionally load weights from a checkpoint.

    path: checkpoint file to load, or None to keep the random initialisation.
          The file may hold either the state dict itself or a dict with the
          state dict stored under the key 'state_dict'.

    Returns the model (not moved to any device by this function).
    """
    model = PWCDCNet_old()
    if path is not None:
        # map_location='cpu' lets checkpoints saved on a GPU be opened on
        # machines without that device; load_state_dict copies the values
        # into the model's own parameters afterwards.
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        data = torch.load(path, map_location='cpu')
        state_dict = data['state_dict'] if 'state_dict' in data else data
        model.load_state_dict(state_dict)
    return model


================================================
FILE: PWCNet/models/__init__.py
================================================
from .PWCNet import *


================================================
FILE: README.md
================================================
# DAIN (Depth-Aware Video Frame Interpolation)
[Project](https://sites.google.com/view/wenbobao/dain) **|** [Paper](http://arxiv.org/abs/1904.00830)

[Wenbo Bao](https://sites.google.com/view/wenbobao/home),
[Wei-Sheng Lai](http://graduatestudents.ucmerced.edu/wlai24/), 
[Chao Ma](https://sites.google.com/site/chaoma99/),
Xiaoyun Zhang, 
Zhiyong Gao, 
and [Ming-Hsuan Yang](http://faculty.ucmerced.edu/mhyang/)

IEEE Conference on Computer Vision and Pattern Recognition, Long Beach, CVPR 2019

This work is developed based on our TPAMI work [MEMC-Net](https://github.com/baowenbo/MEMC-Net), where we propose the adaptive warping layer. Please also consider referring to it.

### Table of Contents
1. [Introduction](#introduction)
1. [Citation](#citation)
1. [Requirements and Dependencies](#requirements-and-dependencies)
1. [Installation](#installation)
1. [Testing Pre-trained Models](#testing-pre-trained-models)
1. [Downloading Results](#downloading-results)
1. [Slow-motion Generation](#slow-motion-generation)
1. [Training New Models](#training-new-models)
1. [Google Colab Demo](#google-colab-demo)

### Introduction
We propose the **D**epth-**A**ware video frame **IN**terpolation (**DAIN**) model to explicitly detect the occlusion by exploring the depth cue.
We develop a depth-aware flow projection layer to synthesize intermediate flows that preferably sample closer objects than farther ones.
Our method achieves state-of-the-art performance on the Middlebury dataset. 
We provide videos [here](https://www.youtube.com/watch?v=-f8f0igQi5I&t=5s).

<!--![teaser](http://vllab.ucmerced.edu/wlai24/LapSRN/images/emma_text.gif)-->

<!--[![teaser](https://img.youtube.com/vi/icJ0WbPsE20/0.jpg)](https://www.youtube.com/watch?v=icJ0WbPsE20&feature=youtu.be)
<!--<iframe width="560" height="315" src="https://www.youtube.com/embed/icJ0WbPsE20" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
![teaser](http://vllab1.ucmerced.edu/~wenbobao/DAIN/kart-turn_compare.gif)


<!--哈哈我是注释，不会在浏览器中显示。
Beanbags
https://drive.google.com/open?id=170vdxANGoNKO5_8MYOuiDvoIXzucv7HW
Dimentrodon
https://drive.google.com/open?id=14n7xvb9hjTKqfcr7ZpEFyfMvx6E8NhD_
DogDance
https://drive.google.com/open?id=1YWAyAJ3T48fMFv2K8j8wIVcmQm39cRof
Grove2
https://drive.google.com/open?id=1sJLwdQdL6JYXSQo_Bev0aQMleWacxCsN
Grove3
https://drive.google.com/open?id=1jGj3UdGppoJO02Of8ZaNXqDH4fnXuQ8O
Hydrangea
https://drive.google.com/open?id=1_4kVlhvrmCv54aXi7vZMk3-FtRQF7s0s
MiniCooper
https://drive.google.com/open?id=1pWHtyBSZsOTC7NTVdHTrv1W-dxa95BLo
RubberWhale
https://drive.google.com/open?id=1korbXsGpSgJn7THBHkLRVrJMtCt5YZPB
Urban2
https://drive.google.com/open?id=1v57RMm9x5vM36mCgPy5hresXDZWtw3Vs
Urban3
https://drive.google.com/open?id=1LMwSU0PrG4_GaDjWRI2v9hvWpYwzRKca
Venus
https://drive.google.com/open?id=1piPnEexuHaiAr4ZzWSAxGi1u1Xo_6vPp
Walking
https://drive.google.com/open?id=1CgCLmVC_WTVTAcA_IdWbLqR8MS18zHoa
-->

<p float="middle">
<img src="https://drive.google.com/uc?export=view&id=1YWAyAJ3T48fMFv2K8j8wIVcmQm39cRof" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1CgCLmVC_WTVTAcA_IdWbLqR8MS18zHoa" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1pWHtyBSZsOTC7NTVdHTrv1W-dxa95BLo" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=170vdxANGoNKO5_8MYOuiDvoIXzucv7HW" width="200"/>
</p>

<p float="middle">
<img src="https://drive.google.com/uc?export=view&id=1sJLwdQdL6JYXSQo_Bev0aQMleWacxCsN" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1jGj3UdGppoJO02Of8ZaNXqDH4fnXuQ8O" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1v57RMm9x5vM36mCgPy5hresXDZWtw3Vs" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1LMwSU0PrG4_GaDjWRI2v9hvWpYwzRKca" width="200"/>
</p>

<p float="middle">
<img src="https://drive.google.com/uc?export=view&id=1piPnEexuHaiAr4ZzWSAxGi1u1Xo_6vPp" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1korbXsGpSgJn7THBHkLRVrJMtCt5YZPB" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=1_4kVlhvrmCv54aXi7vZMk3-FtRQF7s0s" width="200"/>
<img src="https://drive.google.com/uc?export=view&id=14n7xvb9hjTKqfcr7ZpEFyfMvx6E8NhD_" width="200"/>
</p>

### Citation
If you find the code and datasets useful in your research, please cite:

    @inproceedings{DAIN,
        author    = {Bao, Wenbo and Lai, Wei-Sheng and Ma, Chao and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan}, 
        title     = {Depth-Aware Video Frame Interpolation}, 
        booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
        year      = {2019}
    }
    @article{MEMC-Net,
         title={MEMC-Net: Motion Estimation and Motion Compensation Driven Neural Network for Video Interpolation and Enhancement},
         author={Bao, Wenbo and Lai, Wei-Sheng and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan},
         journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
         doi={10.1109/TPAMI.2019.2941941},
         year={2018}
    }

### Requirements and Dependencies
- Ubuntu (We test with Ubuntu = 16.04.5 LTS)
- Python (We test with Python = 3.6.8 in Anaconda3 = 4.1.1)
- Cuda & Cudnn (We test with Cuda = 9.0 and Cudnn = 7.0)
- PyTorch (The customized depth-aware flow projection and other layers require ATen API in PyTorch = 1.0.0)
- GCC (Compiling PyTorch 1.0.0 extension files (.c/.cu) requires gcc = 4.9.1 and nvcc = 9.0 compilers)
- NVIDIA GPU (We use Titan X (Pascal) with compute = 6.1, and we support compute_50/52/60/61 devices. If your device has a higher compute capability, please revise [this](https://github.com/baowenbo/DAIN/blob/master/my_package/DepthFlowProjection/setup.py) accordingly.)

### Installation
Download repository:

    $ git clone https://github.com/baowenbo/DAIN.git

Before building Pytorch extensions, be sure you have `pytorch >= 1.0.0`:
    
    $ python -c "import torch; print(torch.__version__)"
    
Generate our PyTorch extensions:
    
    $ cd DAIN
    $ cd my_package 
    $ ./build.sh

Generate the Correlation package required by [PWCNet](https://github.com/NVlabs/PWC-Net/tree/master/PyTorch/external_packages/correlation-pytorch-master):
    
    $ cd ../PWCNet/correlation_package_pytorch1_0
    $ ./build.sh


### Testing Pre-trained Models
Make model weights dir and Middlebury dataset dir:

    $ cd DAIN
    $ mkdir model_weights
    $ mkdir MiddleBurySet
    
Download pretrained models, 

    $ cd model_weights
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth
    
and Middlebury dataset:
    
    $ cd ../MiddleBurySet
    $ wget http://vision.middlebury.edu/flow/data/comp/zip/other-color-allframes.zip
    $ unzip other-color-allframes.zip
    $ wget http://vision.middlebury.edu/flow/data/comp/zip/other-gt-interp.zip
    $ unzip other-gt-interp.zip
    $ cd ..

Build the extensions (skip this step if you already built them during installation):

    $ cd PWCNet/correlation_package_pytorch1_0
    $ sh build.sh
    $ cd ../../my_package
    $ sh build.sh
    $ cd ..

We are good to go by:

    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py

The interpolated results are under `MiddleBurySet/other-result-author/[random number]/`, where the `random number` is used to distinguish different runs. 

### Downloading Results
Our DAIN model achieves the state-of-the-art performance on the UCF101, Vimeo90K, and Middlebury ([*eval*](http://vision.middlebury.edu/flow/eval/results/results-n1.php) and *other*).
Download our interpolated results with:
    
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/UCF101_DAIN.zip
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Vimeo90K_interp_DAIN.zip
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_eval_DAIN.zip
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_other_DAIN.zip
    
    
### Slow-motion Generation
Our model is fully capable of generating slow-motion effect with minor modification on the network architecture.
Run the following code by specifying `time_step = 0.25` to generate x4 slow-motion effect:

    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.25

or set `time_step` to `0.125` or `0.1` as follows 

    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.125
    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.1
to generate x8 and x10 slow-motion respectively. Or, for a little fun, try x100 slow-motion:
    
    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.01

You may also want to create gif animations by:
    
    $ cd MiddleBurySet/other-result-author/[random number]/Beanbags
    $ convert -delay 1 *.png -loop 0 Beanbags.gif //1*10ms delay 

Have fun and enjoy yourself! 


### Training New Models
Download the Vimeo90K triplet dataset for video frame interpolation task, also see [here](https://github.com/anchen1011/toflow/blob/master/download_dataset.sh) by [Xue et al., IJCV19](https://arxiv.org/abs/1711.09078).
    
    $ cd DAIN
    $ mkdir /path/to/your/dataset && cd /path/to/your/dataset 
    $ wget http://data.csail.mit.edu/tofu/dataset/vimeo_triplet.zip
    $ unzip vimeo_triplet.zip
    $ rm vimeo_triplet.zip

Download the pretrained MegaDepth and PWCNet models
    
    $ cd MegaDepth/checkpoints/test_local
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best_generalization_net_G.pth
    $ cd ../../../PWCNet
    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/pwc_net.pth.tar
    $ cd  ..
    
Run the training script:

    $ CUDA_VISIBLE_DEVICES=0 python train.py --datasetPath /path/to/your/dataset --batch_size 1 --save_which 1 --lr 0.0005 --rectify_lr 0.0005 --flow_lr_coe 0.01 --occ_lr_coe 0.0 --filter_lr_coe 1.0 --ctx_lr_coe 1.0 --alpha 0.0 1.0 --patience 4 --factor 0.2
    
The optimized models will be saved to the `model_weights/[random number]` directory, where [random number] is generated for different runs.

Replace the pre-trained `model_weights/best.pth` model with the newly trained `model_weights/[random number]/best.pth` model.
Then test the new model by executing: 

    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py

### Google Colab Demo
This is a modification of DAIN that allows the usage of Google Colab and is able to do a full demo interpolation from a source video to a target video.

Original Notebook File by btahir can be found [here](https://github.com/baowenbo/DAIN/issues/44).

To use the Colab, follow these steps:

- Download the `Colab_DAIN.ipynb` file ([link](https://raw.githubusercontent.com/baowenbo/DAIN/master/Colab_DAIN.ipynb)).
- Visit Google Colaboratory ([link](https://colab.research.google.com/))
- Select the "Upload" option, and upload the `.ipynb` file
- Start running the cells one by one, following the instructions.

Colab file authors: [Styler00Dollar](https://github.com/styler00dollar) and [Alpha](https://github.com/AlphaGit).

### Contact
[Wenbo Bao](mailto:bwb0813@gmail.com); [Wei-Sheng (Jason) Lai](mailto:phoenix104104@gmail.com)

### License
See [MIT License](https://github.com/baowenbo/DAIN/blob/master/LICENSE)


================================================
FILE: Resblock/BasicBlock.py
================================================
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch.nn.init as weight_init
import torch
__all__ = ['MultipleBasicBlock','MultipleBasicBlock_4']
def conv3x3(in_planes, out_planes, dilation = 1, stride=1):
    """3x3 convolution without bias, padded by ``int(dilation * (3 - 1) / 2)``."""
    kernel = 3
    pad = int(dilation * (kernel - 1) / 2)
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel,
        stride=stride,
        padding=pad,
        dilation=dilation,
        bias=False,
    )
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, without batch normalisation.

    The first convolution takes the requested dilation and stride; the second
    one always uses the ``conv3x3`` defaults. The block input (optionally passed
    through ``downsample``) is added to the result before the final ReLU.

    Args:
        inplanes: number of input channels.
        planes: number of channels produced by both convolutions.
        dilation: dilation of the first convolution.
        stride: stride of the first convolution.
        downsample: optional module applied to the input to form the shortcut.
    """
    expansion = 1

    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, dilation, stride)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.downsample = downsample
        self.stride = stride

        # Re-initialise every convolution (and any batch norm found in the
        # sub-modules) of this block.
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                kernel_h, kernel_w = layer.kernel_size[0], layer.kernel_size[1]
                fan = kernel_h * kernel_w * layer.out_channels
                layer.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + shortcut)``."""
        out = self.conv2(self.relu(self.conv1(x)))

        shortcut = self.downsample(x) if self.downsample is not None else x

        out += shortcut
        return self.relu(out)
class MultipleBasicBlock(nn.Module):
    """Stack of a 7x7 stem, up to three ``block`` stages and a 3-channel 3x3 head.

    Args:
        input_feature: number of channels of the input tensor.
        block: callable ``block(inplanes, planes, dilation=...)`` returning a
            module that maps ``intermediate_feature`` channels to the same
            number of channels.
        num_blocks: depth selector; stage 2, 3 and 4 are created when
            ``num_blocks`` is at least 2, 3 and 4 respectively.
        intermediate_feature: channel width used between the stem and the head.
        dense: stored on the instance; not used by this class itself.
    """

    def __init__(self,input_feature,
                 block, num_blocks,
                 intermediate_feature = 64, dense = True):
        super(MultipleBasicBlock, self).__init__()
        self.dense = dense
        self.num_block = num_blocks
        self.intermediate_feature = intermediate_feature

        self.block1= nn.Sequential(*[
            nn.Conv2d(input_feature, intermediate_feature,
                      kernel_size=7, stride=1, padding=3, bias=True),
            nn.ReLU(inplace=True)
        ])

        # Optional residual stages; a stage that is not requested stays None.
        self.block2 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=2 else None
        self.block3 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=3 else None
        self.block4 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=4 else None
        self.block5 = nn.Sequential(*[nn.Conv2d(intermediate_feature, 3 , (3, 3), 1, (1, 1))])

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x):
        """Run the stem, every stage that was constructed, then the head.

        Stages are selected by whether they exist rather than by re-testing
        ``num_block``: the constructor creates ``block4`` for any
        ``num_blocks >= 4``, so testing ``num_block == 4`` here would silently
        skip an allocated stage when ``num_blocks`` is larger than 4.
        """
        x = self.block1(x)
        for stage in (self.block2, self.block3, self.block4):
            if stage is not None:
                x = stage(x)
        return self.block5(x)

def MultipleBasicBlock_4(input_feature,intermediate_feature = 64):
    """Create a ``MultipleBasicBlock`` with four stages built from ``BasicBlock``.

    Args:
        input_feature: number of channels of the input tensor.
        intermediate_feature: channel width used inside the network.

    Returns:
        The constructed ``MultipleBasicBlock`` instance.
    """
    return MultipleBasicBlock(input_feature, BasicBlock, 4, intermediate_feature)


if __name__ == '__main__':
    # Construction smoke test: build both module types once, then quit.
    # No forward pass is executed here.
    model = MultipleBasicBlock(200, BasicBlock, 4)
    model = BasicBlock(64, 64, 1)
    exit(0)

================================================
FILE: Resblock/__init__.py
================================================
from   .BasicBlock import *

================================================
FILE: S2D_models/S2DF.py
================================================
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo

import torch
# Names exported by ``from S2DF import *``. An earlier, ResNet-style export
# list is kept below for reference only:
# __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           # 'resnet152','resnet18_conv1']
__all__ = ['S2DF','S2DF_3dense','S2DF_3dense_nodilation',
           'S2DF_3last','S2DF_2dense', 'BasicBlock']

# Download locations of ResNet checkpoints, keyed by architecture name.
# NOTE(review): nothing in the visible part of this module reads this table —
# confirm whether it is still needed.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, dilation = 1, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` whose padding follows the dilation.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        dilation: dilation rate of the kernel.
        stride: convolution stride.

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    kernel = 3
    # Half of the dilated kernel extent, truncated to an int.
    pad = int(dilation * (kernel - 1) / 2)
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel,
        stride=stride,
        padding=pad,
        dilation=dilation,
        bias=False,
    )


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, without batch normalisation.

    The first convolution takes the requested dilation and stride; the second
    one always uses the ``conv3x3`` defaults. The block input (optionally passed
    through ``downsample``) is added to the result before the final ReLU.

    Args:
        inplanes: number of input channels.
        planes: number of channels produced by both convolutions.
        dilation: dilation of the first convolution.
        stride: stride of the first convolution.
        downsample: optional module applied to the input to form the shortcut.
    """
    expansion = 1

    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, dilation, stride)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + shortcut)``."""
        out = self.conv2(self.relu(self.conv1(x)))

        shortcut = self.downsample(x) if self.downsample is not None else x

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual bottleneck without batch normalisation.

    The last convolution outputs ``planes * 4`` channels, so the shortcut
    (the input, or ``downsample(input)`` when given) must have that many
    channels for the addition to work.

    Args:
        inplanes: number of input channels.
        planes: channel width of the two inner convolutions.
        dilation: dilation of the 3x3 convolution; its padding follows it.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input to form the shortcut.
    """
    expansion = 4

    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        pad = int(dilation * (3 - 1) / 2)
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.conv2 = nn.Conv2d(
            planes, planes,
            kernel_size=3, stride=stride,
            padding=pad, dilation=dilation, bias=False,
        )
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the three convolutions, add the shortcut and rectify."""
        out = self.relu(self.conv1(x))
        out = self.relu(self.conv2(out))
        out = self.conv3(out)

        shortcut = self.downsample(x) if self.downsample is not None else x

        out += shortcut
        return self.relu(out)


class S2DF(nn.Module):
    """Feature extractor that concatenates the input with per-stage feature maps.

    A 7x7 convolution + ReLU stem is followed by up to three ``block`` stages
    of 64 channels each. ``forward`` concatenates, along the channel axis, the
    raw input and either every stage output (``dense=True``) or only the
    output of the last stage (``dense=False``).

    Args:
        block: callable ``block(inplanes, planes, dilation=...)`` building a stage.
        num_blocks: number of stages including the stem; must be in 1..4.
        dense: whether intermediate stage outputs are part of the result.
        dilation: when true, stages 2/3/4 use dilation 4/8/16; otherwise 1.
    """

    def __init__(self, block, num_blocks,dense = True,dilation=True):
        super(S2DF, self).__init__()
        self.inplanes = 64
        self.dense = dense
        self.num_block = num_blocks
        assert(1 <= num_blocks <= 4)

        self.block1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),
            nn.ReLU(inplace=True),
        )

        self.dilation = dilation
        rates = (4, 8, 16) if dilation else (1, 1, 1)
        self.block2 = block(self.inplanes, 64, dilation = rates[0]) if num_blocks >= 2 else None
        self.block3 = block(self.inplanes, 64, dilation = rates[1]) if num_blocks >= 3 else None
        self.block4 = block(self.inplanes, 64, dilation = rates[2]) if num_blocks >= 4 else None

        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                fan = layer.kernel_size[0] * layer.kernel_size[1] * layer.out_channels
                layer.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()

    def forward(self, x):
        """Return the channel-wise concatenation of the input and stage outputs."""
        collected = [x]  # raw feature

        stages = (self.block1, self.block2, self.block3, self.block4)
        for position, stage in enumerate(stages[:self.num_block], start=1):
            x = stage(x)
            # Intermediate outputs are kept only in dense mode; the output of
            # the last active stage is always kept.
            if self.dense or position == self.num_block:
                collected.append(x)

        return torch.cat(collected, dim=1)


class S2DFsim(nn.Module):
    """Variant of ``S2DF`` whose stages are plain conv-ReLU-conv stacks.

    The stem is a 7x7 convolution, a ReLU and a 3x3 convolution; each further
    stage is a 3x3 convolution, a ReLU and a 3x3 convolution, all 64 channels
    wide and bias-free. The layers keep the initialisation PyTorch gives them.
    ``forward`` concatenates the raw input with either every stage output
    (``dense=True``) or only the last one.

    Args:
        block: accepted for signature parity; not used by this class.
        num_blocks: number of stages including the stem; must be in 1..4.
        dense: whether intermediate stage outputs are part of the result.
        dilation: stored on the instance; not used when building the layers.
    """

    def __init__(self, block, num_blocks,dense = True,dilation=True):
        super(S2DFsim, self).__init__()
        self.inplanes = 64
        self.dense = dense
        self.num_block = num_blocks
        assert(1 <= num_blocks <= 4)

        def conv_relu_conv():
            # One 64-channel stage: 3x3 conv, ReLU, 3x3 conv.
            return nn.Sequential(
                nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
                nn.ReLU(inplace=True),
                nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
            )

        self.block1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
        )

        self.dilation = dilation
        self.block2 = conv_relu_conv() if num_blocks >= 2 else None
        self.block3 = conv_relu_conv() if num_blocks >= 3 else None
        self.block4 = conv_relu_conv() if num_blocks >= 4 else None

    def forward(self, x):
        """Return the channel-wise concatenation of the input and stage outputs."""
        collected = [x]  # raw feature

        stages = (self.block1, self.block2, self.block3, self.block4)
        for position, stage in enumerate(stages[:self.num_block], start=1):
            x = stage(x)
            # Intermediate outputs are kept only in dense mode; the output of
            # the last active stage is always kept.
            if self.dense or position == self.num_block:
                collected.append(x)

        return torch.cat(collected, dim=1)
def S2DF_3dense_nodilation():
    """Return a three-stage, dense ``S2DFsim`` built with ``dilation=False``."""
    return S2DFsim(None, 3, dense=True, dilation=False)
def S2DF_3dense():
    """Return a three-stage ``S2DF`` of ``BasicBlock`` stages with dense output."""
    return S2DF(BasicBlock, 3, dense=True)
def S2DF_3last():
    """Return a three-stage ``S2DF`` of ``BasicBlock`` stages with ``dense=False``."""
    return S2DF(BasicBlock, 3, dense=False)
def S2DF_2dense():
    """Return a two-stage ``S2DF`` of ``BasicBlock`` stages with dense output."""
    return S2DF(BasicBlock, 2, dense=True)


from torch.autograd import Variable

if __name__ == '__main__':
    # Smoke test: push one random batch of two 3x224x448 images through a
    # four-stage, non-dense S2DF network, then quit.
    x = Variable(torch.randn(2, 3, 224, 448))

    model = S2DF(BasicBlock, 4, False)
    y = model(x)
    exit(0)


================================================
FILE: S2D_models/__init__.py
================================================
from .S2DF import *

================================================
FILE: Stack.py
================================================

class Stack:
    """Minimal LIFO stack backed by a Python list.

    Reading from an empty stack returns ``None`` instead of raising, so a
    stored ``None`` cannot be told apart from "empty" by the return value
    alone; use ``is_empty()`` when that matters.
    """

    def __init__(self):
        # The last list element is the top of the stack.
        self.stack = []

    def pop(self):
        """Remove and return the top element, or ``None`` if the stack is empty."""
        if self.is_empty():
            return None
        return self.stack.pop()

    def push(self,val):
        """Place ``val`` on top of the stack; returns ``None`` like ``list.append``."""
        return self.stack.append(val)

    def peak(self):
        """Return the top element without removing it, or ``None`` if empty.

        The historical spelling is kept for existing callers; ``peek`` is the
        conventionally named alias.
        """
        if self.is_empty():
            return None
        return self.stack[-1]

    # Conventional name for the same operation.
    peek = peak

    def size(self):
        """Return the number of stored elements."""
        return len(self.stack)

    def is_empty(self):
        """Return ``True`` when the stack holds no elements."""
        return self.size() == 0

================================================
FILE: balancedsampler.py
================================================
from torch.utils.data.sampler import Sampler
import torch

class RandomBalancedSampler(Sampler):
    """Endless random sampler whose reported length is independent of the dataset size.

    Indices are drawn from a random permutation of the whole dataset, and a
    new permutation is generated each time the internal cursor wraps back to
    zero. The iterator never raises ``StopIteration``; ``len()`` only reports
    the nominal epoch size.

    Arguments:
        data_source (Dataset): dataset to sample from
        epoch_size (int): nominal number of samples per epoch; a value that is
            not positive means "use the dataset size"
    """

    def __init__(self, data_source, epoch_size):
        self.data_size = len(data_source)
        self.epoch_size = epoch_size
        self.index = 0

    def __next__(self):
        # A cursor at zero marks the start of a pass: draw a fresh permutation.
        if self.index == 0:
            self.indices = torch.randperm(self.data_size)
        # Advance first, then read, so position 0 is served last in each pass.
        advanced = (self.index + 1) % self.data_size
        self.index = advanced
        return self.indices[advanced]

    def next(self):
        """Python 2 style spelling of ``__next__``."""
        return self.__next__()

    def __iter__(self):
        return self

    def __len__(self):
        if self.epoch_size > 0:
            return min(self.data_size, self.epoch_size)
        return self.data_size

class SequentialBalancedSampler(Sampler):
    """Endless sequential sampler whose reported length is independent of the dataset size.

    Indices are produced in increasing order and wrap around at the end of
    the dataset. Because the cursor is advanced before it is returned, the
    first index produced is 1 and index 0 comes last in each pass. The
    iterator never raises ``StopIteration``.

    Arguments:
        data_source (Dataset): dataset to sample from
        epoch_size (int): nominal number of samples per epoch; a value that is
            not positive means "use the dataset size"
    """

    def __init__(self, data_source, epoch_size):
        self.data_size = len(data_source)
        self.epoch_size = epoch_size
        self.index = 0

    def __next__(self):
        advanced = (self.index + 1) % self.data_size
        self.index = advanced
        return advanced

    def next(self):
        """Python 2 style spelling of ``__next__``."""
        return self.__next__()

    def __iter__(self):
        return self

    def __len__(self):
        if self.epoch_size > 0:
            return min(self.data_size, self.epoch_size)
        return self.data_size


================================================
FILE: colab_interpolate.py
================================================
import time
import os
from torch.autograd import Variable
import torch
import numpy as np
import numpy
import networks
from my_args import args
from imageio import imread, imsave
from AverageMeter import  *
import shutil
import datetime
torch.backends.cudnn.benchmark = True

# Look the network constructor up by name and build it with training disabled.
model = networks.__dict__[args.netName](channel=args.channels,
                                        filter_size=args.filter_size,
                                        timestep=args.time_step,
                                        training=False)

if args.use_cuda:
    model = model.cuda()

# The script cannot do anything useful without trained weights: stop early.
model_path = './model_weights/best.pth'
if not os.path.exists(model_path):
    print("*****************************************************************",
          "**** We couldn't load any trained weights ***********************",
          "*****************************************************************",
          sep="\n")
    exit(1)

# Without CUDA the stored tensors are mapped onto the CPU while loading.
pretrained_dict = (
    torch.load(model_path)
    if args.use_cuda
    else torch.load(model_path, map_location=lambda storage, loc: storage)
)

model_dict = model.state_dict()
# Keep only the checkpoint entries the network defines, lay them over the
# network's current state and load the merged result.
pretrained_dict = {name: value for name, value in pretrained_dict.items() if name in model_dict}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)
# Drop the reference to the checkpoint to save memory.
pretrained_dict = []

model = model.eval() # deploy mode

frames_dir, output_dir = args.frame_input_dir, args.frame_output_dir

# Time positions (multiples of the time step) strictly between two input frames.
timestep = args.time_step
time_offsets = [step * timestep for step in range(1, int(1.0 / timestep))]

# The loop increments before reading, hence the start at (start_frame - 1).
input_frame = args.start_frame - 1
final_frame = args.end_frame
loop_timer = AverageMeter()

torch.set_grad_enabled(False)

def _to_numpy(tensor):
    """Return the tensor's data as a NumPy array (``.cpu()`` is a no-op on CPU tensors)."""
    return tensor.data.cpu().numpy()


def _crop_to_hwc(array, top, left, height, width):
    """Cut the un-padded window out of sample 0 of an (N, C, H, W) array, as (H, W, C)."""
    return np.transpose(array[0, :, top:top + height, left:left + width], (1, 2, 0))


# we want to have input_frame between (start_frame-1) and (end_frame-2)
# this is because at each step we read (frame) and (frame+1)
# so the last iteration will actually be (end_frame-1) and (end_frame)
while input_frame < final_frame - 1:
    input_frame += 1

    start_time = time.time()

    filename_frame_1 = os.path.join(frames_dir, f'{input_frame:0>5d}.png')
    filename_frame_2 = os.path.join(frames_dir, f'{input_frame+1:0>5d}.png')

    # Read both frames as float tensors in (C, H, W) layout scaled to [0, 1].
    X0 = torch.from_numpy(np.transpose(imread(filename_frame_1), (2,0,1)).astype("float32") / 255.0).type(args.dtype)
    X1 = torch.from_numpy(np.transpose(imread(filename_frame_2), (2,0,1)).astype("float32") / 255.0).type(args.dtype)

    assert (X0.size(1) == X1.size(1))
    assert (X0.size(2) == X1.size(2))

    intWidth = X0.size(2)
    intHeight = X0.size(1)
    channels = X0.size(0)
    if not channels == 3:
        print(f"Skipping {filename_frame_1}-{filename_frame_2} -- expected 3 color channels but found {channels}.")
        continue

    # Pad each axis up to the next multiple of 128; an axis that already is a
    # multiple of 128 still receives 32 pixels on both sides.
    if intWidth != ((intWidth >> 7) << 7):
        intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary
        intPaddingLeft = int((intWidth_pad - intWidth) / 2)
        intPaddingRight = intWidth_pad - intWidth - intPaddingLeft
    else:
        intPaddingLeft = 32
        intPaddingRight= 32

    if intHeight != ((intHeight >> 7) << 7):
        intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary
        intPaddingTop = int((intHeight_pad - intHeight) / 2)
        intPaddingBottom = intHeight_pad - intHeight - intPaddingTop
    else:
        intPaddingTop = 32
        intPaddingBottom = 32

    pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

    X0 = Variable(torch.unsqueeze(X0,0))
    X1 = Variable(torch.unsqueeze(X1,0))
    X0 = pader(X0)
    X1 = pader(X1)

    if args.use_cuda:
        X0 = X0.cuda()
        X1 = X1.cuda()

    y_s, offset, filter = model(torch.stack((X0, X1),dim = 0))
    y_ = y_s[args.save_which]

    # A single output tensor is wrapped so the code below can always treat
    # y_ as a sequence of (N, C, H, W) frames.
    if not isinstance(y_, (list, tuple)):
        y_ = [y_]

    # One conversion path for CPU and CUDA runs, so both apply the same
    # "filter may be absent" guard.
    X0 = _to_numpy(X0)
    X1 = _to_numpy(X1)
    y_ = [_to_numpy(item) for item in y_]
    offset = [_to_numpy(offset_i) for offset_i in offset]
    filter = [_to_numpy(filter_i) for filter_i in filter] if filter[0] is not None else None

    # Remove the padding again and switch to (H, W, C); images go back to 0..255.
    X0 = 255.0 * _crop_to_hwc(X0.clip(0,1.0), intPaddingTop, intPaddingLeft, intHeight, intWidth)
    y_ = [255.0 * _crop_to_hwc(item.clip(0,1.0), intPaddingTop, intPaddingLeft, intHeight, intWidth) for item in y_]
    offset = [_crop_to_hwc(offset_i, intPaddingTop, intPaddingLeft, intHeight, intWidth) for offset_i in offset]
    filter = [_crop_to_hwc(filter_i, intPaddingTop, intPaddingLeft, intHeight, intWidth)
              for filter_i in filter] if filter is not None else None
    X1 = 255.0 * _crop_to_hwc(X1.clip(0,1.0), intPaddingTop, intPaddingLeft, intHeight, intWidth)

    # Output names are <frame:5 digits><sub-index:3 digits>.png; sub-index 0 is
    # the untouched source frame, 1..k are the interpolated frames.
    interpolated_frame_number = 0
    shutil.copy(filename_frame_1, os.path.join(output_dir, f"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png"))
    for item, time_offset in zip(y_, time_offsets):
        interpolated_frame_number += 1
        output_frame_file_path = os.path.join(output_dir, f"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png")
        imsave(output_frame_file_path, np.round(item).astype(numpy.uint8))

    end_time = time.time()
    loop_timer.update(end_time - start_time)

    frames_left = final_frame - input_frame
    estimated_seconds_left = frames_left * loop_timer.avg
    estimated_time_left = datetime.timedelta(seconds=estimated_seconds_left)
    print(f"****** Processed frame {input_frame} | Time per frame (avg): {loop_timer.avg:2.2f}s | Time left: {estimated_time_left} ******************" )

# The loop copies only the first frame of every pair, so the very last input
# frame is carried over here with sub-index 000.
last_frame_filename = os.path.join(frames_dir, str(final_frame).zfill(5) + '.png')
shutil.copy(last_frame_filename, os.path.join(output_dir, f"{final_frame:0>5d}000.png"))

print("Finished processing images.")


================================================
FILE: datasets/Vimeo_90K_interp.py
================================================
import os.path
import random
# import glob
import math
from .listdatasets import ListDataset,Vimeo_90K_loader


def make_dataset(root, list_file):
    """Read a sample list file and return its entries in random order.

    Args:
        root: directory that contains ``list_file``.
        list_file: name of a text file with one sample path per line.

    Returns:
        A shuffled list with every line of the file except the last one.

    Raises:
        AssertionError: if no entry is left after dropping the last line.
    """
    # The context manager closes the handle even if reading fails.
    with open(os.path.join(root, list_file)) as list_handle:
        raw_im_list = list_handle.read().splitlines()
    # the last line is invalid in test set.
    raw_im_list = raw_im_list[:-1]
    assert len(raw_im_list) > 0
    random.shuffle(raw_im_list)

    return raw_im_list

def Vimeo_90K_interp(root, split=1.0, single=False, task = 'interp' ):
    """Build the training and test datasets from the triplet list files under ``root``.

    The lists are read from ``tri_trainlist.txt`` and ``tri_testlist.txt``.
    ``split``, ``single`` and ``task`` are accepted but not used here.

    Returns:
        ``(train_dataset, test_dataset)``, both ``ListDataset`` instances that
        load samples with ``Vimeo_90K_loader``.
    """
    train_list = make_dataset(root, "tri_trainlist.txt")
    test_list = make_dataset(root, "tri_testlist.txt")
    return (
        ListDataset(root, train_list, loader=Vimeo_90K_loader),
        ListDataset(root, test_list, loader=Vimeo_90K_loader),
    )

================================================
FILE: datasets/__init__.py
================================================
from .Vimeo_90K_interp import Vimeo_90K_interp

__all__ = (
           'Vimeo_90K_interp',
)

# Vimeo_90K = "/tmp4/wenbobao_data/vimeo_triplet"


================================================
FILE: datasets/listdatasets.py
================================================
import torch.utils.data as data
import os
import os.path
from scipy.ndimage import imread
import numpy as np
import random

def Vimeo_90K_loader(root, im_path, input_frame_size = (3, 256, 448), output_frame_size = (3, 256, 448), data_aug = True):
    """Load one image triplet from ``<root>/sequences/<im_path>``.

    The files ``im1.png``, ``im2.png`` and ``im3.png`` are read, cropped at a
    random position to ``input_frame_size[1] x input_frame_size[2]`` (the crop
    range is computed against a fixed 256 x 448 frame), optionally augmented,
    and converted to float32 arrays in channel-first layout scaled by 1/255.

    With ``data_aug`` enabled three independent coin flips decide whether the
    outer frames are swapped, whether all frames are mirrored left-right and
    whether all frames are mirrored up-down.

    Args:
        root: dataset root directory.
        im_path: sample directory relative to ``<root>/sequences``.
        input_frame_size: ``(channels, height, width)`` of the crop.
        output_frame_size: not used by this function.
        data_aug: enable the random augmentations described above.

    Returns:
        ``(first, last, middle)``: the two outer frames followed by the middle one.
    """
    root = os.path.join(root,'sequences',im_path)

    # Random temporal reversal: swap which file plays the role of the first frame.
    reverse = bool(data_aug and random.randint(0, 1))
    first_name, last_name = ("im3.png", "im1.png") if reverse else ("im1.png", "im3.png")
    path_pre1 = os.path.join(root, first_name)
    path_mid = os.path.join(root, "im2.png")
    path_pre2 = os.path.join(root, last_name)

    im_pre2 = imread(path_pre2)
    im_pre1 = imread(path_pre1)
    im_mid = imread(path_mid)

    # One crop window shared by all three frames.
    crop_h, crop_w = input_frame_size[1], input_frame_size[2]
    h_offset = random.choice(range(256 - crop_h + 1))
    w_offset = random.choice(range(448 - crop_w + 1))
    rows = slice(h_offset, h_offset + crop_h)
    cols = slice(w_offset, w_offset + crop_w)
    frames = [image[rows, cols, :] for image in (im_pre1, im_mid, im_pre2)]

    if data_aug:
        if random.randint(0, 1):
            frames = [np.fliplr(image) for image in frames]
        if random.randint(0, 1):
            frames = [np.flipud(image) for image in frames]

    # (H, W, C) -> (C, H, W), then scale into [0, 1] as float32.
    X0, y, X2 = (np.transpose(image, (2, 0, 1)).astype("float32") / 255.0 for image in frames)
    return X0, X2, y


class ListDataset(data.Dataset):
    """Dataset over a list of sample paths, loaded on demand by ``loader``.

    Args:
        root: dataset root directory handed to the loader.
        path_list: sequence of sample paths; its length is the dataset length.
        loader: callable ``loader(root, path)`` returning three items.
    """

    def __init__(self, root, path_list,  loader=Vimeo_90K_loader):
        self.root = root
        self.path_list = path_list
        self.loader = loader

    def __getitem__(self, index):
        """Load sample ``index`` and return the loader's three items as a tuple."""
        sample_path = self.path_list[index]
        first, last, middle = self.loader(self.root, sample_path)
        return first, last, middle

    def __len__(self):
        return len(self.path_list)


================================================
FILE: demo_MiddleBury.py
================================================
import time
import os
from torch.autograd import Variable
import math
import torch

import random
import numpy as np
import numpy
import networks
from my_args import  args

from scipy.misc import imread, imsave
from AverageMeter import  *

torch.backends.cudnn.benchmark = True # to speed up the

# Switch and directories for the Middlebury "other" set: input frames,
# generated results and ground-truth in-between frames.
DO_MiddleBurryOther = True
MB_Other_DATA = "./MiddleBurySet/other-data/"
MB_Other_RESULT = "./MiddleBurySet/other-result-author/"
MB_Other_GT = "./MiddleBurySet/other-gt-interp/"
if not os.path.exists(MB_Other_RESULT):
    os.mkdir(MB_Other_RESULT)

# Look the network constructor up by name and build it with training disabled.
model = networks.__dict__[args.netName](channel=args.channels,
                                        filter_size=args.filter_size,
                                        timestep=args.time_step,
                                        training=False)

if args.use_cuda:
    model = model.cuda()

args.SAVED_MODEL = './model_weights/best.pth'
if not os.path.exists(args.SAVED_MODEL):
    # Unlike a hard failure, the demo continues with the untrained network.
    print("*****************************************************************",
          "**** We don't load any trained weights **************************",
          "*****************************************************************",
          sep="\n")
else:
    print("The testing model weight is: " + args.SAVED_MODEL)
    # Without CUDA the stored tensors are mapped onto the CPU while loading.
    pretrained_dict = (
        torch.load(args.SAVED_MODEL)
        if args.use_cuda
        else torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
    )

    model_dict = model.state_dict()
    # Keep only the checkpoint entries the network defines, lay them over the
    # network's current state and load the merged result.
    pretrained_dict = {name: value for name, value in pretrained_dict.items() if name in model_dict}
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    # Drop the reference to the checkpoint to save memory.
    pretrained_dict = []

model = model.eval() # deploy mode

# Shorthands for the run settings, plus a random id naming this run's result folder.
use_cuda = args.use_cuda
save_which = args.save_which
dtype = args.dtype
unique_id = str(random.randint(0, 100000))
print("The unique id for current testing is: " + str(unique_id))

def _to_numpy(tensor):
    """Return the tensor's data as a NumPy array (``.cpu()`` is a no-op on CPU tensors)."""
    return tensor.data.cpu().numpy()


def _crop_to_hwc(array, top, left, height, width):
    """Cut the un-padded window out of sample 0 of an (N, C, H, W) array, as (H, W, C)."""
    return np.transpose(array[0, :, top:top + height, left:left + width], (1, 2, 0))


# Running mean of the per-scene absolute interpolation error.
interp_error = AverageMeter()
if DO_MiddleBurryOther:
    subdir = os.listdir(MB_Other_DATA)
    gen_dir = os.path.join(MB_Other_RESULT, unique_id)
    os.mkdir(gen_dir)

    tot_timer = AverageMeter()
    proc_timer = AverageMeter()
    end = time.time()
    for dir in subdir:
        print(dir)
        os.mkdir(os.path.join(gen_dir, dir))
        arguments_strFirst = os.path.join(MB_Other_DATA, dir, "frame10.png")
        arguments_strSecond = os.path.join(MB_Other_DATA, dir, "frame11.png")
        arguments_strOut = os.path.join(gen_dir, dir, "frame10i11.png")
        gt_path = os.path.join(MB_Other_GT, dir, "frame10i11.png")

        # Read both frames as float tensors in (C, H, W) layout scaled to [0, 1].
        X0 =  torch.from_numpy( np.transpose(imread(arguments_strFirst) , (2,0,1)).astype("float32")/ 255.0).type(dtype)
        X1 =  torch.from_numpy( np.transpose(imread(arguments_strSecond) , (2,0,1)).astype("float32")/ 255.0).type(dtype)

        assert (X0.size(1) == X1.size(1))
        assert (X0.size(2) == X1.size(2))

        intWidth = X0.size(2)
        intHeight = X0.size(1)
        channel = X0.size(0)
        if not channel == 3:
            continue

        # Pad each axis up to the next multiple of 128; an axis that already
        # is a multiple of 128 still receives 32 pixels on both sides.
        if intWidth != ((intWidth >> 7) << 7):
            intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary
            intPaddingLeft =int(( intWidth_pad - intWidth)/2)
            intPaddingRight = intWidth_pad - intWidth - intPaddingLeft
        else:
            intWidth_pad = intWidth
            intPaddingLeft = 32
            intPaddingRight= 32

        if intHeight != ((intHeight >> 7) << 7):
            intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary
            intPaddingTop = int((intHeight_pad - intHeight) / 2)
            intPaddingBottom = intHeight_pad - intHeight - intPaddingTop
        else:
            intHeight_pad = intHeight
            intPaddingTop = 32
            intPaddingBottom = 32

        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight , intPaddingTop, intPaddingBottom])

        torch.set_grad_enabled(False)
        X0 = Variable(torch.unsqueeze(X0,0))
        X1 = Variable(torch.unsqueeze(X1,0))
        X0 = pader(X0)
        X1 = pader(X1)

        if use_cuda:
            X0 = X0.cuda()
            X1 = X1.cuda()
        proc_end = time.time()
        y_s,offset,filter = model(torch.stack((X0, X1),dim = 0))
        y_ = y_s[save_which]

        proc_timer.update(time.time() -proc_end)
        tot_timer.update(time.time() - end)
        end  = time.time()
        print("*****************current image process time \t " + str(time.time()-proc_end )+"s ******************" )

        # One conversion path for CPU and CUDA runs, so both apply the same
        # "filter may be absent" guard.
        X0 = _to_numpy(X0)
        y_ = _to_numpy(y_)
        offset = [_to_numpy(offset_i) for offset_i in offset]
        filter = [_to_numpy(filter_i) for filter_i in filter] if filter[0] is not None else None
        X1 = _to_numpy(X1)

        # Remove the padding again and switch to (H, W, C); images go back to 0..255.
        X0 = 255.0 * _crop_to_hwc(X0.clip(0,1.0), intPaddingTop, intPaddingLeft, intHeight, intWidth)
        y_ = 255.0 * _crop_to_hwc(y_.clip(0,1.0), intPaddingTop, intPaddingLeft, intHeight, intWidth)
        offset = [_crop_to_hwc(offset_i, intPaddingTop, intPaddingLeft, intHeight, intWidth) for offset_i in offset]
        filter = [_crop_to_hwc(filter_i, intPaddingTop, intPaddingLeft, intHeight, intWidth)
                  for filter_i in filter] if filter is not None else None
        X1 = 255.0 * _crop_to_hwc(X1.clip(0,1.0), intPaddingTop, intPaddingLeft, intHeight, intWidth)

        imsave(arguments_strOut, np.round(y_).astype(numpy.uint8))

        # Score the image as written to disk against the ground truth.
        rec_rgb =  imread(arguments_strOut)
        gt_rgb = imread(gt_path)

        # Adding the float offset first moves the subtraction out of the
        # images' integer type.
        diff_rgb = 128.0 + rec_rgb - gt_rgb
        avg_interp_error_abs = np.mean(np.abs(diff_rgb - 128.0))

        interp_error.update(avg_interp_error_abs, 1)

        mse = numpy.mean((diff_rgb - 128.0) ** 2)

        PIXEL_MAX = 255.0
        # A perfect reconstruction has zero MSE; report infinite PSNR instead
        # of dividing by zero.
        psnr = float("inf") if mse == 0 else 20 * math.log10(PIXEL_MAX / math.sqrt(mse))

        print("interpolation error / PSNR : " + str(round(avg_interp_error_abs,4)) + " / " + str(round(psnr,4)))
        metrics = "The average interpolation error / PSNR for all images are : " + str(round(interp_error.avg, 4))
        print(metrics)


================================================
FILE: demo_MiddleBury_slowmotion.py
================================================
import time
import os
from torch.autograd import Variable
import torch
import random
import numpy as np
import numpy
import networks
from my_args import  args
from scipy.misc import imread, imsave
from AverageMeter import  *
import shutil

torch.backends.cudnn.benchmark = True # enable cuDNN benchmark mode (intended as a speed-up)

# Switch for the MiddleBury "other" benchmark and the folders it reads from / writes to.
DO_MiddleBurryOther = True
MB_Other_DATA = "./MiddleBurySet/other-data/"
MB_Other_RESULT = "./MiddleBurySet/other-result-author/"
MB_Other_GT = "./MiddleBurySet/other-gt-interp/"
# NOTE(review): os.mkdir creates only the last path component, so the parent
# "./MiddleBurySet" directory must already exist.
if not os.path.exists(MB_Other_RESULT):
    os.mkdir(MB_Other_RESULT)


# Build the network selected by --netName in inference mode (training=False).
model = networks.__dict__[args.netName](    channel=args.channels,
                                    filter_size = args.filter_size ,
                                    timestep=args.time_step,
                                    training=False)

if args.use_cuda:
    model = model.cuda()

# NOTE(review): this unconditionally overwrites whatever --pretrained supplied.
args.SAVED_MODEL = './model_weights/best.pth'
if os.path.exists(args.SAVED_MODEL):
    print("The testing model weight is: " + args.SAVED_MODEL)
    if not args.use_cuda:
        # Map every storage onto the CPU so the checkpoint loads without CUDA.
        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
        # model.load_state_dict(torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage))
    else:
        pretrained_dict = torch.load(args.SAVED_MODEL)
        # model.load_state_dict(torch.load(args.SAVED_MODEL))

    # Partial load: only checkpoint entries whose key exists in the current
    # model are copied; all other model parameters keep their initial values.
    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)
    # 4. release the pretrained dict for saving memory
    pretrained_dict = []
else:
    # No checkpoint found: the script continues with the model's initial weights.
    print("*****************************************************************")
    print("**** We don't load any trained weights **************************")
    print("*****************************************************************")

model = model.eval() # deploy mode

# Cache frequently used options as module-level names for the loop below.
use_cuda=args.use_cuda
save_which=args.save_which
dtype = args.dtype
# Random run id; it names the per-run output folder under MB_Other_RESULT.
unique_id =str(random.randint(0, 100000))
print("The unique id for current testing is: " + str(unique_id))

# NOTE(review): not updated anywhere in the visible part of this script.
interp_error = AverageMeter()
def _to_numpy(tensor):
    """Return ``tensor`` as a NumPy array, copying it to host memory first if needed."""
    # ``.cpu()` is a no-op for tensors that already live on the CPU, so one
    # code path serves both the CUDA and the CPU configuration.
    return tensor.data.cpu().numpy()


# Slow-motion demo: for every MiddleBury sequence, interpolate the frames between
# frame10.png and frame11.png and write the result as a numbered PNG series
# (0000.png = first input, 0001.. = interpolated frames, last = second input).
if DO_MiddleBurryOther:
    subdir = os.listdir(MB_Other_DATA)
    gen_dir = os.path.join(MB_Other_RESULT, unique_id)
    os.mkdir(gen_dir)

    tot_timer = AverageMeter()
    proc_timer = AverageMeter()
    end = time.time()
    for dir in subdir:
        print(dir)
        os.mkdir(os.path.join(gen_dir, dir))
        arguments_strFirst = os.path.join(MB_Other_DATA, dir, "frame10.png")
        arguments_strSecond = os.path.join(MB_Other_DATA, dir, "frame11.png")
        # Ground-truth path is built for parity with the single-frame demo;
        # it is not read in this loop.
        gt_path = os.path.join(MB_Other_GT, dir, "frame10i11.png")

        # HWC uint8 image -> CHW float tensor scaled to [0, 1].
        X0 = torch.from_numpy(np.transpose(imread(arguments_strFirst), (2, 0, 1)).astype("float32") / 255.0).type(dtype)
        X1 = torch.from_numpy(np.transpose(imread(arguments_strSecond), (2, 0, 1)).astype("float32") / 255.0).type(dtype)

        assert (X0.size(1) == X1.size(1))
        assert (X0.size(2) == X1.size(2))

        intWidth = X0.size(2)
        intHeight = X0.size(1)
        channel = X0.size(0)
        if not channel == 3:
            # Only 3-channel images are processed; anything else is skipped.
            continue

        # Pad each spatial dimension up to the next multiple of 128
        # (``>> 7 << 7``), splitting the extra pixels between both sides.
        # When the size already is a multiple of 128 a fixed 32-pixel border
        # is added on each side instead.
        if intWidth != ((intWidth >> 7) << 7):
            intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary
            intPaddingLeft = int((intWidth_pad - intWidth) / 2)
            intPaddingRight = intWidth_pad - intWidth - intPaddingLeft
        else:
            intWidth_pad = intWidth
            intPaddingLeft = 32
            intPaddingRight = 32

        if intHeight != ((intHeight >> 7) << 7):
            intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary
            intPaddingTop = int((intHeight_pad - intHeight) / 2)
            intPaddingBottom = intHeight_pad - intHeight - intPaddingTop
        else:
            intHeight_pad = intHeight
            intPaddingTop = 32
            intPaddingBottom = 32

        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])

        # Inference only: no autograd bookkeeping.
        torch.set_grad_enabled(False)
        X0 = Variable(torch.unsqueeze(X0, 0))
        X1 = Variable(torch.unsqueeze(X1, 0))
        X0 = pader(X0)
        X1 = pader(X1)

        if use_cuda:
            X0 = X0.cuda()
            X1 = X1.cuda()
        proc_end = time.time()
        y_s, offset, filter = model(torch.stack((X0, X1), dim=0))
        # Pick which of the model's outputs to save (see --save_which).
        y_ = y_s[save_which]

        proc_timer.update(time.time() - proc_end)
        tot_timer.update(time.time() - end)
        end = time.time()
        print("*****************current image process time \t " + str(time.time() - proc_end) + "s ******************")

        # Move everything to NumPy. A single (non-list) output is wrapped in a
        # list so the per-frame code below handles both cases; iterating a bare
        # 4-D array would walk its batch axis and break the 4-index crop.
        X0 = _to_numpy(X0)
        if isinstance(y_, list):
            y_ = [_to_numpy(item) for item in y_]
        else:
            y_ = [_to_numpy(y_)]
        offset = [_to_numpy(offset_i) for offset_i in offset]
        # Same None guard on every device (previously only the CUDA branch had it).
        filter = [_to_numpy(filter_i) for filter_i in filter] if filter[0] is not None else None
        X1 = _to_numpy(X1)

        # Crop the padding away again and convert CHW -> HWC; image data is
        # clipped to [0, 1] and rescaled to [0, 255].
        X0 = np.transpose(255.0 * X0.clip(0, 1.0)[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft:intPaddingLeft + intWidth], (1, 2, 0))
        y_ = [np.transpose(255.0 * item.clip(0, 1.0)[0, :, intPaddingTop:intPaddingTop + intHeight,
                                  intPaddingLeft:intPaddingLeft + intWidth], (1, 2, 0)) for item in y_]
        offset = [np.transpose(offset_i[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft:intPaddingLeft + intWidth], (1, 2, 0)) for offset_i in offset]
        filter = [np.transpose(
            filter_i[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft:intPaddingLeft + intWidth],
            (1, 2, 0)) for filter_i in filter] if filter is not None else None
        X1 = np.transpose(255.0 * X1.clip(0, 1.0)[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft:intPaddingLeft + intWidth], (1, 2, 0))

        # A time step of 1/k yields k-1 intermediate frames.
        timestep = args.time_step
        numFrames = int(1.0 / timestep) - 1
        time_offsets = [kk * timestep for kk in range(1, 1 + numFrames, 1)]

        # Write the sequence: first input, interpolated frames, second input.
        count = 0
        shutil.copy(arguments_strFirst, os.path.join(gen_dir, dir, "{:0>4d}.png".format(count)))
        count = count + 1
        # zip() keeps the original truncation to the shorter of the two lists.
        for item, time_offset in zip(y_, time_offsets):
            arguments_strOut = os.path.join(gen_dir, dir, "{:0>4d}.png".format(count))
            count = count + 1
            imsave(arguments_strOut, np.round(item).astype(numpy.uint8))
        shutil.copy(arguments_strSecond, os.path.join(gen_dir, dir, "{:0>4d}.png".format(count)))
        count = count + 1


================================================
FILE: environment.yaml
================================================
name: pytorch1.0.0
channels:
  - pytorch
  - serge-sans-paille
  - anaconda
  - conda-forge
  - defaults
dependencies:
  - ca-certificates=2019.1.23=0
  - certifi=2018.11.29=py36_0
  - cloudpickle=0.7.0=py_0
  - cytoolz=0.9.0.1=py36h14c3975_1
  - dask-core=1.1.1=py_0
  - decorator=4.3.2=py36_0
  - imageio=2.4.1=py36_0
  - networkx=2.2=py36_1
  - openssl=1.1.1=h7b6447c_0
  - pywavelets=1.0.1=py36hdd07704_0
  - scikit-image=0.14.1=py36he6710b0_0
  - scipy=1.1.0=py36h7c811a0_0
  - toolz=0.9.0=py36_0
  - cycler=0.10.0=py_1
  - expat=2.2.5=hf484d3e_1002
  - fontconfig=2.13.1=h2176d3f_1000
  - gettext=0.19.8.1=h9745a5d_1001
  - glib=2.56.2=had28632_1001
  - icu=58.2=hf484d3e_1000
  - kiwisolver=1.0.1=py36h6bb024c_1002
  - libiconv=1.15=h14c3975_1004
  - libprotobuf=3.6.1=hdbcaa40_1000
  - libuuid=2.32.1=h14c3975_1000
  - libxcb=1.13=h14c3975_1002
  - libxml2=2.9.8=h143f9aa_1005
  - matplotlib=3.0.2=py36_1002
  - matplotlib-base=3.0.2=py36h167e16e_1002
  - protobuf=3.6.1=py36hf484d3e_1001
  - pthread-stubs=0.4=h14c3975_1001
  - pyparsing=2.3.1=py_0
  - pyqt=5.6.0=py36h13b7fb3_1008
  - python-dateutil=2.8.0=py_0
  - sip=4.18.1=py36hf484d3e_1000
  - tensorboardx=1.6=py_0
  - tk=8.6.9=h84994c4_1000
  - tornado=5.1.1=py36h14c3975_1000
  - xorg-libxau=1.0.9=h14c3975_0
  - xorg-libxdmcp=1.1.2=h14c3975_1007
  - blas=1.0=mkl
  - cffi=1.11.5=py36he75722e_1
  - cudatoolkit=9.0=h13b8566_0
  - dbus=1.13.2=h714fa37_1
  - freetype=2.9.1=h8a8886c_1
  - gst-plugins-base=1.14.0=hbbd80ab_1
  - gstreamer=1.14.0=hb453b48_1
  - intel-openmp=2019.1=144
  - isl=0.12.2=0
  - jpeg=9b=h024ee3a_2
  - libedit=3.1.20181209=hc058e9b_0
  - libffi=3.2.1=hd88cf55_4
  - libgcc-ng=8.2.0=hdf63c60_1
  - libgfortran-ng=7.3.0=hdf63c60_0
  - libpng=1.6.36=hbc83047_0
  - libstdcxx-ng=8.2.0=hdf63c60_1
  - libtiff=4.0.10=h2733197_2
  - mkl=2019.1=144
  - mkl_fft=1.0.10=py36ha843d7b_0
  - mkl_random=1.0.2=py36hd81dba3_0
  - mpc=1.0.3=hf803216_4
  - mpfr=3.1.5=h12ff648_1
  - ncurses=6.1=he6710b0_1
  - ninja=1.8.2=py36h6bb024c_1
  - numpy=1.15.4=py36h7e9f1db_0
  - numpy-base=1.15.4=py36hde5b4d6_0
  - olefile=0.46=py36_0
  - pcre=8.42=h439df22_0
  - pillow=5.4.1=py36h34e0f95_0
  - pip=19.0.1=py36_0
  - pycparser=2.19=py36_0
  - python=3.6.8=h0371630_0
  - qt=5.6.3=h8bf5577_3
  - readline=7.0=h7b6447c_5
  - setuptools=40.8.0=py36_0
  - six=1.12.0=py36_0
  - sqlite=3.26.0=h7b6447c_0
  - wheel=0.32.3=py36_0
  - xz=5.2.4=h14c3975_4
  - zlib=1.2.11=h7b6447c_3
  - zstd=1.3.7=h0b5b093_0
  - pytorch=1.0.1=py3.6_cuda9.0.176_cudnn7.4.2_2
  - torchvision=0.2.1=py_2
  - cloog=0.18.1=1
  - gcc_49=4.9.1=6
  - gmp=5.1.3=0
  - pip:
    - correlation-cuda==0.0.0
    - dask==1.1.1
    - depthflowprojection-cuda==0.0.0
    - filterinterpolation-cuda==0.0.0
    - flowprojection-cuda==0.0.0
    - interpolation-cuda==0.0.0
    - interpolationch-cuda==0.0.0
    - mindepthflowprojection-cuda==0.0.0
    - separableconv-cuda==0.0.0
    - separableconvflow-cuda==0.0.0
    - torch==1.0.1.post2
prefix: /home/wenbobao/anaconda3_new/envs/pytorch1.0.0


================================================
FILE: loss_function.py
================================================
import sys
import os

import sys
import  threading
import torch
from torch.autograd import Variable
from lr_scheduler import *
from torch.autograd import gradcheck

import numpy


def charbonier_loss(x,epsilon):
    """Return the mean of ``sqrt(x^2 + epsilon^2)`` taken over every element of ``x``.

    ``epsilon`` keeps the square root away from zero, which makes this a
    smooth stand-in for the mean absolute value of ``x``.
    """
    smoothed_abs = torch.sqrt(x * x + epsilon * epsilon)
    return torch.mean(smoothed_abs)
def negPSNR_loss(x,epsilon):
    """Return a log-scaled loss built from ``sqrt(x^2 + epsilon^2)``.

    The smoothed absolute value is averaged over dimension 1 three times in a
    row (so a 4-D input is reduced to one value per entry of dimension 0),
    then ``-log(1 / value) / 100`` is averaged over the remaining entries.
    """
    reduced = torch.sqrt(x * x + epsilon * epsilon)
    # Collapse dimension 1 three times; each reduction shifts the next axis into place.
    for _ in range(3):
        reduced = torch.mean(reduced, dim=1)
    return torch.mean(-torch.log(1.0 / reduced) / 100.0)

def tv_loss(x,epsilon):
    """Return the mean smoothed gradient magnitude of ``x`` over its last two axes.

    For every position except the last row and column, the forward differences
    along axis 2 and axis 3 are combined as ``sqrt(d2^2 + d3^2 + epsilon^2)``;
    the result is averaged over all elements.
    """
    anchor = x[:, :, :-1, :-1]
    diff_rows = anchor - x[:, :, 1:, :-1]
    diff_cols = anchor - x[:, :, :-1, 1:]
    magnitude = torch.sqrt(diff_rows ** 2 + diff_cols ** 2 + epsilon * epsilon)
    return torch.mean(magnitude)

    
def gra_adap_tv_loss(flow, image, epsilon):
    """Return a total-variation loss on ``flow`` weighted by the gradients of ``image``.

    The weight at each position is ``exp(-sum_c(|d2 image| + |d3 image|))``,
    so positions where ``image`` changes strongly contribute less. The
    variation term is ``sum_c sqrt(d2 flow^2 + d3 flow^2 + epsilon^2)``.
    Both sums run over dimension 1 and the product is averaged.
    """
    image_anchor = image[:, :, :-1, :-1]
    image_change = (torch.abs(image_anchor - image[:, :, 1:, :-1]) +
                    torch.abs(image_anchor - image[:, :, :-1, 1:]))
    weight = torch.exp(-torch.sum(image_change, dim=1))

    flow_anchor = flow[:, :, :-1, :-1]
    flow_rows = flow_anchor - flow[:, :, 1:, :-1]
    flow_cols = flow_anchor - flow[:, :, :-1, 1:]
    variation = torch.sum(
        torch.sqrt(flow_rows ** 2 + flow_cols ** 2 + epsilon * epsilon), dim=1)

    return torch.mean(weight * variation)
        
def smooth_loss(x,epsilon):
    """Return the mean of ``sqrt(d2^2 + d3^2 + epsilon^2)`` over ``x``.

    ``d2`` and ``d3`` are forward differences along axes 2 and 3, evaluated at
    every position except the last row and column.
    """
    base = x[:,:,:-1,:-1]
    step_down = base - x[:,:,1:,:-1]
    step_right = base - x[:,:,:-1,1:]
    return torch.mean(torch.sqrt(step_down ** 2 + step_right ** 2 + epsilon ** 2))
    
    
def motion_sym_loss(offset, epsilon, occlusion = None):
    """Return the mean of ``sqrt((offset[0] + offset[1])^2 + epsilon^2)``.

    The loss is smallest when the two entries of ``offset`` cancel each other.

    Args:
        offset: indexable pair of tensors with broadcast-compatible shapes.
        epsilon: smoothing constant that keeps the square root away from zero.
        occlusion: accepted for interface compatibility and currently ignored.
            The previous ``occlusion == None`` test compared element-wise for
            array-like arguments (raising on multi-element arrays) and both of
            its branches computed the same value, so the test was removed.

    Returns:
        A scalar tensor.
    """
    # TODO: how to design the occlusion aware offset symmetric loss?
    return torch.mean(torch.sqrt((offset[0] + offset[1]) ** 2 + epsilon ** 2))


def part_loss(diffs, offsets, occlusions, images, epsilon, use_negPSNR=False):
    """Compute the per-output pixel, flow-smoothness and flow-symmetry losses.

    Args:
        diffs: iterable of residual tensors, one per network output.
        offsets: sequence of offset pairs; ``offsets[0][0] is None`` means
            that no offsets are available.
        occlusions: unused; kept so existing callers do not have to change.
        images: pair of images; ``images[0]`` weights the smoothness of
            ``offset[0]`` and ``images[1]`` that of ``offset[1]``.
        epsilon: smoothing constant forwarded to every loss term.
        use_negPSNR: use ``negPSNR_loss`` instead of ``charbonier_loss`` for
            the pixel term.

    Returns:
        ``(pixel_loss, offset_loss, sym_loss)``, three lists of tensors. When
        no offsets are available, the last two are single-element lists holding
        a zero tensor.
    """
    pixel_fn = negPSNR_loss if use_negPSNR else charbonier_loss
    pixel_loss = [pixel_fn(diff, epsilon) for diff in diffs]

    if offsets[0][0] is not None:
        offset_loss = [gra_adap_tv_loss(offset[0], images[0], epsilon) +
                       gra_adap_tv_loss(offset[1], images[1], epsilon)
                       for offset in offsets]
        sym_loss = [motion_sym_loss(offset, epsilon=epsilon) for offset in offsets]
    else:
        # Build the placeholder on the device of the residuals instead of
        # hard-coding ``.cuda()``, which fails on CPU-only machines.
        device = diffs[0].device if len(diffs) > 0 else None
        offset_loss = [torch.zeros(1, device=device)]
        # Without offsets the symmetry term used to raise a TypeError
        # (``None + None``); report zero, matching the smoothness term.
        sym_loss = [torch.zeros(1, device=device)]

    return pixel_loss, offset_loss, sym_loss


================================================
FILE: lr_scheduler.py
================================================
from bisect import bisect_right
from torch.optim.optimizer import Optimizer


class _LRScheduler(object):
    """Base class for epoch-indexed learning-rate schedules.

    Subclasses implement ``get_lr()``, which returns one learning rate per
    parameter group for the epoch stored in ``self.last_epoch``.
    """

    def __init__(self, optimizer, last_epoch=-1):
        """Bind to ``optimizer`` and apply the schedule for ``last_epoch + 1``.

        Raises:
            TypeError: ``optimizer`` is not an ``Optimizer``.
            KeyError: resuming (``last_epoch != -1``) while a parameter group
                has no ``'initial_lr'`` entry.
        """
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        param_groups = optimizer.param_groups
        if last_epoch == -1:
            # Fresh start: remember each group's current lr as its base lr.
            for param_group in param_groups:
                param_group.setdefault('initial_lr', param_group['lr'])
        else:
            # Resuming: the base lr must have been stored by an earlier run.
            missing = next((index for index, param_group in enumerate(param_groups)
                            if 'initial_lr' not in param_group), None)
            if missing is not None:
                raise KeyError("param 'initial_lr' is not specified "
                               "in param_groups[{}] when resuming an optimizer".format(missing))

        self.base_lrs = [param_group['initial_lr'] for param_group in param_groups]
        # Write the lr for the upcoming epoch, then restore the counter so the
        # first explicit step() lands on that same epoch.
        self.step(last_epoch + 1)
        self.last_epoch = last_epoch

    def get_lr(self):
        """Return the learning rates for ``self.last_epoch`` (subclass hook)."""
        raise NotImplementedError

    def step(self, epoch=None):
        """Advance to ``epoch`` (default: next epoch) and update the optimizer's lrs."""
        self.last_epoch = self.last_epoch + 1 if epoch is None else epoch
        new_lrs = self.get_lr()
        for param_group, new_lr in zip(self.optimizer.param_groups, new_lrs):
            param_group['lr'] = new_lr


class LambdaLR(_LRScheduler):
    """Scale each parameter group's initial lr by a user-supplied function of the epoch.

    When last_epoch=-1, the initial lr is taken from the optimizer's current lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        lr_lambda (function or list): Either one function mapping the integer
            epoch to a multiplicative factor, or a list/tuple with one such
            function per group in optimizer.param_groups.
        last_epoch (int): The index of last epoch. Default: -1.

    Example:
        >>> # Assuming optimizer has two groups.
        >>> lambda1 = lambda epoch: epoch // 30
        >>> lambda2 = lambda epoch: 0.95 ** epoch
        >>> scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """
    def __init__(self, optimizer, lr_lambda, last_epoch=-1):
        self.optimizer = optimizer
        group_count = len(optimizer.param_groups)
        if isinstance(lr_lambda, (list, tuple)):
            # One factor function per parameter group; the counts must match.
            if len(lr_lambda) != group_count:
                raise ValueError("Expected {} lr_lambdas, but got {}".format(
                    group_count, len(lr_lambda)))
            self.lr_lambdas = list(lr_lambda)
        else:
            # A single function is shared by every parameter group.
            self.lr_lambdas = [lr_lambda] * group_count
        self.last_epoch = last_epoch
        super(LambdaLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return ``base_lr * lr_lambda(last_epoch)`` for every parameter group."""
        epoch = self.last_epoch
        return [base_lr * factor_fn(epoch)
                for factor_fn, base_lr in zip(self.lr_lambdas, self.base_lrs)]


class StepLR(_LRScheduler):
    """Multiply every group's initial lr by gamma once per step_size epochs.

    The lr at epoch ``e`` is ``initial_lr * gamma ** (e // step_size)``.
    When last_epoch=-1, the initial lr is taken from the optimizer's current lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        step_size (int): Period of learning rate decay.
        gamma (float): Multiplicative factor of learning rate decay.
            Default: 0.1.
        last_epoch (int): The index of last epoch. Default: -1.

    Example:
        >>> # Assuming optimizer uses lr = 0.5 for all groups
        >>> # lr = 0.5      if epoch < 30
        >>> # lr = 0.05     if 30 <= epoch < 60
        >>> # lr = 0.005    if 60 <= epoch < 90
        >>> # ...
        >>> scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):
        # Both attributes must exist before the base class calls get_lr().
        self.step_size = step_size
        self.gamma = gamma
        super(StepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the decayed lr of every parameter group for ``last_epoch``."""
        decay = self.gamma ** (self.last_epoch // self.step_size)
        return [base_lr * decay for base_lr in self.base_lrs]


class MultiStepLR(_LRScheduler):
    """Set the learning rate of each parameter group to the initial lr decayed
    by gamma once the number of epoch reaches one of the milestones. When
    last_epoch=-1, sets initial lr as lr.

    The lr at epoch ``e`` is ``initial_lr * gamma ** k`` where ``k`` is the
    number of milestones that are ``<= e``.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        milestones (list): List of epoch indices. Must be increasing.
        gamma (float): Multiplicative factor of learning rate decay.
            Default: 0.1.
        last_epoch (int): The index of last epoch. Default: -1.

    Raises:
        ValueError: ``milestones`` is not sorted in increasing order.

    Example:
        >>> # Assuming optimizer uses lr = 0.5 for all groups
        >>> # lr = 0.5      if epoch < 30
        >>> # lr = 0.05     if 30 <= epoch < 80
        >>> # lr = 0.005    if epoch >= 80
        >>> scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)
        >>> for epoch in range(100):
        >>>     scheduler.step()
        >>>     train(...)
        >>>     validate(...)
    """

    def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):
        if not list(milestones) == sorted(milestones):
            # The placeholder was previously never filled in: the template and
            # the milestones were passed to ValueError as two separate args.
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.milestones = milestones
        self.gamma = gamma
        super(MultiStepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the decayed lr of every parameter group for ``last_epoch``."""
        # bisect_right counts the milestones that have already been reached.
        passed = bisect_right(self.milestones, self.last_epoch)
        return [base_lr * self.gamma ** passed
                for base_lr in self.base_lrs]


class ExponentialLR(_LRScheduler):
    """Decay every group's initial lr by a factor of gamma each epoch.

    The lr at epoch ``e`` is ``initial_lr * gamma ** e``. When last_epoch=-1,
    the initial lr is taken from the optimizer's current lr.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        gamma (float): Multiplicative factor of learning rate decay.
        last_epoch (int): The index of last epoch. Default: -1.
    """

    def __init__(self, optimizer, gamma, last_epoch=-1):
        # gamma must exist before the base class calls get_lr().
        self.gamma = gamma
        super(ExponentialLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the decayed lr of every parameter group for ``last_epoch``."""
        decay = self.gamma ** self.last_epoch
        return [base_lr * decay for base_lr in self.base_lrs]


class ReduceLROnPlateau(object):
    """Reduce learning rate when a metric has stopped improving.
    Models often benefit from reducing the learning rate by a factor
    of 2-10 once learning stagnates. This scheduler reads a metrics
    quantity and if no improvement is seen for a 'patience' number
    of epochs, the learning rate is reduced.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        mode (str): One of `min`, `max`. In `min` mode, lr will
            be reduced when the quantity monitored has stopped
            decreasing; in `max` mode it will be reduced when the
            quantity monitored has stopped increasing. Default: 'min'.
        factor (float): Factor by which the learning rate will be
            reduced. new_lr = lr * factor. Default: 0.1.
        patience (int): Number of epochs with no improvement after
            which learning rate will be reduced. Default: 10.
        verbose (bool): If True, prints a message to stdout for
            each update. Default: False.
        threshold (float): Threshold for measuring the new optimum,
            to only focus on significant changes. Default: 1e-4.
        threshold_mode (str): One of `rel`, `abs`. In `rel` mode,
            dynamic_threshold = best * ( 1 + threshold ) in 'max'
            mode or best * ( 1 - threshold ) in `min` mode.
            In `abs` mode, dynamic_threshold = best + threshold in
            `max` mode or best - threshold in `min` mode. Default: 'rel'.
        cooldown (int): Number of epochs to wait before resuming
            normal operation after lr has been reduced. Default: 0.
        min_lr (float or list): A scalar or a list of scalars. A
            lower bound on the learning rate of all param groups
            or each group respectively. Default: 0.
        eps (float): Minimal decay applied to lr. If the difference
            between new and old lr is smaller than eps, the update is
            ignored. Default: 1e-8.

    Raises:
        ValueError: ``factor >= 1.0``, a ``min_lr`` list whose length differs
            from the number of parameter groups, or an unknown ``mode`` /
            ``threshold_mode``.
        TypeError: ``optimizer`` is not an ``Optimizer``.

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> scheduler = ReduceLROnPlateau(optimizer, 'min')
        >>> for epoch in range(10):
        >>>     train(...)
        >>>     val_loss = validate(...)
        >>>     # Note that step should be called after validate()
        >>>     scheduler.step(val_loss)
    """

    def __init__(self, optimizer, mode='min', factor=0.1, patience=10,
                 verbose=False, threshold=1e-4, threshold_mode='rel',
                 cooldown=0, min_lr=0, eps=1e-8):

        if factor >= 1.0:
            raise ValueError('Factor should be < 1.0.')
        self.factor = factor

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        # Normalise min_lr to one lower bound per parameter group.
        if isinstance(min_lr, (list, tuple)):
            if len(min_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} min_lrs, got {}".format(
                    len(optimizer.param_groups), len(min_lr)))
            self.min_lrs = list(min_lr)
        else:
            self.min_lrs = [min_lr] * len(optimizer.param_groups)

        self.patience = patience
        self.verbose = verbose
        self.cooldown = cooldown
        self.cooldown_counter = 0
        self.mode = mode
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.best = None
        self.num_bad_epochs = None
        self.mode_worse = None  # the worse value for the chosen mode
        self.is_better = None
        self.eps = eps
        self.last_epoch = -1
        self._init_is_better(mode=mode, threshold=threshold,
                             threshold_mode=threshold_mode)
        self._reset()

    def _reset(self):
        """Resets num_bad_epochs counter and cooldown counter."""
        self.best = self.mode_worse
        self.cooldown_counter = 0
        self.num_bad_epochs = 0

    def step(self, metrics, epoch=None):
        """Record one epoch's ``metrics`` and reduce the lr after too many bad epochs.

        Args:
            metrics: value of the monitored quantity for this epoch.
            epoch: epoch index; defaults to ``last_epoch + 1``.
        """
        current = metrics
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch

        if self.is_better(current, self.best):
            self.best = current
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1

        if self.in_cooldown:
            self.cooldown_counter -= 1
            self.num_bad_epochs = 0  # ignore any bad epochs in cooldown

        if self.num_bad_epochs > self.patience:
            self._reduce_lr(epoch)
            self.cooldown_counter = self.cooldown
            self.num_bad_epochs = 0

    def _reduce_lr(self, epoch):
        """Multiply every group's lr by ``factor``, clamped to its ``min_lr``."""
        for i, param_group in enumerate(self.optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * self.factor, self.min_lrs[i])
            # Skip updates smaller than eps.
            if old_lr - new_lr > self.eps:
                param_group['lr'] = new_lr
                if self.verbose:
                    print('Epoch {:5d}: reducing learning rate'
                          ' of group {} to {:.4e}.'.format(epoch, i, new_lr))

    @property
    def in_cooldown(self):
        """True while the post-reduction cooldown counter is still positive."""
        return self.cooldown_counter > 0

    def _init_is_better(self, mode, threshold, threshold_mode):
        """Install ``self.is_better`` and ``self.mode_worse`` for the given modes.

        Raises:
            ValueError: ``mode`` or ``threshold_mode`` is not a known value.
        """
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if threshold_mode not in {'rel', 'abs'}:
            # Report the offending threshold_mode (this used to print ``mode``).
            raise ValueError('threshold mode ' + threshold_mode + ' is unknown!')
        if mode == 'min' and threshold_mode == 'rel':
            rel_epsilon = 1. - threshold
            self.is_better = lambda a, best: a < best * rel_epsilon
            self.mode_worse = float('Inf')
        elif mode == 'min' and threshold_mode == 'abs':
            self.is_better = lambda a, best: a < best - threshold
            self.mode_worse = float('Inf')
        elif mode == 'max' and threshold_mode == 'rel':
            rel_epsilon = threshold + 1.
            self.is_better = lambda a, best: a > best * rel_epsilon
            self.mode_worse = -float('Inf')
        else:  # mode == 'max' and threshold_mode == 'abs':
            self.is_better = lambda a, best: a > best + threshold
            self.mode_worse = -float('Inf')

================================================
FILE: my_args.py
================================================
import os
import datetime
import argparse
import numpy
import networks
import  torch
def _str2bool(value):
    """Convert a command-line token to a bool.

    ``type=bool`` treats every non-empty string (including "False") as True;
    this converter understands the usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


modelnames = networks.__all__
# import datasets
# The trailing comma matters: without it this is a plain string, so ``choices``
# would do substring matching and the help text would join single characters.
datasetNames = ('Vimeo_90K_interp',)  # datasets.__all__

parser = argparse.ArgumentParser(description='DAIN')

parser.add_argument('--debug', action='store_true', help='Enable debug mode')
parser.add_argument('--netName', type=str, default='DAIN',
                    choices=modelnames, help='model architecture: ' +
                        ' | '.join(modelnames) +
                        ' (default: DAIN)')

parser.add_argument('--datasetName', default='Vimeo_90K_interp',
                    choices=datasetNames, nargs='+',
                    help='dataset type : ' +
                        ' | '.join(datasetNames) +
                        ' (default: Vimeo_90K_interp)')
parser.add_argument('--datasetPath', default='', help='the path of selected datasets')
parser.add_argument('--dataset_split', type=int, default=97, help='Split a dataset into trainining and validation by percentage (default: 97)')

parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')

# Help text now states the real default (it used to claim 150).
parser.add_argument('--numEpoch', '-e', type=int, default=100, help='Number of epochs to train (default: 100)')

parser.add_argument('--batch_size', '-b', type=int, default=1, help='batch size (default:1)')
# Help text now states the real default (it used to claim 16).
parser.add_argument('--workers', '-w', type=int, default=8, help='parallel workers for loading training samples (default: 8)')
parser.add_argument('--channels', '-c', type=int, default=3, choices=[1, 3], help='channels of images (default:3)')
parser.add_argument('--filter_size', '-f', type=int, default=4, help='the size of filters used (default: 4)',
                    choices=[2, 4, 6, 5, 51]
                    )


parser.add_argument('--lr', type=float, default=0.002, help='the basic learning rate for three subnetworks (default: 0.002)')
parser.add_argument('--rectify_lr', type=float, default=0.001, help='the learning rate for rectify/refine subnetworks (default: 0.001)')

parser.add_argument('--save_which', '-s', type=int, default=1, choices=[0, 1], help='choose which result to save: 0 ==> interpolated, 1==> rectified')
parser.add_argument('--time_step', type=float, default=0.5, help='choose the time steps')
parser.add_argument('--flow_lr_coe', type=float, default=0.01, help='relative learning rate w.r.t basic learning rate (default: 0.01)')
parser.add_argument('--occ_lr_coe', type=float, default=1.0, help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--filter_lr_coe', type=float, default=1.0, help='relative learning rate w.r.t basic learning rate (default: 1.0)')
parser.add_argument('--ctx_lr_coe', type=float, default=1.0, help='relative learning rate w.r.t basic learning rate (default: 1.0)')
# Help text now states the real default (it used to claim 0.01).
parser.add_argument('--depth_lr_coe', type=float, default=0.001, help='relative learning rate w.r.t basic learning rate (default: 0.001)')
# parser.add_argument('--deblur_lr_coe', type = float, default=0.01, help = 'relative learning rate w.r.t basic learning rate (default: 0.01)')

parser.add_argument('--alpha', type=float, nargs='+', default=[0.0, 1.0], help='the ration of loss for interpolated and rectified result (default: [0.0, 1.0])')

parser.add_argument('--epsilon', type=float, default=1e-6, help='the epsilon for charbonier loss,etc (default: 1e-6)')
parser.add_argument('--weight_decay', type=float, default=0, help='the weight decay for whole network ')
parser.add_argument('--patience', type=int, default=5, help='the patience of reduce on plateou')
parser.add_argument('--factor', type=float, default=0.2, help='the factor of reduce on plateou')
#
parser.add_argument('--pretrained', dest='SAVED_MODEL', default=None, help='path to the pretrained model weights')
parser.add_argument('--no-date', action='store_true', help='don\'t append date timestamp to folder')
# ``--use_cuda False`` now really disables CUDA (see _str2bool).
parser.add_argument('--use_cuda', default=True, type=_str2bool, help='use cuda or not')
parser.add_argument('--use_cudnn', default=1, type=int, help='use cudnn or not')
# NOTE(review): the choices are tensor types, so a value typed on the command
# line (always a string) can never match; only the default is usable, and it
# stays a CUDA type even when --use_cuda is false.
parser.add_argument('--dtype', default=torch.cuda.FloatTensor, choices=[torch.cuda.FloatTensor, torch.FloatTensor], help='tensor data type ')
# parser.add_argument('--resume', default='', type=str, help='path to latest checkpoint (default: none)')


parser.add_argument('--uid', type=str, default=None, help='unique id for the training')
parser.add_argument('--force', action='store_true', help='force to override the given uid')

# Colab version
parser.add_argument('--start_frame', type=int, default=1, help='first frame number to process')
parser.add_argument('--end_frame', type=int, default=100, help='last frame number to process')
parser.add_argument('--frame_input_dir', type=str, default='/content/DAIN/input_frames', help='frame input directory')
parser.add_argument('--frame_output_dir', type=str, default='/content/DAIN/output_frames', help='frame output directory')

args = parser.parse_args()

import shutil

# Decide where this run stores its weights and logs.
if args.uid is None:
    # No uid supplied: invent one, and add a timestamp to the folder name.
    unique_id = str(numpy.random.randint(0, 100000))
    print("revise the unique id to a random numer " + str(unique_id))
    args.uid = unique_id
    timestamp = datetime.datetime.now().strftime("%a-%b-%d-%H-%M")
    save_path = './model_weights/' + args.uid + '-' + timestamp
else:
    save_path = './model_weights/' + str(args.uid)

# A "best.pth" inside save_path means an earlier run already used this uid.
if not os.path.exists(save_path + "/best" + ".pth"):
    os.makedirs(save_path, exist_ok=True)
elif not args.force:
    # The original ``raise("...")`` raised a plain string, which Python 3
    # rejects with an unrelated TypeError; raise a real exception so the
    # intended message reaches the user.
    raise FileExistsError(
        "please use another uid (weights already exist in " + save_path
        + "; pass --force to override)")
else:
    print("override this uid" + args.uid)
    # Keep the previous log/args under the first free ".bkN" suffix (N = 1..9)
    # before they are overwritten further down.
    for m in range(1, 10):
        if not os.path.exists(save_path + "/log.txt.bk" + str(m)):
            shutil.copy(save_path + "/log.txt", save_path + "/log.txt.bk" + str(m))
            shutil.copy(save_path + "/args.txt", save_path + "/args.txt.bk" + str(m))
            break


# Output locations default to files inside save_path, which is only known
# after the first parse; register them now and parse again.
parser.add_argument(
    '--save_path',
    default=save_path,
    help='the output dir of weights',
)
parser.add_argument(
    '--log',
    default=save_path + '/log.txt',
    help='the log file in training',
)
parser.add_argument(
    '--arg',
    default=save_path + '/args.txt',
    help='the args used',
)

# The second parse builds a fresh namespace from the command line, so it
# replaces the object produced by the first parse.
args = parser.parse_args()

# Create the log file, or truncate it if it already exists.
with open(args.log, 'w') as f:
    pass

# Show the final arguments on stdout and store a copy next to the weights.
with open(args.arg, 'w') as f:
    print(args)
    print(args, file=f)

# Any non-zero --use_cudnn turns the cudnn benchmark mode on.
print("cudnn is used" if args.use_cudnn else "cudnn is not used")
torch.backends.cudnn.benchmark = bool(args.use_cudnn)

================================================
FILE: my_package/DepthFlowProjection/DepthFlowProjectionLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
#import _ext.my_lib as my_lib
import depthflowprojection_cuda as my_lib

class DepthFlowProjectionLayer(Function):
    """Autograd wrapper around the compiled depth-flow projection extension.

    ``forward`` hands ``input1`` (a 2-channel tensor, as enforced by the
    extension) and ``input2`` (a 1-channel tensor) to ``my_lib`` together with
    two zero-initialised float buffers (``count`` and ``output``) that the
    extension fills in place.  ``backward`` asks the extension for the
    gradients with respect to both inputs.
    """

    def __init__(self,requires_grad):
        # Kept for backward compatibility only: the layer is used through the
        # static ``apply`` interface and stores no per-instance state.
        super(DepthFlowProjectionLayer,self).__init__()

    @staticmethod
    def forward(ctx, input1, input2, requires_grad):
        """Run the projection.

        Args:
            ctx: autograd context; receives the tensors needed by ``backward``.
            input1: contiguous 4-D tensor (the extension requires 2 channels).
            input2: contiguous 4-D tensor (the extension requires 1 channel).
            requires_grad: when exactly ``False`` the extension also runs its
                hole-filling pass (``fillhole = 1``); otherwise it does not.

        Returns:
            A float32 tensor with the same size as ``input1``.
        """
        # The kernels index memory with raw strides, so reject strided views.
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())
        fillhole = 1 if requires_grad == False else 0

        # Allocate on the device that actually holds the input instead of the
        # process-wide "current" CUDA device, so the buffers always live next
        # to the data the kernel reads.
        count = torch.zeros(
            (input1.size(0), 1, input1.size(2), input1.size(3)),
            dtype=torch.float32, device=input1.device)
        output = torch.zeros(input1.size(), dtype=torch.float32, device=input1.device)

        if input1.is_cuda:
            err = my_lib.DepthFlowProjectionLayer_gpu_forward(input1,input2, count,output, fillhole)
        else:
            # NOTE(review): the pybind module in depthflowprojection_cuda.cc
            # registers only the *_gpu_* functions, so this CPU branch
            # presumably fails with AttributeError - confirm.
            err = my_lib.DepthFlowProjectionLayer_cpu_forward(input1,input2, count, output,fillhole)
        if err != 0:
            # A non-zero code is only reported, not raised (historical behaviour).
            print(err)

        # count and output are both consumed by the backward kernel.
        ctx.save_for_backward(input1, input2,count,output)
        ctx.fillhole = fillhole

        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Return gradients for ``(input1, input2, requires_grad)``.

        The third entry is ``None`` because ``requires_grad`` is a plain flag.
        """
        input1, input2, count, output = ctx.saved_tensors

        # The backward kernel addresses gradoutput with input1's strides, and
        # autograd does not promise a contiguous incoming gradient; make it so
        # (a no-op when it already is).
        gradoutput = gradoutput.contiguous()

        gradinput1 = torch.zeros(input1.size(), dtype=torch.float32, device=input1.device)
        gradinput2 = torch.zeros(input2.size(), dtype=torch.float32, device=input2.device)

        if input1.is_cuda:
            err = my_lib.DepthFlowProjectionLayer_gpu_backward(input1,input2,
                                                               count, output,
                                                               gradoutput, gradinput1,gradinput2)
        else:
            # NOTE(review): see forward() - no CPU binding is visible in the
            # extension source.
            err = my_lib.DepthFlowProjectionLayer_cpu_backward(input1, input2,
                                                               count, output,
                                                               gradoutput, gradinput1,gradinput2)
        if err != 0:
            print(err)

        return gradinput1,gradinput2,None


================================================
FILE: my_package/DepthFlowProjection/DepthFlowProjectionModule.py
================================================
# modules/FlowProjectionModule.py
from torch.nn.modules.module import Module
from .DepthFlowProjectionLayer import DepthFlowProjectionLayer #, FlowFillholeLayer

__all__ =['DepthFlowProjectionModule']

class DepthFlowProjectionModule(Module):
    """``nn.Module`` front-end for ``DepthFlowProjectionLayer``.

    The module has no parameters; it only remembers the ``requires_grad``
    flag and forwards it to the autograd function on every call.
    """

    def __init__(self, requires_grad = True):
        super().__init__()
        # Passed through unchanged as the third argument of the layer.
        self.requires_grad = requires_grad

    def forward(self, input1, input2):
        """Apply ``DepthFlowProjectionLayer`` to the two inputs."""
        flag = self.requires_grad
        return DepthFlowProjectionLayer.apply(input1, input2, flag)

# class FlowFillholeModule(Module):
#     def __init__(self,hole_value = -10000.0):
#         super(FlowFillholeModule, self).__init__()
#         self.f = FlowFillholeLayer()
#
#     def forward(self, input1):
#         return self.f(input1)

    # We do not need to write backward code for a Module: the backward pass is defined on the autograd Function that the Module calls.


================================================
FILE: my_package/DepthFlowProjection/__init__.py
================================================
from  .DepthFlowProjectionModule import *


================================================
FILE: my_package/DepthFlowProjection/depthflowprojection_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "depthflowprojection_cuda_kernel.cuh"


// Host entry point for the forward pass (exposed to Python through pybind).
//
// Validates the tensor layout, gathers the strides and hands everything to
// DepthFlowProjection_gpu_forward_kernel, which fills `count` and `output`
// in place.
//
// Returns 1 without launching anything when a shape/stride check fails and
// 0 on success.  A non-zero status from the kernel launcher is raised via
// AT_ERROR instead of being returned.
int DepthFlowProjectionLayer_gpu_forward(
		at::Tensor&  input1,
        at::Tensor&  input2,
        at::Tensor&  count,
		at::Tensor&  output,
		int fillhole
		)
{

	int error = 1 ;

	// input1 must carry exactly two channels.
	int channel = input1.size( 1);
	if(channel!= 2) return error;
	int batch = input1.size(0);

	int h = input1.size(2);
	int w = input1.size(3);

	// input2 must carry exactly one channel.
    if(input2.size(1) !=1 ) return error;

	// The kernel scatters into `count` at positions derived from input1's
	// batch/height/width, so `count` has to be batch x 1 x h x w.  The
	// backward entry point already enforces this; do the same here so a
	// mis-sized buffer is rejected instead of being written out of bounds.
	if(count.size(0) != batch) return error;
	if(count.size(1) != 1) return error;
	if(count.size(2) != h) return error;
	if(count.size(3) != w) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int count_b_stride = count.stride(0);
	int count_c_stride = count.stride(1);
	int count_h_stride = count.stride(2);
	int count_w_stride = count.stride(3);
	//TODO: do we need to assert the w_stride to be 1
	//if(w_stride !=1) return error;
	// `output` is addressed with input1's strides inside the kernel.
	if(input1_b_stride != output.stride(0)) return error;
	if(input1_c_stride != output.stride(1)) return error;

	int	nElement = 0;//UNUSED  THCudaTensor_nElement(state, output);
	error = DepthFlowProjection_gpu_forward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
           at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,w,h,channel,batch,fillhole,

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1,
			input2,
			count,
			output);
	  if (error) {AT_ERROR("CUDA call failed");}

	return error;

}

// Host entry point for the backward pass (exposed to Python through pybind).
//
// Checks the tensor layout, collects the strides and forwards everything to
// DepthFlowProjection_gpu_backward_kernel, which writes `gradinput1` and
// `gradinput2` in place.
//
// Returns 1 when a shape/stride check fails, 0 on success; a non-zero status
// from the kernel launcher is raised through AT_ERROR.
int DepthFlowProjectionLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
        at::Tensor&  count,
		at::Tensor&  output,
        at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	const int kBadLayout = 1;

	// input1: exactly two channels.
	const int channel = input1.size(1);
	if (channel != 2) {
		return kBadLayout;
	}

	// count: batch x 1 x h x w, input2: one channel.
	const int batch = input1.size(0);
	if (count.size(0) != batch || count.size(1) != 1) {
		return kBadLayout;
	}

	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(1) != 1 || count.size(2) != h || count.size(3) != w) {
		return kBadLayout;
	}

	const int input1_b_stride = input1.stride(0);
	const int input1_c_stride = input1.stride(1);
	const int input1_h_stride = input1.stride(2);
	const int input1_w_stride = input1.stride(3);

	const int input2_b_stride = input2.stride(0);
	const int input2_c_stride = input2.stride(1);
	const int input2_h_stride = input2.stride(2);
	const int input2_w_stride = input2.stride(3);

	const int count_b_stride = count.stride(0);
	const int count_c_stride = count.stride(1);
	const int count_h_stride = count.stride(2);
	const int count_w_stride = count.stride(3);

	//TODO: do we need to assert the w_stride to be 1
	// gradinput1 is addressed with input1's strides inside the kernel.
	if (input1_b_stride != gradinput1.stride(0) || input1_c_stride != gradinput1.stride(1)) {
		return kBadLayout;
	}

	// Unused by the kernel; kept because the launcher signature expects it.
	const int nElement = 0;

	const int status = DepthFlowProjection_gpu_backward_kernel(
			at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,
			w, h, channel, batch,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
			count_b_stride, count_c_stride, count_h_stride, count_w_stride,
			input1,
			input2,
			count,
			output,
			gradoutput,
			gradinput1,
			gradinput2);
	if (status) {
		AT_ERROR("CUDA call failed");
	}

	return status;
}


// Python bindings: only the two CUDA entry points are exported.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
      "DepthFlowProjectionLayer_gpu_forward",
      &DepthFlowProjectionLayer_gpu_forward,
      "DepthFlowProjection forward (CUDA)");
  m.def(
      "DepthFlowProjectionLayer_gpu_backward",
      &DepthFlowProjectionLayer_gpu_backward,
      "DepthFlowProjection backward (CUDA)");
}


================================================
FILE: my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cu
================================================
#include <stdio.h>

#include "depthflowprojection_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


#define min(a,b) ((a<b)?(a):(b))
#define max(a,b) ((a>b)?(a):(b))

#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


//Forward scatter pass of the layer.
//
//One thread handles one source pixel (w_i, h_i) of one batch item
//(blockIdx.z). It reads the 2-channel vector (fx, fy) from input1 and the
//scalar weight from input2, computes the target position
//(w_i + fx, h_i + fy) and, if that position lies inside the image,
//atomically adds -weight * (fx, fy) to `output` and +weight to `count` at the
//four integer pixels surrounding the target.
template <typename scalar_t>
__global__ void DepthFlowProjection_gpu_forward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t* count,
		scalar_t* output
		)
{
	//grid x/y tile the image, grid z enumerates the batch
	const int px = blockIdx.x * blockDim.x + threadIdx.x;
	const int py = blockIdx.y * blockDim.y + threadIdx.y;
	if (px >= w || py >= h) {
		return;
	}

	const int b = blockIdx.z;
	const int flow_base = b * input1_b_stride;

	const float fx = input1[flow_base + 0 * input1_c_stride + py * input1_h_stride + px];
	const float fy = input1[flow_base + 1 * input1_c_stride + py * input1_h_stride + px];

	//position this pixel is projected to
	const float x2 = (float) (px) + fx;
	const float y2 = (float) (py) + fy;
	const bool lands_inside =
		x2 >= 0.0f && y2 >= 0.0f && x2 <= (float) (w - 1) && y2 <= (float) (h - 1);
	if (!lands_inside) {
		return;
	}

	//the (up to) four integer pixels around the target, clamped to the image
	const int x_lo = (int) (x2);
	const int y_lo = (int) (y2);
	const int x_hi = min(x_lo + 1, w - 1);
	const int y_hi = min(y_lo + 1, h - 1);

	//visiting order: top-left, top-right, bottom-left, bottom-right
	const int corner_y[4] = {y_lo, y_lo, y_hi, y_hi};
	const int corner_x[4] = {x_lo, x_hi, x_lo, x_hi};

	const float weight = input2[b * input2_b_stride + 0 + py * input2_h_stride + px];
	const float neg_wfx = -weight * fx;
	const float neg_wfy = -weight * fy;

	//channel 0 of output
	for (int k = 0; k < 4; ++k) {
		atomicAdd(&output[flow_base + 0 * input1_c_stride + corner_y[k] * input1_h_stride + corner_x[k]], neg_wfx);
	}
	//channel 1 of output
	for (int k = 0; k < 4; ++k) {
		atomicAdd(&output[flow_base + 1 * input1_c_stride + corner_y[k] * input1_h_stride + corner_x[k]], neg_wfy);
	}
	//accumulated weight per target pixel
	for (int k = 0; k < 4; ++k) {
		atomicAdd(&count[b * count_b_stride + 0 + corner_y[k] * count_h_stride + corner_x[k]], weight);
	}
}

//Normalisation pass: one thread per pixel divides both channels of `output`
//by the value accumulated in `count` at the same pixel, but only where that
//value is strictly positive. Pixels with a non-positive count are left as is.
template <typename scalar_t>
__global__ void DepthFlowProjectionAveraging_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t*  count,
		scalar_t* output
		)
{
	//grid x/y tile the image, grid z enumerates the batch
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	if (col >= w || row >= h) {
		return;
	}

	const int b = blockIdx.z;
	const float total_weight = count[b * count_b_stride + 0 + row * count_h_stride + col];
	if (!(total_weight > 0.0f)) {
		return;
	}

	const int base = b * input1_b_stride;
	output[base + 0 * input1_c_stride + row * input1_h_stride + col] /= total_weight;
	output[base + 1 * input1_c_stride + row * input1_h_stride + col] /= total_weight;
}

//Hole-filling pass: one thread per pixel. A pixel whose `count` is <= 0 is
//treated as a hole. For such a pixel the kernel walks left, right, up and
//down along its row/column until it meets a pixel with a non-zero count (or
//the image border), then overwrites both channels of `output` at the hole
//with the unweighted mean of `output` at the neighbours whose count is
//strictly positive. If no direction yields a positive count the hole is left
//untouched.
template <typename scalar_t>
__global__ void DepthFlowFillhole_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t*  count,
		scalar_t* output
		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//pixel coordinates handled by this thread
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;

	//    __syncthreads();
//	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {
	    float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;
        //only pixels that received no positive weight are filled
        if(temp <= 0.0f){
            //search along the four directions,0/90/180/270, until finding at least one
            //each loop stops at the first non-zero count or at the border;
            //*_offset then holds the last position visited
            int left_offset = w_i;            float left_temp = 0.0f;
            while(left_temp == 0.0f && left_offset - 1 >= 0){
                left_offset = left_offset - 1;
                left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ;
            }

            int right_offset = w_i ;            float right_temp = 0.0f;
            while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){
                right_offset  = right_offset + 1 ;
                right_temp =  count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ;
            }

            int up_offset = h_i ;            float up_temp = 0.0f;
            while(up_temp == 0.0f && up_offset - 1 >=0){
                up_offset = up_offset - 1;
                up_temp =  count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ;
            }

            int down_offset = h_i;            float down_temp = 0.0f;
            while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){
                down_offset = down_offset + 1;
                down_temp =  count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ;
            }

            //nothing usable in any direction: leave the hole as it is
            if(left_temp + right_temp + up_temp + down_temp <=0.0f){
                //printf("Can't fill hole, find no neighbor vectors availabel\n");
                return;
            }

            //turn the counts into 0/1 indicators so every valid neighbour
            //contributes equally to the mean below
            left_temp = (left_temp > 0.0f)?1:0;
            right_temp = (right_temp > 0.0f)?1:0;
            up_temp = (up_temp > 0.0f)?1:0;
            down_temp = (down_temp > 0.0f)?1:0;

            //channel 0: mean over the valid neighbours
            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = (
                left_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +
                right_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+
                up_temp *  output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +
                down_temp *  output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]
            )/(
                left_temp + right_temp + up_temp + down_temp
            ) ;


            //channel 1: same mean, second channel
            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =(
                left_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +
                right_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+
                up_temp *  output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] +
                down_temp *  output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]
            )/(
                left_temp + right_temp + up_temp + down_temp
            ) ;
        }
	}
	return ;

}

//Backward pass: one thread per source pixel (w_i, h_i) of one batch item.
//The thread recomputes the target position exactly as the forward kernel
//does and, when it lies inside the image, gathers gradoutput from the same
//four target pixels:
//  gradinput1 (both channels) += -gradoutput[target] * temp / count[target]
//  gradinput2                 += -gradoutput[target] / count[target]
//                                 * (f - output[target])   for f = fx and fy
//where temp is the input2 value at the source pixel. Each thread only writes
//the gradinput entries of its own source pixel, and plain `+=` is used.
//NOTE(review): count[target] is used as a divisor without a zero check;
//presumably input2 is strictly positive so the count is non-zero - confirm.
template <typename scalar_t>
__global__ void DepthFlowProjection_gpu_backward_kernelfunc(
		const int nElement,  	const int w, 	const int h, const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t*  count,
		scalar_t* output,
		const scalar_t* __restrict__  gradoutput,
		scalar_t*  gradinput1,
		scalar_t*  gradinput2
		)
{
	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	const int off  = batch_i * input1_b_stride;

	//    __syncthreads();

	if(withinXbounds && withinYbounds){
        float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;
        float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;

        //target position, same formula as in the forward kernel
        float x2 = (float) ( w_i ) + fx;
        float y2 = (float) ( h_i ) + fy;
        if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){
            //four integer pixels around the target (L/R = left/right, T/B = top/bottom)
            int ix2_L = (int)(x2);
            int iy2_T = (int)(y2);
            int ix2_R  = min(ix2_L + 1, w-1);
            int iy2_B  = min(iy2_T + 1, h-1);

            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];

            //gradient w.r.t. channel 0 of input1 at the source pixel
            int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;
            gradinput1[iu_offset] += -  gradoutput[off +  0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] * temp /
                                        count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L]  ;
            gradinput1[iu_offset] += -    gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]  * temp /
                                         count[batch_i * count_b_stride +0 + iy2_T * count_h_stride  + ix2_R]  ;
            gradinput1[iu_offset ] += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp /
                                         count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ;
            gradinput1[iu_offset ]  += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp /
                                         count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ;

            //gradient w.r.t. channel 1 of input1 at the source pixel
            int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;
            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]  * temp /
                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L]  ;
            gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] * temp /
                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]  ;
            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp /
                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]     ;
            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp /
                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R]   ;


            //gradient w.r.t. input2 at the source pixel: first the four
            //channel-0 terms (using fx), then the four channel-1 terms (using fy)
            int weight_offset = batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i;
            gradinput2[weight_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /
                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *
                                            (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );
            gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /
                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *
                                            (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );
            gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /
                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *
                                            (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );
            gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /
                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *
                                            (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );

            gradinput2[weight_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /
                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *
                                            (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );
            gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /
                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *
                                            (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );
            gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /
                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *
                                            (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );
            gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /
                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *
                                            (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );
        }
	}
	return ;

}


//Checks the status of the most recent kernel launch. Prints the CUDA error
//string tagged with `stage` and returns true when the launch failed.
static bool DepthFlowProjection_forward_launch_failed(const char* stage)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in %s: %s\n", stage, cudaGetErrorString(err));
		return true;
	}
	return false;
}

//Host-side launcher for the forward pass.
//
//Launches, on `stream` and in this order:
//  1. the scatter kernel (accumulates into `output` and `count`),
//  2. the averaging kernel (divides `output` by `count`),
//  3. the hole-filling kernel, only when `fillhole` is non-zero.
//Every launch uses a BLOCKDIMX x BLOCKDIMY thread block and a grid of
//ceil(w/BLOCKDIMX) x ceil(h/BLOCKDIMY) x batch blocks.
//
//Returns 0 on success and -1 as soon as one launch reports a CUDA error.
//The messages used to name "BilinearSampler.updateOutput", a leftover from
//another layer; they now identify this layer and the failing stage.
int DepthFlowProjection_gpu_forward_kernel(
		cudaStream_t stream, 		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,	at::Tensor&  input2,
		at::Tensor&  count,
		at::Tensor&  output
		)
{
	const int kLaunchFailed = -1;

	//the threadIdx.x is scheduled first, then threadIdx.y, threadIdx.z
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	//stage 1: scatter every source pixel to its target location
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_forward", ([&] {

	DepthFlowProjection_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(
			nElement, //unused by the kernel
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
			);

	}));

	if (DepthFlowProjection_forward_launch_failed("DepthFlowProjection forward (projection kernel)")) {
		return kLaunchFailed;
	}

	//stage 2: normalise the accumulated values by the accumulated weight
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjectionAveraging", ([&] {

    DepthFlowProjectionAveraging_kernelfunc<<<grid,block,0,stream>>>(
    		nElement, //unused by the kernel
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
    );
		}));

	if (DepthFlowProjection_forward_launch_failed("DepthFlowProjection forward (averaging kernel)")) {
		return kLaunchFailed;
	}

	//stage 3 (optional): fill pixels that received no contribution
    if(fillhole){

	AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowFillhole", ([&] {

        DepthFlowFillhole_kernelfunc<<<grid,block,0,stream>>>(
    		nElement, //unused by the kernel
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
        );
		}));

	if (DepthFlowProjection_forward_launch_failed("DepthFlowProjection forward (fill-hole kernel)")) {
		return kLaunchFailed;
	}

    }

	return 0;

}

// Host-side launcher for the DepthFlowProjection backward pass.
//
// Enqueues DepthFlowProjection_gpu_backward_kernelfunc on `stream` using a
// BLOCKDIMX x BLOCKDIMY thread block and a grid of
// ceil(w / BLOCKDIMX) x ceil(h / BLOCKDIMY) x batch blocks. The kernel is
// instantiated for the floating-point scalar type of `input1`; every tensor is
// accessed through data<scalar_t>() with that same type.
//
// Parameters:
//   stream                  CUDA stream the kernel is launched on.
//   nElement                forwarded unchanged to the kernel.
//   w, h, batch             size the launch grid (x, y and z respectively).
//   channel                 forwarded unchanged to the kernel.
//   input1_*/input2_*/count_* strides
//                           forwarded unchanged to the kernel.
//   input1, input2, count, output
//                           tensors handed to the kernel as raw pointers.
//   gradoutput, gradinput1, gradinput2
//                           gradient tensors handed to the kernel as raw
//                           pointers (presumably gradoutput is read and the
//                           gradinput tensors are written -- confirm against
//                           the kernel body).
//
// Returns 0 when cudaGetLastError() reports success right after the launch,
// and -1 otherwise. There is no synchronisation here, so only errors already
// reported at that point are detected.
int DepthFlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,		at::Tensor&  input2,
		at::Tensor&  count,        at::Tensor&  output,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{

	// Pessimistic default: only overwritten once the launch has been checked.
	int error = -1;

	dim3 grid;
	dim3 block;

	// threadIdx.x is scheduled first, then threadIdx.y, then threadIdx.z;
	// all channels of a pixel are processed inside one kernel invocation.
	block  = dim3(BLOCKDIMX,BLOCKDIMY,1);
	grid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
    // Announce a non-default block shape (or any shape when DEBUG is set).
    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
        printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_backward", ([&] {

	DepthFlowProjection_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(
			nElement,
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>(),
			gradoutput.data<scalar_t>(), gradinput1.data<scalar_t>(), gradinput2.data<scalar_t>()
			);
					}));

	cudaError_t err = cudaGetLastError();

	if (err != cudaSuccess) {
		// Name this function in the message; the previous text referred to an
		// unrelated "BilinearSampler.updateGradInput".
		printf("gpu error in DepthFlowProjection_gpu_backward_kernel: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;


}

================================================
FILE: my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Host-side launcher for the DepthFlowProjection forward pass.
//
// The definition enqueues, on `stream`, DepthFlowProjection_gpu_forward_kernelfunc
// followed by DepthFlowProjectionAveraging_kernelfunc and, when `fillhole` is
// non-zero, DepthFlowFillhole_kernelfunc. All three use the same launch grid,
// sized from w, h and batch, and receive the same stride and tensor arguments.
//
// nElement, channel and the *_stride values are forwarded unchanged to the
// kernels. input1, input2, count and output are passed as raw data pointers of
// input1's floating-point scalar type.
// NOTE(review): the role of each tensor (which one is flow / depth) is not
// visible from this header -- confirm against the kernel bodies.
//
// Returns 0 when every launch passed its cudaGetLastError() check; otherwise
// returns the definition's failure code (non-zero, presumably -1 as in the
// backward launcher -- confirm in the .cu file).
int DepthFlowProjection_gpu_forward_kernel(
		cudaStream_t stream, 		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,		at::Tensor&  input2,
		at::Tensor&  count,
		at::Tensor&  output

		);

// Host-side launcher for the DepthFlowProjection backward pass.
//
// The definition enqueues DepthFlowProjection_gpu_backward_kernelfunc on
// `stream` with a launch grid sized from w, h and batch. nElement, channel and
// the *_stride values are forwarded unchanged to the kernel. All seven tensors
// are passed as raw data pointers of input1's floating-point scalar type.
// NOTE(review): presumably gradoutput is the incoming gradient and
// gradinput1 / gradinput2 receive the results -- confirm against the kernel.
//
// Returns 0 when the launch passed its cudaGetLastError() check, -1 otherwise.
int DepthFlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
        at::Tensor&  count,
        at::Tensor&  output,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		);


================================================
FILE: my_package/DepthFlowProjection/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# The extension is built from the C++ binding file plus the CUDA kernel file;
# compiler flags for both toolchains come from the shared compiler_args module.
_depthflowprojection_sources = [
    'depthflowprojection_cuda.cc',
    'depthflowprojection_cuda_kernel.cu',
]
_depthflowprojection_extension = CUDAExtension(
    'depthflowprojection_cuda',
    _depthflowprojection_sources,
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='depthflowprojection_cuda',
    ext_modules=[_depthflowprojection_extension],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/FilterInterpolation/FilterInterpolationLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import filterinterpolation_cuda as my_lib

#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py

class FilterInterpolationLayer(Function):
    """Autograd wrapper around the ``filterinterpolation_cuda`` extension.

    Intended to be used as
    ``FilterInterpolationLayer.apply(input1, input2, input3)``.  The three
    inputs are passed to the compiled extension unchanged and are saved for
    the backward pass; all of them must be contiguous.
    """

    def __init__(self):
        super(FilterInterpolationLayer,self).__init__()

    @staticmethod
    def forward(ctx, input1, input2, input3):
        """Run the extension's forward routine and return its output.

        Args:
            ctx: autograd context used to stash the inputs for ``backward``.
            input1, input2, input3: contiguous tensors handed to the
                extension.  The output has the same size as ``input1``.

        Returns:
            A new tensor shaped like ``input1`` filled in by the extension.

        Raises:
            AssertionError: if any input is not contiguous.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())
        assert(input3.is_contiguous())

        # Allocate the result with input1's dtype and on input1's device.
        # The previous code used a float32 tensor on the *current* CUDA device
        # (which need not be input1's device) and, on the CPU path, an
        # uninitialised buffer.
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.FilterInterpolationLayer_gpu_forward(input1, input2, input3, output)
            # Report a non-zero status the same way backward does instead of
            # silently returning an all-zero output.
            if err != 0:
                print(err)
        else:
            my_lib.FilterInterpolationLayer_cpu_forward(input1, input2, input3, output)

        ctx.save_for_backward(input1, input2, input3)
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Run the extension's backward routine.

        Args:
            ctx: autograd context holding the tensors saved by ``forward``.
            gradoutput: gradient with respect to the forward output.

        Returns:
            Tuple ``(gradinput1, gradinput2, gradinput3)``, one gradient per
            forward input, each shaped like the corresponding input.
        """
        input1, input2, input3 = ctx.saved_tensors

        # forward only ever gives the extension contiguous tensors; hold the
        # incoming gradient to the same standard (no copy if already so).
        gradoutput = gradoutput.contiguous()

        # Gradient buffers follow each input's dtype/device.  They used to be
        # CUDA float tensors unconditionally, which was wrong for the CPU
        # branch and for inputs living on a non-current GPU.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())
        gradinput3 = input3.new_zeros(input3.size())

        if input1.is_cuda:
            err = my_lib.FilterInterpolationLayer_gpu_backward(input1,input2, input3, gradoutput, gradinput1, gradinput2, gradinput3)
        else:
            err = my_lib.FilterInterpolationLayer_cpu_backward(input1, input2, input3, gradoutput, gradinput1, gradinput2, gradinput3)
        # A non-zero status is reported but does not abort (best effort).
        if err != 0 :
            print(err)

        return gradinput1, gradinput2,gradinput3

# calculate the weights of flow         
class WeightLayer(Function):
    """Legacy instance-style autograd Function computing the flow weights.

    ``forward``/``backward`` are instance methods and all state (inputs,
    output, device) is kept on ``self`` between the two calls.
    NOTE(review): recent PyTorch releases reject Functions with a non-static
    ``forward`` -- confirm against the PyTorch version this targets.

    Args:
        lambda_e, lambda_v, Nw: scalar parameters forwarded unchanged to the
            extension routines.
    """

    def __init__(self, lambda_e = 10.0/255.0, lambda_v = 1.0, Nw = 3):
        super(WeightLayer,self).__init__()
        self.lambda_e = lambda_e
        self.lambda_v = lambda_v
        self.Nw = Nw

    def forward(self, input1,input2,input3):
        """Compute a single-channel weight map.

        Args:
            input1: ref1 image.
            input2: ref2 image.
            input3: third tensor handed to the extension (the calling module
                passes the ref1 flow).

        Returns:
            Tensor of size ``(input1.size(0), 1, input1.size(2), input1.size(3))``.
        """
        # Cache contiguous versions; backward reuses exactly these tensors.
        self.input1 = input1.contiguous() # ref1 image
        self.input2 = input2.contiguous() # ref2 image
        self.input3 = input3.contiguous()

        if input1.is_cuda:
            self.device = torch.cuda.current_device()
        else:
            self.device = -1

        output =  torch.zeros(input1.size(0), 1 , input1.size(2), input1.size(3))

        # Hand the extension the contiguous copies.  Previously the raw
        # arguments were passed here while backward used the contiguous ones,
        # so a strided input reached the kernel with a different layout in
        # the two passes.
        if input1.is_cuda :
            output = output.cuda()
            err = my_lib.WeightLayer_gpu_forward(
                self.input1, self.input2, self.input3,
                output,
                self.lambda_e,  self.lambda_v, self.Nw
            )
        else:
            err = my_lib.WeightLayer_cpu_forward(
                self.input1, self.input2, self.input3,  output,
                self.lambda_e ,  self.lambda_v, self.Nw
            )
        # A non-zero status is reported but does not abort (best effort).
        if err != 0 :
            print(err)

        self.output = output # save this for fast back propagation
        return output

    #TODO: if there are multiple outputs of this function, then the order should be well considered?
    def backward(self, gradoutput):
        """Return gradients for ``input1``, ``input2`` and ``input3``.

        Args:
            gradoutput: gradient with respect to the forward output.

        Returns:
            Tuple ``(gradinput1, gradinput2, gradinput3)`` shaped like the
            cached forward inputs.
        """
        # Same layout guarantee as for the cached inputs (no copy if the
        # gradient is already contiguous).
        gradoutput = gradoutput.contiguous()

        gradinput1 = torch.zeros(self.input1.size())
        gradinput2 = torch.zeros(self.input2.size())
        gradinput3 = torch.zeros(self.input3.size())

        if self.input1.is_cuda:
            # Move the buffers to the device recorded during forward.
            gradinput1 = gradinput1.cuda(self.device)
            gradinput2 = gradinput2.cuda(self.device)
            gradinput3 = gradinput3.cuda(self.device)

            err = my_lib.WeightLayer_gpu_backward(
                self.input1,self.input2,self.input3, self.output,
                gradoutput,
                gradinput1, gradinput2, gradinput3,
                self.lambda_e,  self.lambda_v, self.Nw
            )
        else:
            err = my_lib.WeightLayer_cpu_backward(
                self.input1, self.input2,self.input3, self.output,
                gradoutput,
                gradinput1, gradinput2, gradinput3,
                self.lambda_e, self.lambda_v, self.Nw
            )
        if err != 0 :
            print(err)

        return gradinput1, gradinput2, gradinput3
  
class PixelValueLayer(Function):
    """Legacy instance-style autograd Function wrapping ``PixelValueLayer_*``.

    ``forward``/``backward`` are instance methods and the inputs are cached on
    ``self`` between the two calls.
    NOTE(review): recent PyTorch releases reject Functions with a non-static
    ``forward`` -- confirm against the PyTorch version this targets.

    Args:
        sigma_d, tao_r, Prowindow: scalar parameters forwarded unchanged to
            the extension routines.
    """

    def __init__(self, sigma_d = 3, tao_r = 0.05, Prowindow = 2 ):
        super(PixelValueLayer,self).__init__()

        self.sigma_d = sigma_d
        self.tao_r = tao_r #maybe not useable
        self.Prowindow = Prowindow

    def forward(self, input1, input3, flow_weights):
        """Compute the output, which has the same size as ``input1``.

        Args:
            input1: ref1 image.
            input3: ref1 flow.
            flow_weights: ref1 flow weights.

        Returns:
            Tensor with the same size as ``input1``.
        """
        # Cache contiguous versions; backward reuses exactly these tensors.
        self.input1 = input1.contiguous() # ref1 image
        self.input3 = input3.contiguous() # ref1 flow
        self.flow_weights = flow_weights.contiguous() # ref1 flow weights

        if input1.is_cuda:
            self.device = torch.cuda.current_device()
        else:
            self.device = -1

        output = torch.zeros(input1.size())

        # Hand the extension the contiguous copies.  Previously the raw
        # arguments were passed here while backward used the contiguous ones,
        # so a strided input reached the kernel with a different layout in
        # the two passes.
        if input1.is_cuda:
            output = output.cuda()
            err = my_lib.PixelValueLayer_gpu_forward(
                self.input1,  self.input3, self.flow_weights,   output,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        else:
            err = my_lib.PixelValueLayer_cpu_forward(
                self.input1,  self.input3, self.flow_weights, output,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        # A non-zero status is reported but does not abort (best effort).
        if err != 0 :
            print(err)

        return output

    #TODO: if there are multiple outputs of this function, then the order should be well considered?
    def backward(self, gradoutput):
        """Return gradients for ``input1``, ``input3`` and ``flow_weights``.

        Args:
            gradoutput: gradient with respect to the forward output.

        Returns:
            Tuple ``(gradinput1, gradinput3, gradflow_weights)`` shaped like
            the cached forward inputs.
        """
        # Same layout guarantee as for the cached inputs (no copy if the
        # gradient is already contiguous).
        gradoutput = gradoutput.contiguous()

        gradinput1 = torch.zeros(self.input1.size())
        gradinput3 = torch.zeros(self.input3.size())
        gradflow_weights = torch.zeros(self.flow_weights.size())

        if self.input1.is_cuda:
            # Move the buffers to the device recorded during forward.
            gradinput1 = gradinput1.cuda(self.device)
            gradinput3 = gradinput3.cuda(self.device)
            gradflow_weights = gradflow_weights.cuda(self.device)

            err = my_lib.PixelValueLayer_gpu_backward(
                self.input1,self.input3, self.flow_weights,
                gradoutput,
                gradinput1,  gradinput3, gradflow_weights,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        else:
            err = my_lib.PixelValueLayer_cpu_backward(
                self.input1,  self.input3, self.flow_weights,
                gradoutput,
                gradinput1,   gradinput3, gradflow_weights,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        if err != 0 :
            print(err)

        return gradinput1,  gradinput3, gradflow_weights

class PixelWeightLayer(Function):
    """Legacy instance-style autograd Function wrapping ``PixelWeightLayer_*``.

    ``forward``/``backward`` are instance methods; the inputs and the output
    are cached on ``self`` between the two calls.
    NOTE(review): recent PyTorch releases reject Functions with a non-static
    ``forward`` -- confirm against the PyTorch version this targets.

    Args:
        threshhold: scalar forwarded to the extension's backward routine only.
        sigma_d, tao_r, Prowindow: scalar parameters forwarded unchanged to
            the extension routines.
    """

    def __init__(self,threshhold, sigma_d =3, tao_r =0.05, Prowindow = 2 ):
        super(PixelWeightLayer,self).__init__()
        self.threshhold  = threshhold
        self.sigma_d = sigma_d
        self.tao_r = tao_r #maybe not useable
        self.Prowindow = Prowindow

    def forward(self, input3, flow_weights):
        """Compute a single-channel weight map.

        Args:
            input3: ref1 flow.
            flow_weights: ref1 flow weights.

        Returns:
            Tensor of size ``(input3.size(0), 1, input3.size(2), input3.size(3))``.
        """
        # Cache contiguous versions; backward reuses exactly these tensors.
        self.input3 = input3.contiguous() # ref1 flow
        self.flow_weights = flow_weights.contiguous() # ref1 flow weights

        if input3.is_cuda:
            self.device = torch.cuda.current_device()
        else:
            self.device = -1

        output =  torch.zeros([input3.size(0), 1, input3.size(2), input3.size(3)])

        # Hand the extension the contiguous copies.  Previously the raw
        # arguments were passed here while backward used the contiguous ones,
        # so a strided input reached the kernel with a different layout in
        # the two passes.
        if input3.is_cuda :
            output = output.cuda()
            err = my_lib.PixelWeightLayer_gpu_forward(
                self.input3, self.flow_weights,   output,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        else:
            err = my_lib.PixelWeightLayer_cpu_forward(
                self.input3, self.flow_weights, output,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        # A non-zero status is reported but does not abort (best effort).
        if err != 0 :
            print(err)

        self.output = output
        return output

    #TODO: if there are multiple outputs of this function, then the order should be well considered?
    def backward(self, gradoutput):
        """Return gradients for ``input3`` and ``flow_weights``.

        Args:
            gradoutput: gradient with respect to the forward output.

        Returns:
            Tuple ``(gradinput3, gradflow_weights)`` shaped like the cached
            forward inputs.
        """
        # Same layout guarantee as for the cached inputs (no copy if the
        # gradient is already contiguous).
        gradoutput = gradoutput.contiguous()

        gradinput3 = torch.zeros(self.input3.size())
        gradflow_weights = torch.zeros(self.flow_weights.size())

        if self.input3.is_cuda:
            # Move the buffers to the device recorded during forward.
            gradinput3 = gradinput3.cuda(self.device)
            gradflow_weights = gradflow_weights.cuda(self.device)

            err = my_lib.PixelWeightLayer_gpu_backward(
                self.input3, self.flow_weights,  self.output,
                gradoutput,
                gradinput3, gradflow_weights,
                self.threshhold,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        else:
            err = my_lib.PixelWeightLayer_cpu_backward(
                self.input3, self.flow_weights, self.output,
                gradoutput,
                gradinput3, gradflow_weights,
                self.threshhold,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        if err != 0 :
            print(err)

        return gradinput3, gradflow_weights
		
#class ReliableValueLayer(Function):
#    def __init__(self, Nw =3, tao_r =0.05, Prowindow = 2 ):
#        super(ReliableValueLayer,self).__init__()
#     
#        self.Nw = Nw
#        self.tao_r = tao_r #maybe not useable
#        self.Prowindow = Prowindow
#
#    def forward(self, input3, flow_weight1):
#
#        # assert(input1.is_contiguous())
#        # assert(input2.is_contiguous())
#        #self.input1 = input1.contiguous() # ref1 image
#        #self.input2 = input2.contiguous() # ref2 image
#        self.input3 = input3.contiguous() # ref1 flow
#        self.flow_weight1 = flow_weight1.contiguous() # ref1 flow weights
#
#        if input3.is_cuda:
#            self.device = torch.cuda.current_device()
#        else:
#            self.device = -1
#
#        output =  torch.zeros([intpu3.size(0), 1, input3.size(2), input3.size(3)])
#        #output2 =  torch.zeros(input1.size())
#        #weight1 =  torch.zeros(input1.size())
#        #weight2 =  torch.zeros(input1.size())
#        
#
#        if input1.is_cuda :
#            output = output.cuda()            
#            my_lib.ReliableValueLayer_gpu_forward(
#                        input3, flow_weight1, output,
#                        self.sigma_d,    self.tao_r ,  self.Prowindow )
#        else:
#            # output = torch.cuda.FloatTensor(input1.data.size())
#            my_lib.ReliableValueLayer_cpu_forward(
#                        input3, flow_weight1, output,
#                        self.sigma_d,    self.tao_r ,  self.Prowindow )
#
#        # the function returns the output to its caller
#        return output
#
#    #TODO: if there are multiple outputs of this function, then the order should be well considered?
#    def backward(self, gradoutput):
#        # print("Backward of Filter Interpolation Layer")
#        # gradinput1 = input1.new().zero_()
#        # gradinput2 = input2.new().zero_()
#        #gradinput1 = torch.zeros(self.input1.size())
#        #gradinput2 = torch.zeros(self.input2.size())
#        gradinput3 = torch.zeros(self.input3.size())
#        gradflow_weight1 = torch.zeros(self.flow_weight1.size())
#        
#        if self.input1.is_cuda:
#            # print("CUDA backward")
#            #gradinput1 = gradinput1.cuda(self.device)
#            #gradinput2 = gradinput2.cuda(self.device)
#            gradinput3 = gradinput3.cuda(self.device)
#            gradflow_weight1 = gradflow_weight1.cuda(self.device)
#
#            err = my_lib.ReliableValueLayer_gpu_backward(
#                     self.input3, self.flow_weight1, gradoutput, 
#                     gradinput3,    gradflow_weight1,                        
#                    self.sigma_d,    self.tao_r ,  self.Prowindow )
#            if err != 0 :
#                print(err)
#
#        else: 
#            # print("CPU backward")
#            # print(gradoutput)
#            err = my_lib.ReliableValueLayer_cpu_backward(
#                    self.input3,self.flow_weight1, gradoutput, 
#                    gradinput3,    gradflow_weight1,        
#                    self.sigma_d,    self.tao_r ,  self.Prowindow )
#            # print(err)
#            if err != 0 :
#                print(err)
#            # print(gradinput1)
#            # print(gradinput2)
#
#        # print(gradinput1)
#
#        return gradinput3,gradflow_weight1    
class ReliableWeightLayer(Function):
    """Legacy instance-style autograd Function wrapping ``ReliableWeightLayer_*``.

    ``forward``/``backward`` are instance methods; the input and the output
    are cached on ``self`` between the two calls.
    NOTE(review): recent PyTorch releases reject Functions with a non-static
    ``forward`` -- confirm against the PyTorch version this targets.

    Args:
        threshhold: scalar forwarded to the extension's backward routine only.
        sigma_d, tao_r, Prowindow: scalar parameters forwarded unchanged to
            the extension routines.
    """

    def __init__(self, threshhold, sigma_d =3, tao_r =0.05, Prowindow = 2 ):
        super(ReliableWeightLayer,self).__init__()

        self.threshhold = threshhold
        self.sigma_d = sigma_d
        self.tao_r = tao_r #maybe not useable
        self.Prowindow = Prowindow

    def forward(self, input3):
        """Compute a single-channel weight map from the flow.

        Args:
            input3: ref1 flow.

        Returns:
            Tensor of size ``(input3.size(0), 1, input3.size(2), input3.size(3))``.
        """
        # Cache a contiguous version; backward reuses exactly this tensor.
        self.input3 = input3.contiguous() # ref1 flow

        if input3.is_cuda:
            self.device = torch.cuda.current_device()
        else:
            self.device = -1

        output =  torch.zeros([input3.size(0), 1, input3.size(2), input3.size(3)] )

        # Hand the extension the contiguous copy.  Previously the raw argument
        # was passed here while backward used the contiguous one, so a strided
        # input reached the kernel with a different layout in the two passes.
        if input3.is_cuda :
            output = output.cuda()
            err = my_lib.ReliableWeightLayer_gpu_forward(
                self.input3, output,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        else:
            err = my_lib.ReliableWeightLayer_cpu_forward(
                self.input3, output,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        # A non-zero status is reported but does not abort (best effort).
        if err != 0 :
            print(err)

        self.output= output # used for inhibiting some unreliable gradients.
        return output

    #TODO: if there are multiple outputs of this function, then the order should be well considered?
    def backward(self, gradoutput):
        """Return the gradient for ``input3``.

        Args:
            gradoutput: gradient with respect to the forward output.

        Returns:
            Tensor shaped like the cached ``input3``.
        """
        # Same layout guarantee as for the cached input (no copy if the
        # gradient is already contiguous).
        gradoutput = gradoutput.contiguous()

        gradinput3 = torch.zeros(self.input3.size())

        if self.input3.is_cuda:
            # Move the buffer to the device recorded during forward.
            gradinput3 = gradinput3.cuda(self.device)

            err = my_lib.ReliableWeightLayer_gpu_backward(
                 self.input3,   self.output,
                 gradoutput,
                 gradinput3,
                 self.threshhold,
                 self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        else:
            err = my_lib.ReliableWeightLayer_cpu_backward(
                self.input3, self.output,
                gradoutput,
                gradinput3,
                self.threshhold,
                self.sigma_d,    self.tao_r ,  self.Prowindow
            )
        if err != 0 :
            print(err)

        return gradinput3

================================================
FILE: my_package/FilterInterpolation/FilterInterpolationModule.py
================================================
# my_package/FilterInterpolation/FilterInterpolationModule.py
from torch.nn import Module
import torch
from torch.autograd import Variable
from torch.autograd import gradcheck
from .FilterInterpolationLayer import FilterInterpolationLayer,WeightLayer, PixelValueLayer,PixelWeightLayer,ReliableWeightLayer

class FilterInterpolationModule(Module):
    """Parameter-free ``nn.Module`` front end for ``FilterInterpolationLayer``."""

    def __init__(self):
        super().__init__()

    def forward(self, input1, input2, input3):
        """Apply ``FilterInterpolationLayer`` to the three inputs and return its result."""
        interpolated = FilterInterpolationLayer.apply(input1, input2, input3)
        return interpolated

    # No backward is written for the module itself: the call goes through
    # FilterInterpolationLayer.apply, so autograd uses that Function's backward.

#class WeightModule(Module):
#    def __init__(self):
#        super(WeightModule, self).__init__()
#        self.f = WeightLayer()
#
#    def forward(self, input1, input2, input3):
#        return self.f(input1, input2, input3)
class AdaptiveWeightInterpolationModule(Module):
    """Blend two reference frames using flow-dependent weights.

    For each reference frame the module chains a ``WeightLayer``, a
    ``PixelValueLayer``, a ``PixelWeightLayer`` and a ``ReliableWeightLayer``;
    the two per-frame results are then merged into one three-channel image.

    Args:
        training: stored as ``self.training``; when falsy, pixels whose
            combined weight is at most ``10 * threshhold`` are zeroed in the
            result.
        threshhold: small constant added to every denominator; the pixel- and
            reliable-weight layers receive ``101 * threshhold``.
        lambda_e, lambda_v, Nw: forwarded to both ``WeightLayer`` instances.
        sigma_d, tao_r, Prowindow: forwarded to the pixel-value, pixel-weight
            and reliable-weight layers.
    """

    def __init__(self,  training = False, threshhold = 1e-6,
                 lambda_e = 30.0/255.0, lambda_v = 1.0, Nw = 3.0,
                 sigma_d =1.5,  tao_r = 0.05, Prowindow = 2 ):
        super().__init__()

        layer_threshold = 101 * threshhold

        # Layers driven by the first reference frame and its flow.
        self.calc_weight1 = WeightLayer(lambda_e, lambda_v, Nw)
        self.padder1 = torch.nn.ReplicationPad2d([0, 1, 0, 1])  # not used by forward
        self.interpolate1 = PixelValueLayer(sigma_d, tao_r, Prowindow)
        self.interpolate1_1 = PixelWeightLayer(layer_threshold, sigma_d, tao_r, Prowindow)
        self.interpolate_R1_1 = ReliableWeightLayer(layer_threshold, sigma_d, tao_r, Prowindow)

        # Layers driven by the second reference frame and its flow.
        self.calc_weight2 = WeightLayer(lambda_e, lambda_v, Nw)
        self.padder2 = torch.nn.ReplicationPad2d([0, 1, 0, 1])  # not used by forward
        self.interpolate2 = PixelValueLayer(sigma_d, tao_r, Prowindow)
        self.interpolate2_1 = PixelWeightLayer(layer_threshold, sigma_d, tao_r, Prowindow)
        self.interpolate_R2_1 = ReliableWeightLayer(layer_threshold, sigma_d, tao_r, Prowindow)

        self.training = training
        self.threshold = threshhold

    def _interpolate_side(self, weight_layer, value_layer, pixel_weight_layer,
                          reliable_layer, ref_image, other_image, flow):
        """Run one frame's layer chain; return its three channels and blend weight."""
        flow_weight = weight_layer(ref_image, other_image, flow)

        value = value_layer(ref_image, flow, flow_weight)
        val_r, val_g, val_b = torch.split(value, 1, dim=1)

        pixel_weight = pixel_weight_layer(flow, flow_weight)
        chan_r = val_r / (pixel_weight + self.threshold)
        chan_g = val_g / (pixel_weight + self.threshold)
        chan_b = val_b / (pixel_weight + self.threshold)

        reliable_weight = reliable_layer(flow)
        blend_weight = pixel_weight / (reliable_weight + self.threshold)
        return chan_r, chan_g, chan_b, blend_weight

    def forward(self, input1, input2, input3, input4):
        """Interpolate a frame from two references.

        Args:
            input1: ref1 image.
            input2: ref2 image.
            input3: ref1 flow.
            input4: ref2 flow.

        Returns:
            The blended r, g and b channels concatenated along dim 1.
        """
        i1_r, i1_g, i1_b, w1 = self._interpolate_side(
            self.calc_weight1, self.interpolate1,
            self.interpolate1_1, self.interpolate_R1_1,
            input1, input2, input3,
        )
        i2_r, i2_g, i2_b, w2 = self._interpolate_side(
            self.calc_weight2, self.interpolate2,
            self.interpolate2_1, self.interpolate_R2_1,
            input2, input1, input4,
        )

        w = w1 + w2
        i_r = (i1_r * w1 + i2_r * w2) / (w + self.threshold)
        i_g = (i1_g * w1 + i2_g * w2) / (w + self.threshold)
        i_b = (i1_b * w1 + i2_b * w2) / (w + self.threshold)

        if not self.training:
            # Zero every pixel whose combined weight is negligible.
            hole = w <= 10 * self.threshold
            i_r[hole] = 0
            i_g[hole] = 0
            i_b[hole] = 0
            w[hole] = 0

        return torch.cat((i_r, i_g, i_b), dim=1)


================================================
FILE: my_package/FilterInterpolation/__init__.py
================================================
from .FilterInterpolationModule import *


================================================
FILE: my_package/FilterInterpolation/filterinterpolation_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "filterinterpolation_cuda_kernel.cuh"


/**
 * Host entry point for the forward pass of the filter-interpolation layer.
 *
 * Validates the tensor geometry, gathers the strides the CUDA kernel indexes
 * with, and hands everything to FilterInterpolationLayer_gpu_forward_kernel
 * on the current CUDA stream.
 *
 * @param input1  Source tensor; its sizes are read as (batch, channel, h, w).
 * @param input2  Per-pixel offsets; must have sizes (batch, 2, h, w).
 * @param input3  Per-pixel filter taps; must have sizes (batch, k*k, h, w)
 *                for some integer k (k is passed on as the filter size).
 * @param output  Destination tensor. The kernel addresses it with input1's
 *                strides, so its batch and channel strides must match input1.
 * @return 1 if any validation check fails (nothing is launched), 0 on
 *         success. A failure reported by the launcher goes through AT_ERROR.
 */
int FilterInterpolationLayer_gpu_forward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		at::Tensor&  output
		)
{
	// Value returned by every failed validation check below.
	int error = 1;

	int channel = input1.size(1);
	int batch = input1.size(0);
	if (input2.size(0) != batch) return error;
	if (input2.size(1) != 2) return error;

	int h = input1.size(2);
	int w = input1.size(3);
	if (input2.size(2) != h) return error;
	if (input2.size(3) != w) return error;

	// The kernel reads input3 at every (batch, row, column) of input1, so the
	// filter tensor has to cover the same batch and spatial extent; otherwise
	// the kernel would read outside the filter buffer.
	if (input3.size(0) != batch) return error;
	if (input3.size(2) != h) return error;
	if (input3.size(3) != w) return error;

	// The channel dimension of input3 holds a flattened square filter.
	int filter_size2 = input3.size(1);
	int filter_size = (int) sqrt((float) filter_size2);
	// Reject channel counts that are not a perfect square instead of silently
	// ignoring the surplus taps.
	if (filter_size * filter_size != filter_size2) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);

	// The kernel adds the column index without multiplying by a stride, so the
	// innermost dimension of every input must be densely packed.
	if (input1_w_stride != 1) return error;
	if (input2_w_stride != 1) return error;
	if (input3_w_stride != 1) return error;
	// output is written through input1's batch/channel strides.
	if (input1_b_stride != output.stride(0)) return error;
	if (input1_c_stride != output.stride(1)) return error;

	int nElement = 0; // unused by the kernel; kept for the launcher signature

	error = FilterInterpolationLayer_gpu_forward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
			at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement, w, h, channel, batch, filter_size,

			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
			input3_b_stride, input3_c_stride, input3_h_stride, input3_w_stride,

			input1,
			input2,
			input3,
			output);
	if (error) { AT_ERROR("CUDA call failed"); }

	return error;
}
/**
 * Host entry point for the backward pass of the filter-interpolation layer.
 *
 * Validates the tensor geometry, gathers the strides the CUDA kernel indexes
 * with, and hands everything to FilterInterpolationLayer_gpu_backward_kernel
 * on the current CUDA stream.
 *
 * @param input1      Source tensor of the forward pass, sizes read as
 *                    (batch, channel, h, w).
 * @param input2      Per-pixel offsets; must have sizes (batch, 2, h, w).
 * @param input3      Per-pixel filter taps; must have sizes (batch, k*k, h, w)
 *                    for some integer k.
 * @param gradoutput  Gradient with respect to the forward output.
 * @param gradinput1  Receives the gradient for input1; batch/channel strides
 *                    must match input1.
 * @param gradinput2  Receives the gradient for input2; batch/channel strides
 *                    must match input2.
 * @param gradinput3  Receives the gradient for input3; batch/channel strides
 *                    must match input3.
 * @return 1 if any validation check fails (nothing is launched), 0 on
 *         success. A failure reported by the launcher goes through AT_ERROR.
 */
int FilterInterpolationLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2,
		at::Tensor&  gradinput3
		)
{
	// Value returned by every failed validation check below.
	int error = 1;

	int channel = input1.size(1);
	int batch = input1.size(0);
	if (input2.size(0) != batch) return error;
	if (input2.size(1) != 2) return error;

	int h = input1.size(2);
	int w = input1.size(3);
	if (input2.size(2) != h) return error;
	if (input2.size(3) != w) return error;

	// The kernel reads input3 (and writes gradinput3) at every
	// (batch, row, column) of input1, so the filter tensor has to cover the
	// same batch and spatial extent.
	if (input3.size(0) != batch) return error;
	if (input3.size(2) != h) return error;
	if (input3.size(3) != w) return error;

	// The channel dimension of input3 holds a flattened square filter.
	int filter_size2 = input3.size(1);
	int filter_size = (int) sqrt((float) filter_size2);
	// Reject channel counts that are not a perfect square instead of silently
	// ignoring the surplus taps.
	if (filter_size * filter_size != filter_size2) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);

	// The kernel adds the column index without multiplying by a stride, so the
	// innermost dimension of every input must be densely packed.
	if (input1_w_stride != 1) return error;
	if (input2_w_stride != 1) return error;
	if (input3_w_stride != 1) return error;

	// Each gradient buffer is addressed with the strides of its input, so the
	// batch and channel strides have to agree pairwise.
	if (input1_b_stride != gradinput1.stride(0)) return error;
	if (input2_b_stride != gradinput2.stride(0)) return error;
	if (input3_b_stride != gradinput3.stride(0)) return error;
	if (input1_c_stride != gradinput1.stride(1)) return error;
	if (input2_c_stride != gradinput2.stride(1)) return error;
	if (input3_c_stride != gradinput3.stride(1)) return error;

	int nElement = 0; // unused by the kernel; kept for the launcher signature

	error = FilterInterpolationLayer_gpu_backward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
			at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,
			w, h, channel, batch, filter_size,

			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
			input3_b_stride, input3_c_stride, input3_h_stride, input3_w_stride,

			input1,
			input2,
			input3,
			gradoutput,
			gradinput1,
			gradinput2,
			gradinput3
			);
	if (error) { AT_ERROR("CUDA call failed"); }

	return error;
}


// Python bindings: registers the two host entry points defined above under
// their C++ names, each with a short description string.
// TORCH_EXTENSION_NAME is not defined in this file; presumably it is supplied
// by the extension build -- confirm against setup.py.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("FilterInterpolationLayer_gpu_forward", &FilterInterpolationLayer_gpu_forward, "FilterInterpolation forward (CUDA)");
  m.def("FilterInterpolationLayer_gpu_backward", &FilterInterpolationLayer_gpu_backward, "FilterInterpolation backward (CUDA)");
}


================================================
FILE: my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cu
================================================
#include <stdio.h>

#include "filterinterpolation_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


// Function-like min/max used by the kernels in this file to clamp indices.
// Every argument is parenthesised so that expressions containing operators of
// lower precedence than '<' / '>' (e.g. '&', '?:') compare as written.
// Each argument may still be evaluated twice: do not pass expressions with
// side effects.
#define min(a,b) (((a)<(b))?(a):(b))
#define max(a,b) (((a)>(b))?(a):(b))

// When non-zero, the launchers print the block dimensions on every call.
#define DEBUG (0)
// Thread-block width (x) and height (y) used by the kernel launchers.
// Both can be overridden at compile time by pre-defining the macro.
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


/*
 * Forward kernel of the filter-interpolation layer.
 *
 * Thread mapping: blockIdx.z is the batch index; the x/y block and thread
 * indices together select one output pixel. Each thread processes every
 * channel of its pixel.
 *
 * For its pixel the thread reads the offset (fx, fy) from input2 and forms
 * the sampling position (x2, y2) = (column + fx, row + fy).
 *  - If that position lies inside the image and each offset component is
 *    smaller than half the corresponding image extent, a
 *    filter_size x filter_size window around the position is weighted with
 *    this pixel's taps from input3 (window coordinates are clamped to the
 *    image when reading input1). The window is split into four quadrants
 *    around ((int)x2, (int)y2) and the quadrant sums are blended bilinearly
 *    with the fractional parts of x2 and y2.
 *  - Otherwise the input1 value at the same pixel is copied to output.
 *
 * output is addressed with input1's strides. nElement and the *_w_stride
 * arguments are unused: the column index is added without a stride.
 */
template <typename scalar_t>
__global__ void FilterInterpolationLayer_gpu_forward_kernelfunc(
		const int nElement,
		const int w, 		const int h, 		const int channel, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,

		const scalar_t* __restrict__    input1,    		const scalar_t* __restrict__    input2,    	const scalar_t* __restrict__    input3, 	scalar_t*   output

		)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads of the last block row/column may fall outside the image.
	if (col >= w || row >= h) {
		return;
	}

	const int batch_i = blockIdx.z;

	// Offsets that do not depend on the channel or the filter tap.
	const int image_base = batch_i * input1_b_stride;
	const int pixel_in_plane = row * input1_h_stride + col;
	const int flow_base = batch_i * input2_b_stride + row * input2_h_stride + col;
	const int tap_base = batch_i * input3_b_stride + row * input3_h_stride + col;

	// Channel 0 of input2 is the x offset, channel 1 the y offset.
	const float fx = input2[flow_base + 0 * input2_c_stride];
	const float fy = input2[flow_base + 1 * input2_c_stride];

	const float x2 = (float)(col) + fx;
	const float y2 = (float)(row) + fy;

	const bool can_sample =
		x2 >= 0.0f && y2 >= 0.0f && x2 <= (float)(w - 1) && y2 <= (float)(h - 1)
		&& fabs(fx) < (float)(w) / 2.0f && fabs(fy) < (float)(h) / 2.0f;

	if (!can_sample) {
		// Sampling position rejected: pass the input pixel through unchanged.
		for (int c_i = 0; c_i < channel; c_i++) {
			const int idx = image_base + c_i * input1_c_stride + pixel_in_plane;
			output[idx] = input1[idx];
		}
		return;
	}

	// Integer part of the sampling position (x2, y2 are non-negative here).
	const int x_int = (int)(x2);
	const int y_int = (int)(y2);

	// Window [left, right) x [top, bottom) covered by the filter taps.
	const int left = x_int + 1 - (int)(filter_size / 2);
	const int top = y_int + 1 - (int)(filter_size / 2);
	const int right = left + filter_size;
	const int bottom = top + filter_size;

	// Bilinear blend factors: fractional parts of the sampling position.
	const float alpha = x2 - x_int;
	const float beta = y2 - y_int;

	// Sum of (clamped input1 pixel * filter tap) over the half-open window
	// [j_begin, j_end) x [i_begin, i_end) of the channel plane starting at
	// `plane`. Tap indices use the unclamped window coordinates.
	auto window_sum = [&](const int plane,
	                      const int j_begin, const int j_end,
	                      const int i_begin, const int i_end) -> float {
		float acc = 0.0f;
		for (int fj = j_begin; fj < j_end; fj++) {
			const int clamped_j = min(max(0, fj), h - 1);
			for (int fi = i_begin; fi < i_end; fi++) {
				const int clamped_i = min(max(0, fi), w - 1);
				acc += input1[plane + clamped_j * input1_h_stride + clamped_i] *
					input3[tap_base + ((fj - top) * filter_size + (fi - left)) * input3_c_stride];
			}
		}
		return acc;
	};

	for (int c_i = 0; c_i < channel; c_i++) {
		const int plane = image_base + c_i * input1_c_stride;

		// Quadrants relative to (x_int, y_int): rows/columns up to and
		// including the integer position belong to "top"/"left".
		const float TL = window_sum(plane, top, y_int + 1, left, x_int + 1);
		const float TR = window_sum(plane, top, y_int + 1, x_int + 1, right);
		const float BL = window_sum(plane, y_int + 1, bottom, left, x_int + 1);
		const float BR = window_sum(plane, y_int + 1, bottom, x_int + 1, right);

		output[plane + pixel_in_plane] =
			(1-alpha)*(1-beta)*TL +
			alpha*(1-beta)*TR +
			(1-alpha)*beta*BL +
			alpha*beta*BR;
	}
	return ;

}


/*
 * Backward kernel of the filter-interpolation layer.
 *
 * Thread mapping, as computed below: blockIdx.z is the batch index and the
 * x/y block and thread indices select one pixel (w_i, h_i); the thread loops
 * over all channels of that pixel.
 *
 * For a pixel whose sampling position (x2, y2) = (w_i + fx, h_i + fy) passes
 * the range test, the kernel
 *   - accumulates into gradinput1 (gradient of input1) and gradinput3
 *     (gradient of the filter taps) with atomicAdd, and
 *   - overwrites the two channels of gradinput2 (gradient of the offsets)
 *     at this pixel.
 * Pixels that fail the range test write nothing.
 *
 * gradoutput and gradinput1 are addressed with input1's strides, gradinput2
 * with input2's and gradinput3 with input3's. nElement and the *_w_stride
 * arguments are unused: the column index is added without a stride.
 *
 * NOTE(review): gradinput1/gradinput3 are only ever added to, and rejected
 * pixels leave gradinput2 untouched, so all three buffers presumably have to
 * be zero-filled by the caller -- confirm.
 * NOTE(review): the forward kernel in this file copies input1 to output for
 * rejected pixels, but no matching gradient is added to gradinput1 here --
 * confirm this is intended.
 * NOTE(review): the four quadrant sums TL/TR/BL/BR are recomputed identically
 * in step 2.1 and step 2.2 below; they could be computed once per channel.
 */
template <typename scalar_t>
__global__ void FilterInterpolationLayer_gpu_backward_kernelfunc(
		const int nElement, 	   const int w, 		const int h, 		const int channel, 	const int filter_size,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,

		const scalar_t* __restrict__      input1,        		const scalar_t* __restrict__      input2,		const scalar_t* __restrict__      input3,
		scalar_t* gradoutput,    		scalar_t*  gradinput1,  		scalar_t*  gradinput2,  		scalar_t*  gradinput3
		)
		{
	// blockIdx.z  : batch index
	// blockIdx.y  : index of the block along the image height
	// blockIdx.x  : index of the block along the image width
	// threadIdx.x : column within the block
	// threadIdx.y : row within the block
	// threadIdx.z : not used

	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	// Threads of the last block row/column may fall outside the image.
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	// Offset of this batch item in input1 / gradoutput / gradinput1.
	const int off  = batch_i * input1_b_stride;

	if(withinXbounds && withinYbounds){

		// Channel 0 of input2 is the x offset, channel 1 the y offset.
		float fx = input2[batch_i * input2_b_stride +  0 * input2_c_stride + h_i * input2_h_stride + w_i];
		float fy = input2[batch_i * input2_b_stride +  1 * input2_c_stride + h_i * input2_h_stride + w_i];

		// Sampling position addressed by this pixel.
		float x2 = float(w_i) + fx;
		float y2 = float(h_i) + fy;

		// Same acceptance test as the forward kernel: position inside the image
		// and each offset component smaller than half the image extent.
		if(x2 >= 0.0f  && y2 >= 0.0f && x2 <= (float)(w - 1) && y2 <= (float)(h -1)
            && fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){
			// Filter window [ix2_L, ix2_R) x [iy2_T, iy2_B) around the sampling position.
			int ix2_L = int(x2) + 1 - (int) (filter_size/2);
			int iy2_T = int(y2) + 1 - (int) (filter_size/2);
			int ix2_R = ix2_L + filter_size;
			int iy2_B = iy2_T + filter_size;

            // Bilinear blend factors: fractional parts of the sampling position.
            float alpha = x2 - (int)(x2);
            float beta = y2  - (int)(y2);
			/***
			  Step 1 (gradient for input1, the input image) and
			  Step 3 (gradient for input3, the filter taps) are computed together,
			  one window quadrant at a time. Each quadrant carries the bilinear
			  weight it has in the forward blend.
			 ***/
			for (int c_i = 0 ; c_i < channel; c_i++){

				float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];

                // Top-left quadrant, weight (1 - alpha) * (1 - beta).
                float TL_grad = gradoutput_value * (1-alpha ) * (1-beta);
                for(int filter_j = iy2_T; filter_j <= (int) (y2) ; filter_j ++ ){
                    int _filter_j = min(max(0, filter_j), h - 1); // clamped row, only used to address input1 / gradinput1
                    for (int filter_i = ix2_L   ; filter_i <= (int)(x2) ; filter_i ++){
                    int _filter_i = min(max(0, filter_i), w - 1); // clamped column, only used to address input1 / gradinput1
                    // The target pixel depends on the offset data, so several
                    // threads may add to the same gradinput1 element.
                    atomicAdd( &gradinput1[off +c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
                                TL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
                                                                input3_c_stride + h_i * input3_h_stride + w_i]);
                    // gradinput3 is indexed by this thread's own pixel and the
                    // tap index; contributions of all channels are summed.
                    atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
                                                                        input3_c_stride + h_i * input3_h_stride + w_i],
                                TL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);

                    }
                }

                // Top-right quadrant, weight alpha * (1 - beta).
                float TR_grad= gradoutput_value * alpha * ( 1- beta);
                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1 / gradinput1
                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1 / gradinput1

                    atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
                                TR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
                                                                input3_c_stride + h_i * input3_h_stride + w_i]);
                    atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
                                                                        input3_c_stride + h_i * input3_h_stride + w_i],
                                TR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);

                    }
                    }

                   // Bottom-left quadrant, weight (1 - alpha) * beta.
                   float BL_grad = gradoutput_value * ( 1 - alpha ) * beta;
                   for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
                        int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1 / gradinput1
                        for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
                            int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1 / gradinput1

                        atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
                                    BL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
                                                                    input3_c_stride + h_i * input3_h_stride + w_i]);
                        atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
                                                                            input3_c_stride + h_i * input3_h_stride + w_i],
                                    BL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);

                    }
                    }

                // Bottom-right quadrant, weight alpha * beta.
                float BR_grad = gradoutput_value * alpha * beta;
                 for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1 / gradinput1
                    for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
                        int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1 / gradinput1
                        atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],
                                    BR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *
                                                                    input3_c_stride + h_i * input3_h_stride + w_i]);
                        atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *
                                                                            input3_c_stride + h_i * input3_h_stride + w_i],
                                    BR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);
                        }
                }

			}

			/***
			  Step 2: calculate the gradients for input2, i.e. the offsets.
			  STEP 2.1: x / horizontal component.
			  Derivative of the forward blend with respect to alpha:
			    (1 - beta) * (TR - TL) + beta * (BR - BL),
			  summed over channels with the incoming gradient as weight.
			 ***/
            float gamma  =  1.0f - beta; //iy2_B - y2;
			float bot_diff = 0.0f;
			for(int c_i =0 ; c_i< channel; c_i ++ ){
				float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];

    // Recompute the four quadrant sums exactly as the forward kernel does.
    float TL = 0.0f;
                for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){
                    int _filter_j = min(max(0, filter_j), h - 1);
                    for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i ), w - 1);
                    TL += input1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] *
							input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;
                    }
                }

                float TR = 0.0f;
                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1
                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1
                    TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
                }
                }

                float BL = 0.0f;
                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1
                for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1
                    BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
                }
                }

                float BR = 0.0f;
                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1
                for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1
                    BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
                }
                }

	            // gamma == 1 - beta here.
	            float temp = 0.0f;
                temp += gamma * (TR - TL);
                temp += (1-gamma) * (BR - BL);
                bot_diff += gradoutput_value * temp;
			}
			// Gradient of the x / horizontal offset (plain store, not accumulated).
			gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;

			/***
			  STEP 2.2: y / vertical component.
			  Derivative of the forward blend with respect to beta:
			    (1 - alpha) * (BL - TL) + alpha * (BR - TR),
			  summed over channels with the incoming gradient as weight.
			 ***/
            gamma =  1.0f - alpha; //ix2_R -x2;
			bot_diff = 0.0f;
			for(int c_i = 0 ; c_i < channel; c_i ++ ){
				float gradoutput_value = gradoutput [ off + c_i * input1_c_stride + h_i * input1_h_stride +w_i];

                // Same four quadrant sums as in step 2.1.
                float TL = 0.0f;
                for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){
                    int _filter_j = min(max(0, filter_j), h - 1);
                    for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i ), w - 1);
                    TL += input1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] *
							input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;
                    }
                }

                float TR = 0.0f;
                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1
                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1
                    TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
                }
                }

                float BL = 0.0f;
                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1
                for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1
                    BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
                }
                }

                float BR = 0.0f;
                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){
                    int _filter_j = min(max(0, filter_j),h - 1); // clamped row, only used to address input1
                for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){
                    int _filter_i = min(max(0, filter_i),w - 1);// clamped column, only used to address input1
                    BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *
                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];
                }
                }

                // gamma == 1 - alpha here.
                float temp = 0.0f;
                temp += gamma * (BL - TL);
                temp += (1.0f - gamma) * ( BR - TR);
                bot_diff += gradoutput_value * temp;

			}
			// Gradient of the y / vertical offset (plain store, not accumulated).
			gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;
		}
	}
	return ;

}


/*
 * Launches the forward CUDA kernel of the filter-interpolation layer.
 *
 * The grid has one thread per pixel and batch item: blocks of
 * BLOCKDIMX x BLOCKDIMY threads tile the w x h image (rounded up) and the
 * grid's z dimension is the batch size. The kernel is dispatched on the
 * scalar type of input1 and enqueued on `stream`.
 *
 * Sizes and strides are forwarded to the kernel unchanged; `batch` is only
 * used for the grid size. No validation is performed here.
 *
 * Returns 0 if cudaGetLastError() reports success after the launch, otherwise
 * prints the CUDA error string and returns 1.
 */
int FilterInterpolationLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const  int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,

		at::Tensor&  input1,    		at::Tensor&  input2,    	at::Tensor&  input3, 	at::Tensor&  output

		)
{
	int error = 1 ;

	// One thread per pixel; all channels of a pixel are handled by one thread.
	dim3 block = dim3(BLOCKDIMX, BLOCKDIMY, 1);
	dim3 grid = dim3((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report non-default block dimensions (or always, when DEBUG is set).
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// The label is what the dispatch macro reports for an unsupported scalar
	// type, so it names this layer and pass.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "FilterInterpolationLayer_gpu_forward", ([&] {
		FilterInterpolationLayer_gpu_forward_kernelfunc<scalar_t><<<grid, block, 0, stream>>>(
			nElement,
			w, h, channel, filter_size,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
			input3_b_stride, input3_c_stride, input3_h_stride, input3_w_stride,

			input1.data<scalar_t>(), input2.data<scalar_t>(), input3.data<scalar_t>(), output.data<scalar_t>()
			);
	}));

	// Picks up launch-configuration errors (and any earlier sticky error).
	cudaError_t err = cudaGetLastError();

	if (err != cudaSuccess) {
		printf("gpuerror in FilterInterpolationLayer_gpu_forward_kernel: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;

}

// Host-side launcher for the FilterInterpolation backward CUDA kernel.
//
// Launches FilterInterpolationLayer_gpu_backward_kernelfunc on `stream`, tiling
// the w x h plane with BLOCKDIMX x BLOCKDIMY thread blocks and using grid.z to
// index the batch. The kernel is instantiated for the floating-point scalar
// type of `input1`; the data pointers of the inputs, of `gradoutput` and of the
// three gradient buffers are read with that same scalar type.
//
// This function does not clear gradinput1/2/3 itself (the cudaMemset calls that
// once did so were disabled in the original source).
//
// Returns 0 when cudaGetLastError() reports success after the launch, and 1
// (after printing the CUDA error string) otherwise.
int FilterInterpolationLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,    		const int h,    		const int channel,  		const int batch,    		const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,

		at::Tensor&  input1,        		at::Tensor&  input2,		at::Tensor&  input3,

		at::Tensor&  gradoutput,    		at::Tensor&  gradinput1,  		at::Tensor&  gradinput2,  		at::Tensor&  gradinput3
		)
{
	// threadIdx.x walks the width, threadIdx.y the height; one grid z-slice per batch item.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX, BLOCKDIMY);

	// The dispatch label shows up in ATen's "not implemented for <type>" errors,
	// so it must name this layer rather than a different one.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "FilterInterpolationLayer_gpu_backward", ([&] {
		FilterInterpolationLayer_gpu_backward_kernelfunc<<<grid, block, 0, stream>>>(
			nElement,
			w, h, channel, filter_size,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
			input3_b_stride, input3_c_stride, input3_h_stride, input3_w_stride,

			input1.data<scalar_t>(), input2.data<scalar_t>(), input3.data<scalar_t>(), gradoutput.data<scalar_t>(),
			gradinput1.data<scalar_t>(), gradinput2.data<scalar_t>(), gradinput3.data<scalar_t>()
			);
	}));

	// Only launch-time errors are visible here; the kernel itself runs asynchronously.
	const cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in FilterInterpolationLayer_gpu_backward: %s\n", cudaGetErrorString(err));
		return 1;
	}

	return 0;
}


================================================
FILE: my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Launches the FilterInterpolation forward CUDA kernel on `stream`.
//
// w, h, channel, batch and filter_size plus the per-tensor b/c/h/w strides are
// passed through to the device kernel; input1/input2/input3 are read and
// `output` is written through their raw data pointers.
// Returns 0 on success and 1 if the CUDA runtime reports an error after launch.
int FilterInterpolationLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const  int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,

		at::Tensor& input1,    		at::Tensor& input2,    	at::Tensor& input3, 	at::Tensor& output

		);

// Launches the FilterInterpolation backward CUDA kernel on `stream`.
//
// Takes the same geometry/stride arguments as the forward launcher, the three
// forward inputs, the incoming gradient `gradoutput`, and the three gradient
// buffers gradinput1/2/3 that the device kernel receives as output pointers.
// Returns 0 on success and 1 if the CUDA runtime reports an error after launch.
int FilterInterpolationLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,    		const int h,    		const int channel,  		const int batch,    		const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,

		at::Tensor& input1,        		at::Tensor& input2,		at::Tensor& input3,

		at::Tensor& gradoutput,    		at::Tensor& gradinput1,  		at::Tensor& gradinput2,  		at::Tensor& gradinput3
		);


================================================
FILE: my_package/FilterInterpolation/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Build the ``filterinterpolation_cuda`` extension from its C++ binding and
# CUDA kernel sources, using the compiler flags shared via ``compiler_args``.
_extension_sources = [
    'filterinterpolation_cuda.cc',
    'filterinterpolation_cuda_kernel.cu',
]
_compile_flags = {'cxx': cxx_args, 'nvcc': nvcc_args}
_cuda_extension = CUDAExtension(
    'filterinterpolation_cuda',
    _extension_sources,
    extra_compile_args=_compile_flags,
)

setup(
    name='filterinterpolation_cuda',
    ext_modules=[_cuda_extension],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/FlowProjection/FlowProjectionLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import flowprojection_cuda as my_lib

#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py

class FlowProjectionLayer(Function):
    """Autograd function wrapping the ``flowprojection_cuda`` extension.

    Use it through ``FlowProjectionLayer.apply(input1, requires_grad)``.

    ``input1`` is a contiguous flow tensor of shape (B, 2, H, W); the native
    wrapper rejects any other channel count. The result has the same shape as
    ``input1``. A (B, 1, H, W) ``count`` buffer, filled by the native forward
    pass, is kept for the backward pass.
    """

    def __init__(self, requires_grad):
        # Kept only for callers that still instantiate the function; the
        # static forward/backward below never read this attribute.
        super(FlowProjectionLayer, self).__init__()
        self.requires_grad = requires_grad

    @staticmethod
    def forward(ctx, input1, requires_grad):
        """Run the native forward projection.

        Args:
            ctx: autograd context.
            input1: contiguous (B, 2, H, W) flow tensor.
            requires_grad: when exactly ``False`` the native hole-filling
                pass is enabled; any other value disables it.

        Returns:
            Tensor with the same shape, dtype and device as ``input1``.
        """
        assert(input1.is_contiguous())

        fillhole = 1 if requires_grad == False else 0

        # Allocate on the input's own device and with its dtype: the native
        # code reads every buffer with the scalar type of input1, and both
        # buffers must exist before either backend is called.
        count = input1.new_zeros((input1.size(0), 1, input1.size(2), input1.size(3)))
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.FlowProjectionLayer_gpu_forward(input1, count, output, fillhole)
        else:
            # NOTE(review): the bundled extension only registers the GPU
            # entry points, so this path needs a build that also provides
            # the CPU functions.
            err = my_lib.FlowProjectionLayer_cpu_forward(input1, count, output, fillhole)
        if err != 0:
            print(err)

        ctx.save_for_backward(input1, count)
        ctx.fillhole = fillhole

        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Compute the gradient with respect to ``input1``.

        Returns:
            ``(gradinput1, None)``; ``requires_grad`` is not differentiable.
        """
        # forward() saved exactly two tensors, so unpack exactly two.
        input1, count = ctx.saved_tensors

        # The native kernel indexes gradoutput with input1's strides, so it
        # must have the same (contiguous) memory layout.
        gradoutput = gradoutput.contiguous()
        gradinput1 = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.FlowProjectionLayer_gpu_backward(input1, count, gradoutput, gradinput1)
        else:
            err = my_lib.FlowProjectionLayer_cpu_backward(input1, count, gradoutput, gradinput1)
        if err != 0:
            print(err)

        return gradinput1, None

class FlowFillholelayer(Function):
    """Instance-style autograd function around the native fill-hole forward call."""

    def __init__(self):
        super(FlowFillholelayer, self).__init__()

    def forward(self, input1):
        """Call the native fill-hole routine on ``input1`` and return its output.

        A contiguous copy of the input and the device index (-1 on CPU) are
        stored on the instance.
        """
        # Cached because a backward pass would need the input again.
        self.input1 = input1.contiguous()

        on_gpu = input1.is_cuda
        self.device = torch.cuda.current_device() if on_gpu else -1

        # Result buffer: zero-initialised, same size as the input.
        filled = torch.zeros(input1.size())

        if on_gpu:
            filled = filled.cuda()
            status = my_lib.FlowFillholelayer_gpu_forward(input1, filled)
        else:
            status = my_lib.FlowFillholelayer_cpu_forward(input1, filled)

        # A non-zero status is only reported, not raised.
        if status != 0:
            print(status)

        return filled

    #TODO: if there are multiple outputs of this function, then the order should be well considered?
    # def backward(self, gradoutput):
    #     # print("Backward of Filter Interpolation Layer")
    #     # gradinput1 = input1.new().zero_()
    #     # gradinput2 = input2.new().zero_()
    #     gradinput1 = torch.zeros(self.input1.size())
    #     if self.input1.is_cuda:
    #         # print("CUDA backward")
    #         gradinput1 = gradinput1.cuda(self.device)
    #         err = my_lib.FlowProjectionLayer_gpu_backward(self.input1, self.count, gradoutput, gradinput1)
    #         # print(err)
    #         if err != 0 :
    #             print(err)
    #
    #     else:
    #         # print("CPU backward")
    #         # print(gradoutput)
    #         err = my_lib.FlowProjectionLayer_cpu_backward(self.input1, self.count,  gradoutput, gradinput1)
    #         # print(err)
    #         if err != 0:
    #             print(err)
    #         # print(gradinput1)
    #         # print(gradinput2)
    #
    #     # print(gradinput1)
    #
    #     return gradinput1

================================================
FILE: my_package/FlowProjection/FlowProjectionModule.py
================================================
# modules/FlowProjectionModule.py
from torch.nn import Module
from .FlowProjectionLayer import FlowProjectionLayer #, FlowFillholeLayer

class FlowProjectionModule(Module):
    """``nn.Module`` wrapper around :class:`FlowProjectionLayer`.

    Args:
        requires_grad: forwarded to ``FlowProjectionLayer`` on every call.
    """

    def __init__(self, requires_grad = True):
        super(FlowProjectionModule, self).__init__()

        self.requires_grad = requires_grad

    def forward(self, input1):
        """Apply the flow projection to ``input1`` and return the result."""
        # FlowProjectionLayer defines static forward/backward, so it has to be
        # invoked through ``apply``; calling an instance would never pass
        # ``requires_grad`` to ``forward``.
        return FlowProjectionLayer.apply(input1, self.requires_grad)

# class FlowFillholeModule(Module):
#     def __init__(self,hole_value = -10000.0):
#         super(FlowFillholeModule, self).__init__()
#         self.f = FlowFillholeLayer()
#
#     def forward(self, input1):
#         return self.f(input1)

    # We do not need to write backward code for a Module: autograd uses the backward() of the Function it calls.


================================================
FILE: my_package/FlowProjection/__init__.py
================================================
from  .FlowProjectionModule import *

================================================
FILE: my_package/FlowProjection/flowprojection_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "flowprojection_cuda_kernel.cuh"

// Python-facing entry point for the flow-projection forward pass on the GPU.
//
// input1   : flow tensor indexed as (batch, channel, h, w); channel must be 2.
// count    : per-pixel hit counter; must be (batch, 1, h, w).
// output   : result buffer; must share input1's batch and channel strides.
// fillhole : non-zero enables the hole-filling kernel.
//
// Returns 1 without launching anything when a shape/stride check fails,
// 0 on success, and raises through AT_ERROR when the launcher reports a
// CUDA failure.
int FlowProjectionLayer_gpu_forward(
		at::Tensor&  input1,
		at::Tensor&  count,
		at::Tensor&  output,
		int fillhole
		)
{

	int error = 1 ;

	int channel = input1.size( 1);
	if(channel!= 2) return error;
	int batch = input1.size(0);
	// Same shape checks as the backward wrapper: the kernel writes into count
	// with atomics, so a mismatched buffer would be written out of bounds.
	if(count.size(0) != batch) return error;
	if(count.size(1) != 1) return error;

	int h = input1.size(2);
	int w = input1.size(3);
	if(count.size(2) != h) return error;
	if(count.size(3) != w) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int count_b_stride = count.stride(0);
	int count_c_stride = count.stride(1);
	int count_h_stride = count.stride(2);
	int count_w_stride = count.stride(3);
	//TODO: do we need to assert the w_stride to be 1
	//if(w_stride !=1) return error;
	// The kernel addresses output with input1's strides.
	if(input1_b_stride != output.stride(0)) return error;
	if(input1_c_stride != output.stride(1)) return error;

	int	nElement = 0;//UNUSED
	error = FlowProjection_gpu_forward_kernel(
           at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,w,h,channel,batch,fillhole,

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1,
			count,
			output);
	  if (error) {AT_ERROR("CUDA call failed");}

	return error;

}

// Python-facing entry point for the flow-projection backward pass on the GPU.
//
// Validates that input1 has two channels, that count is (batch, 1, h, w) and
// that gradinput1 shares input1's batch/channel strides, then hands the work
// to FlowProjection_gpu_backward_kernel on the current CUDA stream.
// Returns 1 when a check fails, 0 on success, and raises through AT_ERROR
// when the launcher reports a failure.
int FlowProjectionLayer_gpu_backward(
		at::Tensor&  input1,
        at::Tensor&  count,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1
		)
{
	const int kFailure = 1;

	// Flow fields carry exactly two channels.
	const int channel = input1.size(1);
	if (channel != 2) {
		return kFailure;
	}

	// count holds one plane per batch item ...
	const int batch = input1.size(0);
	if (count.size(0) != batch || count.size(1) != 1) {
		return kFailure;
	}

	// ... with the same spatial extent as the flow.
	const int h = input1.size(2);
	const int w = input1.size(3);
	if (count.size(2) != h || count.size(3) != w) {
		return kFailure;
	}

	const int in_b = input1.stride(0);
	const int in_c = input1.stride(1);
	const int in_h = input1.stride(2);
	const int in_w = input1.stride(3);

	const int cnt_b = count.stride(0);
	const int cnt_c = count.stride(1);
	const int cnt_h = count.stride(2);
	const int cnt_w = count.stride(3);

	// The kernel addresses gradinput1 with input1's strides.
	if (in_b != gradinput1.stride(0) || in_c != gradinput1.stride(1)) {
		return kFailure;
	}

	const int unused_element_count = 0;

	const int status = FlowProjection_gpu_backward_kernel(
			at::cuda::getCurrentCUDAStream(),
			unused_element_count,
			w, h, channel, batch,
			in_b, in_c, in_h, in_w,
			cnt_b, cnt_c, cnt_h, cnt_w,
			input1,
			count,
			gradoutput,
			gradinput1);
	if (status) {AT_ERROR("CUDA call failed");}

	return status;
}


// Python bindings for the extension module. Only the two GPU entry points
// defined in this file are registered.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("FlowProjectionLayer_gpu_forward", &FlowProjectionLayer_gpu_forward, "FlowProjection forward (CUDA)");
  m.def("FlowProjectionLayer_gpu_backward", &FlowProjectionLayer_gpu_backward, "FlowProjection backward (CUDA)");
}


================================================
FILE: my_package/FlowProjection/flowprojection_cuda_kernel.cu
================================================
#include <stdio.h>

#include "flowprojection_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


#define min(a,b) ((a<b)?(a):(b))
#define max(a,b) ((a>b)?(a):(b))

#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


// Forward pass, accumulation step.
//
// Each in-bounds thread handles one source pixel (h_i, w_i) of one batch item:
// it reads the flow (fx, fy) there, computes the target position
// (w_i + fx, h_i + fy) and, if that position lies inside the image, atomically
// adds (-fx, -fy) to `output` and 1 to `count` at the four surrounding integer
// pixels (right/bottom neighbours clamped to the last column/row).
// The accumulated sums are not normalised here.
//
// NOTE: the width index is added to the offsets directly, i.e. input1_w_stride
// and count_w_stride are not applied, and `output` is addressed with input1's
// strides. Flow values are read into `float` regardless of scalar_t.
template <typename scalar_t>
__global__ void FlowProjection_gpu_forward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__    input1,
		scalar_t*  count,
		scalar_t*  output
		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//only use one dimensioon of the grid and block
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;

	//    __syncthreads();
//	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {
        // channel 0 holds the horizontal flow, channel 1 the vertical flow
        float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ];
        float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ];

        // target position of this pixel after applying its flow
        float x2 = (float) (w_i) + fx;
        float y2 = (float) (h_i) + fy;
        // flows that land outside the image contribute nothing
        if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){
            // the (int) casts truncate; x2 and y2 are non-negative here
            int ix2_L = (int) (x2);
            int iy2_T = (int) (y2);
            int ix2_R = min(ix2_L + 1, w - 1);
            int iy2_B = min(iy2_T + 1, h - 1);

            // several source pixels may hit the same target, hence the atomics
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ,-fx);
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ],-fx);
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ,-fx);
            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ],-fx);

            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] , -fy);
            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]  , -fy);
            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]  , -fy);
            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]  , -fy);

            // record one hit per written target pixel
            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L], 1);
            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] , 1);
            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] , 1);
            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] , 1);
        }
	}
	return ;

}
// Forward pass, normalisation step.
//
// Each in-bounds thread handles one pixel of one batch item: if `count` at that
// pixel is positive, both channels of `output` at that pixel are divided by it.
// Pixels with a non-positive count are left untouched. `input1` is not read.
template <typename scalar_t>
__global__ void FlowProjectionAveraging_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__      input1,
		scalar_t* count,
		scalar_t* output
		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//only use one dimensioon of the grid and block
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;

	//    __syncthreads();
//	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {
	    // number of hits recorded for this pixel
	    float temp =count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;
        if(temp > 0.0f){
            // turn the accumulated sums into averages
            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;
            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;
        }
	}
	return ;

}

// Forward pass, optional hole-filling step.
//
// Each in-bounds thread handles one pixel of one batch item. If `count` at that
// pixel is <= 0 (a hole), the thread walks left, right, up and down along the
// pixel's row/column until it meets a pixel with a non-zero count or reaches
// the image border. The hole's two `output` channels are then set to the
// unweighted mean of the `output` values at the neighbours that were found.
// If no direction yields a neighbour the pixel is left unchanged.
// `input1` is not read and `count` is not modified.
template <typename scalar_t>
__global__ void FlowFillhole_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__ input1,
		scalar_t*	count,
		scalar_t*	output
		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//only use one dimensioon of the grid and block
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	const int off = batch_i * input1_b_stride;

	//    __syncthreads();
//	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {
	    float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;
        // only pixels that received no projection are filled
        if(temp <= 0.0f){
            //search along the four directions,0/90/180/270, until finding at least one
            // each *_temp ends up as the count of the neighbour found, or 0 if the border was reached first
            int left_offset = w_i;            float left_temp = 0.0f;
            while(left_temp == 0.0f && left_offset - 1 >= 0){
                left_offset = left_offset - 1;
                left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ;
            }

            int right_offset = w_i ;            float right_temp = 0.0f;
            while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){
                right_offset  = right_offset + 1 ;
                right_temp =  count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ;
            }

            int up_offset = h_i ;            float up_temp = 0.0f;
            while(up_temp == 0.0f && up_offset - 1 >=0){
                up_offset = up_offset - 1;
                up_temp =  count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ;
            }

            int down_offset = h_i;            float down_temp = 0.0f;
            while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){
                down_offset = down_offset + 1;
                down_temp =  count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ;
            }

            // nothing found in any direction: leave the hole as it is
            if(left_temp + right_temp + up_temp + down_temp <=0.0f){
                //printf("Can't fill hole, find no neighbor vectors availabel\n");
                return;
            }

            // from here on the *_temp values are 0/1 flags: "a neighbour was found in this direction"
            left_temp = (left_temp > 0.0f)?1:0;
            right_temp = (right_temp > 0.0f)?1:0;
            up_temp = (up_temp > 0.0f)?1:0;
            down_temp = (down_temp > 0.0f)?1:0;

            // channel 0: mean over the neighbours that were found
            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = (
                left_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +
                right_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+
                up_temp *  output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +
                down_temp *  output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]
            )/(
                left_temp + right_temp + up_temp + down_temp
            ) ;


            // channel 1: same averaging
            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =(
                left_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +
                right_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+
                up_temp *  output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] +
                down_temp *  output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]
            )/(
                left_temp + right_temp + up_temp + down_temp
            ) ;
        }
	}
	return ;

}
// Backward pass of the flow projection.
//
// Each in-bounds thread handles one source pixel (h_i, w_i) of one batch item.
// It recomputes the same four target pixels as the forward accumulation kernel
// and, when the target position lies inside the image, adds
// -gradoutput[target] / count[target] for each of the four targets to
// gradinput1 at the source pixel, separately for channel 0 and channel 1.
//
// Contributions are added with `+=`, so whatever gradinput1 already contains
// is kept. Each thread writes only its own source pixel; no atomics are used.
// `gradoutput` is addressed with input1's strides.
template <typename scalar_t>
__global__ void FlowProjection_gpu_backward_kernelfunc(
		const int nElement,  	const int w, 	const int h, const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__        input1,
		const scalar_t* __restrict__       count,
		const scalar_t* __restrict__       gradoutput,
		scalar_t*   gradinput1
		)
{
	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	const int off  = batch_i * input1_b_stride;

	//    __syncthreads();

	if(withinXbounds && withinYbounds){
        float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;
        float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;

        // same target position and bounds test as in the forward kernel
        float x2 = (float) ( w_i ) + fx;
        float y2 = (float) ( h_i ) + fy;
        if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){
            int ix2_L = (int)(x2);
            int iy2_T = (int)(y2);
            int ix2_R  = min(ix2_L + 1, w-1);
            int iy2_B  = min(iy2_T + 1, h-1);

            // gradient w.r.t. the horizontal flow component of this source pixel
            int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;
            gradinput1[iu_offset] += -  gradoutput[off +  0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/
                                        count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L]  ;
            gradinput1[iu_offset] += -    gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]/
                                         count[batch_i * count_b_stride +0 + iy2_T * count_h_stride  + ix2_R]          ;
            gradinput1[iu_offset ] += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/
                                         count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]  ;
            gradinput1[iu_offset ]  += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/
                                         count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R]   ;

            // gradient w.r.t. the vertical flow component of this source pixel
            int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;
            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/
                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L]  ;
            gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]/
                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]  ;
            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/
                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]     ;
            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/
                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R]   ;
        }
	}
	return ;

}


// Host-side launcher for the flow-projection forward pass.
//
// Runs up to three kernels back to back on `stream`, all with the same launch
// geometry (BLOCKDIMX x BLOCKDIMY blocks tiling the w x h plane, grid.z = batch)
// and all instantiated for the floating-point scalar type of `input1`:
//   1. FlowProjection_gpu_forward_kernelfunc  - accumulate into output / count
//   2. FlowProjectionAveraging_kernelfunc     - divide output by count
//   3. FlowFillhole_kernelfunc                - only when `fillhole` is non-zero
//
// Returns 0 when every launch succeeded; returns 1 (after printing which kernel
// failed and the CUDA error string) as soon as cudaGetLastError() reports an
// error, without launching the remaining kernels.
int FlowProjection_gpu_forward_kernel(
		cudaStream_t stream, 		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,
		at::Tensor&  count,
		at::Tensor&  output
		)
{
	// threadIdx.x walks the width, threadIdx.y the height; one grid z-slice per batch item.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX, BLOCKDIMY);

	// Step 1: scatter the negated flow to its target pixels and count the hits.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjection_gpu_forward_kernelfunc", ([&] {
		FlowProjection_gpu_forward_kernelfunc<<<grid, block, 0, stream>>>(
			nElement,
			w, h, channel,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			count_b_stride, count_c_stride, count_h_stride, count_w_stride,

			input1.data<scalar_t>(), count.data<scalar_t>(), output.data<scalar_t>()
			);
	}));

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in FlowProjection_gpu_forward_kernelfunc: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Step 2: turn the accumulated sums into per-pixel averages.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjectionAveraging_kernelfunc", ([&] {
		FlowProjectionAveraging_kernelfunc<<<grid, block, 0, stream>>>(
			nElement,
			w, h, channel,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			count_b_stride, count_c_stride, count_h_stride, count_w_stride,

			input1.data<scalar_t>(), count.data<scalar_t>(), output.data<scalar_t>()
			);
	}));

	err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in FlowProjectionAveraging_kernelfunc: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Step 3 (optional): fill pixels that received no projection.
	if (fillhole) {
		AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowFillhole_kernelfunc", ([&] {
			FlowFillhole_kernelfunc<<<grid, block, 0, stream>>>(
				nElement,
				w, h, channel,
				input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
				count_b_stride, count_c_stride, count_h_stride, count_w_stride,

				input1.data<scalar_t>(), count.data<scalar_t>(), output.data<scalar_t>()
				);
		}));

		err = cudaGetLastError();
		if (err != cudaSuccess) {
			printf("gpuerror in FlowFillhole_kernelfunc: %s\n", cudaGetErrorString(err));
			return 1;
		}
	}

	return 0;
}


// Host-side launcher for FlowProjection_gpu_backward_kernelfunc.
//
// Launches the backward kernel on `stream` with one BLOCKDIMX x BLOCKDIMY
// thread block per image tile and one grid z-slice per batch element.
// The kernel is instantiated for the floating-point dtype of `input1`.
//
// Returns 0 when the launch reported no CUDA error, 1 otherwise (the CUDA
// error string is printed to stdout in that case).
int FlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,
		at::Tensor&  count,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1
		)
{

	int error = 1 ;

	// threadIdx.x is scheduled first, then threadIdx.y; all channels are
	// processed inside a single kernel launch.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report the launch configuration when it deviates from the 32x16 default
	// or when DEBUG is enabled.
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG) {
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
	}

	AT_DISPATCH_FLOATING_TYPES(input1.type(), "FlowProjection_gpu_backward_kernelfunc", ([&] {
		FlowProjection_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(
				nElement,
				w,h,channel,
				input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
				count_b_stride,count_c_stride,count_h_stride,count_w_stride,

				input1.data<scalar_t>(),
				count.data<scalar_t>(),
				gradoutput.data<scalar_t>(),
				gradinput1.data<scalar_t>()
				);
	}));

	// Kernel launches are asynchronous; this only catches launch-time errors.
	cudaError_t err = cudaGetLastError();

	if (err != cudaSuccess) {
		// The message names this launcher (it previously named an unrelated layer).
		printf("gpu error in FlowProjection_gpu_backward_kernel: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;

}


================================================
FILE: my_package/FlowProjection/flowprojection_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Host-side launcher for the flow-projection forward pass; the kernels are
// enqueued on `stream`.
//
// A non-zero `fillhole` additionally runs the hole-filling kernel after the
// projection and averaging kernels.
// The *_stride arguments are the element strides of `input1` and `count`.
//
// Returns 0 on success; otherwise an error code is returned after a CUDA
// launch error (presumably non-zero -- confirm against the definition).
int FlowProjection_gpu_forward_kernel(
		cudaStream_t stream, 		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor& input1,
		at::Tensor& count,
		at::Tensor& output

		);

// Host-side launcher for the flow-projection backward pass; the kernel is
// enqueued on `stream` with a grid derived from `w`, `h` and `batch`.
//
// The *_stride arguments are the element strides of `input1` and `count`.
// The kernel receives `gradoutput` and `gradinput1` as raw data pointers.
//
// Returns 0 on success and 1 if the launch reported a CUDA error.
int FlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor& input1,
		at::Tensor& count,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1
		);


================================================
FILE: my_package/FlowProjection/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Describe the single CUDA extension built by this script: the C++ binding
# file plus the CUDA kernel file, compiled with the shared compiler flags.
_flowprojection_ext = CUDAExtension(
    'flowprojection_cuda',
    ['flowprojection_cuda.cc', 'flowprojection_cuda_kernel.cu'],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

# BuildExtension drives the mixed C++/CUDA compilation.
setup(
    name='flowprojection_cuda',
    ext_modules=[_flowprojection_ext],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/Interpolation/InterpolationLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import interpolation_cuda as my_lib

#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py

class InterpolationLayer(Function):
    """Autograd ``Function`` backed by the compiled ``interpolation_cuda`` kernels.

    ``forward(input1, input2)`` returns a tensor shaped like ``input1``;
    ``backward`` returns one gradient per input. Both inputs must be
    contiguous. The GPU entry point additionally requires ``input1`` to have
    3 channels and ``input2`` to have 2 channels with the same batch size and
    spatial size as ``input1``; it reports a non-zero code otherwise.
    """

    def __init__(self):
        super(InterpolationLayer,self).__init__()

    @staticmethod
    def forward(ctx, input1,input2):
        """Run the forward kernel and save both inputs for ``backward``.

        Raises:
            AssertionError: if either input is not contiguous.
            RuntimeError: if the extension reports a non-zero error code.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())

        # Zero-filled result with input1's shape, dtype and device. The kernel
        # is dispatched on input1's dtype, so the output must match it; the
        # previous CPU branch also wrongly allocated an uninitialised CUDA tensor.
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.InterpolationLayer_gpu_forward(input1, input2, output)
        else:
            # NOTE(review): the bindings in interpolation_cuda.cc only register
            # the GPU entry points, so this call presumably raises
            # AttributeError -- confirm whether a CPU build exists.
            err = my_lib.InterpolationLayer_cpu_forward(input1, input2, output)

        # The extension returns non-zero when its shape/stride checks fail; the
        # output would then silently stay all zeros, so fail loudly instead.
        if err != 0:
            raise RuntimeError(
                "InterpolationLayer forward failed with error code {}".format(err))

        ctx.save_for_backward(input1, input2)

        # the function returns the output to its caller
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Compute the gradients with respect to ``input1`` and ``input2``.

        Raises:
            RuntimeError: if the extension reports a non-zero error code.
        """
        input1, input2 = ctx.saved_tensors

        # The kernel indexes gradoutput using input1's strides, so it must be
        # laid out contiguously like input1 (autograd gives no such guarantee).
        gradoutput = gradoutput.contiguous()

        # The kernel accumulates into gradinput1 and skips out-of-range
        # pixels, so both gradient buffers have to start at zero.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())

        if input1.is_cuda:
            err = my_lib.InterpolationLayer_gpu_backward(input1,input2,gradoutput,gradinput1,gradinput2)
        else:
            # NOTE(review): see forward() -- no CPU entry point is registered
            # in the visible bindings.
            err = my_lib.InterpolationLayer_cpu_backward(input1, input2, gradoutput, gradinput1, gradinput2)

        # Previously the code was only printed and possibly-invalid gradients
        # were returned.
        if err != 0:
            raise RuntimeError(
                "InterpolationLayer backward failed with error code {}".format(err))

        return gradinput1, gradinput2

================================================
FILE: my_package/Interpolation/InterpolationModule.py
================================================
# modules/InterpolationLayer.py
from torch.nn import Module
from .InterpolationLayer import InterpolationLayer

class InterpolationModule(Module):
    """Parameter-free ``nn.Module`` that forwards to ``InterpolationLayer``."""

    def __init__(self):
        super().__init__()

    def forward(self, input1, input2):
        """Apply ``InterpolationLayer`` to ``(input1, input2)`` and return its result."""
        result = InterpolationLayer.apply(input1, input2)
        return result

    # No backward method is needed on the module: InterpolationLayer is an autograd Function and already defines backward.


================================================
FILE: my_package/Interpolation/__init__.py
================================================
from  .InterpolationModule import *

================================================
FILE: my_package/Interpolation/interpolation_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "interpolation_cuda_kernel.cuh"


int InterpolationLayer_gpu_forward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  output
		)
{
	// Host entry point of the forward pass: validates the tensor layout and
	// hands the work to the CUDA launcher on the current stream.
	// Returns 1 without launching anything when a check fails; raises through
	// AT_ERROR when the launcher reports a failure; returns 0 on success.
	const int kFailure = 1;

	// input1 must carry exactly 3 channels.
	const int channel = input1.size(1);
	if (channel != 3) { return kFailure; }

	// input2 must share the batch size and have exactly 2 channels.
	const int batch = input1.size(0);
	if (input2.size(0) != batch || input2.size(1) != 2) { return kFailure; }

	// Both inputs must have the same spatial size.
	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(2) != h || input2.size(3) != w) { return kFailure; }

	const int input1_b_stride = input1.stride(0);
	const int input1_c_stride = input1.stride(1);
	const int input1_h_stride = input1.stride(2);
	const int input1_w_stride = input1.stride(3);

	const int input2_b_stride = input2.stride(0);
	const int input2_c_stride = input2.stride(1);
	const int input2_h_stride = input2.stride(2);
	const int input2_w_stride = input2.stride(3);

	// The kernel addresses output with input1's strides, so the batch and
	// channel strides have to agree. The width stride is not checked
	// (open TODO from the original author).
	if (input1_b_stride != output.stride(0) || input1_c_stride != output.stride(1)) {
		return kFailure;
	}

	// Not used by the kernels; kept as a placeholder argument.
	const int nElement = 0;

	const int status = InterpolationLayer_gpu_forward_kernel(
			at::cuda::getCurrentCUDAStream(),
			nElement, w, h, channel, batch,

			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,

			input1,
			input2,
			output);
	if (status) { AT_ERROR("CUDA call failed"); }

	return status;
}


int InterpolationLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	// Host entry point of the backward pass: validates the tensor layout and
	// hands the work to the CUDA launcher on the current stream.
	// Returns 1 without launching anything when a check fails; raises through
	// AT_ERROR when the launcher reports a failure; returns 0 on success.
	const int kFailure = 1;

	// input1 must carry exactly 3 channels.
	const int channel = input1.size(1);
	if (channel != 3) { return kFailure; }

	// input2 must share the batch size and have exactly 2 channels.
	const int batch = input1.size(0);
	if (input2.size(0) != batch || input2.size(1) != 2) { return kFailure; }

	// Both inputs must have the same spatial size.
	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(2) != h || input2.size(3) != w) { return kFailure; }

	const int input1_b_stride = input1.stride(0);
	const int input1_c_stride = input1.stride(1);
	const int input1_h_stride = input1.stride(2);
	const int input1_w_stride = input1.stride(3);

	const int input2_b_stride = input2.stride(0);
	const int input2_c_stride = input2.stride(1);
	const int input2_h_stride = input2.stride(2);
	const int input2_w_stride = input2.stride(3);

	// Each gradient buffer is addressed with the strides of its input, so the
	// batch and channel strides have to agree. The width stride is not checked
	// (open TODO from the original author).
	if (input1_b_stride != gradinput1.stride(0)
			|| input2_b_stride != gradinput2.stride(0)
			|| input1_c_stride != gradinput1.stride(1)
			|| input2_c_stride != gradinput2.stride(1)) {
		return kFailure;
	}

	// Not used by the kernels; kept as a placeholder argument.
	const int nElement = 0;

	const int status = InterpolationLayer_gpu_backward_kernel(
			at::cuda::getCurrentCUDAStream(),
			nElement,
			w, h, channel, batch,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,

			input1,
			input2,
			gradoutput,
			gradinput1,
			gradinput2
			);
	if (status) { AT_ERROR("CUDA call failed"); }

	return status;
}


// Python bindings for this extension. The module name is supplied by the build
// through TORCH_EXTENSION_NAME. Only the two GPU entry points are registered
// here; no CPU variants are exposed.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("InterpolationLayer_gpu_forward", &InterpolationLayer_gpu_forward, "Interpolation forward (CUDA)");
  m.def("InterpolationLayer_gpu_backward", &InterpolationLayer_gpu_backward, "Interpolation backward (CUDA)");
}


================================================
FILE: my_package/Interpolation/interpolation_cuda_kernel.cu
================================================
#include <stdio.h>

#include "interpolation_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


// Function-like min/max helpers. Every use of an argument is parenthesised so
// that operands containing low-precedence operators (e.g. `a & b`) are compared
// as written. Each argument is evaluated twice: do not pass expressions with
// side effects.
#define min(a,b) (((a)<(b))?(a):(b))
#define max(a,b) (((a)>(b))?(a):(b))

// Non-zero makes the launchers print the block configuration on every call.
#define DEBUG (0)
// Thread-block dimensions; both can be overridden from the compiler command line.
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


// Forward kernel: bilinear sampling of input1 at positions displaced by input2.
//
// Each thread handles one output pixel (w_i, h_i) of batch element blockIdx.z.
// It reads the displacement (fx, fy) from channels 0 and 1 of input2 and forms
// the sample position (w_i + fx, h_i + fy). If that position lies inside
// [0, w) x [0, h), every channel of output receives the bilinear blend of the
// four surrounding input1 pixels; otherwise every channel receives 0.
//
// Notes on indexing:
//  * the width offset is added directly (w_i / ix2_*), i.e. a unit stride
//    along the width is assumed; input1_w_stride / input2_w_stride are unused;
//  * output is addressed with input1's strides;
//  * nElement is not used;
//  * intermediate values are held in float regardless of scalar_t.
template <typename scalar_t>
__global__ void InterpolationLayer_gpu_forward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		const scalar_t* __restrict__    input1,
		const scalar_t* __restrict__    input2,
		scalar_t*   output
		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadIdx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	// Pixel coordinates handled by this thread.
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	// The grid is rounded up to whole blocks, so edge threads may fall outside the image.
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	// Offset of this batch element inside input1 (and output).
	const int off = batch_i * input1_b_stride;

	//    __syncthreads();
	// Value written when the sample position falls outside the image.
	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {

		// Displacement stored in channel 0 (x) and channel 1 (y) of input2.
		float fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i  ];
		float fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i  ];

		// Sample position in input1.
		float x2 = (float)(w_i) + fx;
		float y2 = (float)(h_i) + fy;

		if(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){
			// Top-left neighbour (truncation equals floor because x2, y2 >= 0).
			int ix2_L = int(x2);
			int iy2_T = int(y2);
			// Right/bottom neighbours, clamped to the last column/row.
			int ix2_R = min(ix2_L + 1, w - 1);
			int iy2_B = min(iy2_T + 1, h - 1);

			// Fractional parts = bilinear weights along x and y.
			float alpha = x2 - ix2_L;
			float beta = y2 - iy2_T;

			for(int c_i = 0 ; c_i < channel ; c_i ++){
				float TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L];
				float TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R];
				float BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L];
				float BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R];
				output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] =
					(1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR;
			}
		} else{
			//the warping data is out of range, we fill it with zeros
			for(int c_i = 0 ;  c_i < channel; c_i ++){
				output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue;
			}
		}
	}

	return ;

}
 

// Backward kernel of the bilinear sampling layer.
//
// Each thread handles one pixel (w_i, h_i) of batch element blockIdx.z and
// recomputes the same sample position as the forward kernel. When it lies
// inside [0, w) x [0, h):
//  * gradinput1: gradoutput at this pixel, scaled by the four bilinear
//    weights, is added to the four sampled input1 pixels with atomicAdd;
//  * gradinput2: channel 0 receives the derivative with respect to the x
//    displacement and channel 1 the derivative with respect to the y
//    displacement, each summed over all channels. These are plain stores to
//    this thread's own pixel.
// When the position is out of range nothing is written, and gradinput1 is only
// ever accumulated into, so both gradient buffers are expected to be
// zero-initialised by the caller.
//
// As in the forward kernel, a unit width stride is assumed (the *_w_stride
// arguments are unused), gradoutput and gradinput1 are addressed with input1's
// strides, gradinput2 with input2's strides, and nElement is not used.
template <typename scalar_t>
__global__ void InterpolationLayer_gpu_backward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		const scalar_t* __restrict__    input1,
		const scalar_t* __restrict__    input2,
		const scalar_t* __restrict__    gradoutput,
		scalar_t*  gradinput1,
		scalar_t*  gradinput2
		)
{
	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadIdx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	// Pixel coordinates handled by this thread.
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	// The grid is rounded up to whole blocks, so edge threads may fall outside the image.
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	// Offset of this batch element inside input1, gradoutput and gradinput1.
	const int off  = batch_i * input1_b_stride;

	//    __syncthreads();

	if(withinXbounds && withinYbounds){

		// Displacement stored in channel 0 (x) and channel 1 (y) of input2.
		float fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
		float fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];

		// Sample position in input1 (same as in the forward kernel).
		float x2 = float(w_i) + fx;
		float y2 = float(h_i) + fy;

		if(x2 >= 0.0f  && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){
			// Top-left neighbour (truncation equals floor because x2, y2 >= 0).
			int ix2_L = int(x2);
			int iy2_T = int(y2);

			// Right/bottom neighbours, clamped to the last column/row.
			int ix2_R  = min(ix2_L+ 1, w - 1);
			int iy2_B = min(iy2_T + 1, h - 1);

			// Fractional parts = bilinear weights along x and y.
			float alpha = x2 - ix2_L;
			float beta = y2 - iy2_T;

			// Gradient with respect to input1: scatter the incoming gradient
			// to the four sampled pixels. Atomic adds are used for the
			// accumulation (presumably because several threads can sample the
			// same input1 pixel).
			for (int c_i = 0 ; c_i < channel; c_i++){
				float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];

				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));
				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));
				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);
				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);

			}

			// Weight of the top row when differentiating along x.
			float gamma  = iy2_B - y2;

			// Accumulates sum over channels of gradoutput * d(output)/d(fx).
			float bot_diff = 0.0f;
			for(int c_i =0 ; c_i< channel; c_i ++ ){
				float temp = 0;
				// Horizontal difference on the top row ...
				temp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -
						input1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);
				// ... and on the bottom row.
				temp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -
						input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);

				float warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];
				bot_diff += warped_diff_value * temp  ;


			}
			//the gradients of the x direction/ horizontal direction
			gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;

			// Weight of the left column when differentiating along y.
			gamma = ix2_R- x2;
			bot_diff = 0.0f;
			for(int c_i = 0 ; c_i < channel;c_i ++ ){
				float temp = 0.0f;
				// Vertical difference on the left column ...
				temp += gamma    * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -
						input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);

				// ... and on the right column.
				temp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -
						input1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);

				float warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
				bot_diff += warped_diff_value * temp;


			}
			// the gradients of the y direction / vertical direction
			gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;

		}


	}
	return ;

}
// Host-side launcher for InterpolationLayer_gpu_forward_kernelfunc.
//
// Launches the forward kernel on `stream` with one BLOCKDIMX x BLOCKDIMY
// thread block per image tile and one grid z-slice per batch element.
// The kernel is instantiated for the floating-point dtype of `input1`.
//
// Returns 0 when the launch reported no CUDA error, -1 otherwise (the CUDA
// error string is printed to stdout in that case).
int InterpolationLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  output
		)
{
	int error = -1;

	// threadIdx.x is scheduled first, then threadIdx.y; all channels are
	// processed inside a single kernel launch.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report the launch configuration when it deviates from the 32x16 default
	// or when DEBUG is enabled.
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG) {
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
	}

	// The dispatch label appears in the error raised for unsupported dtypes,
	// so it names this launcher (it previously named an unrelated layer).
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationLayer_gpu_forward_kernel", ([&] {
		InterpolationLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(
				nElement,
				w,h,channel,
				input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
				input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,

				input1.data<scalar_t>(),input2.data<scalar_t>(),output.data<scalar_t>()
				);
	}));

	// Kernel launches are asynchronous; this only catches launch-time errors.
	cudaError_t err = cudaGetLastError();

	if (err != cudaSuccess) {
		printf("gpuerror in InterpolationLayer_gpu_forward_kernel: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;

}

// Host-side launcher for InterpolationLayer_gpu_backward_kernelfunc.
//
// Launches the backward kernel on `stream` with one BLOCKDIMX x BLOCKDIMY
// thread block per image tile and one grid z-slice per batch element.
// The kernel is instantiated for the floating-point dtype of `input1`.
//
// Returns 0 when the launch reported no CUDA error, -1 otherwise (the CUDA
// error string is printed to stdout in that case).
int InterpolationLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	int error = -1;

	// threadIdx.x is scheduled first, then threadIdx.y; all channels are
	// processed inside a single kernel launch.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report the launch configuration when it deviates from the 32x16 default
	// or when DEBUG is enabled.
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG) {
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
	}

	// The dispatch label appears in the error raised for unsupported dtypes,
	// so it names this launcher (it previously named an unrelated layer).
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationLayer_gpu_backward_kernel", ([&] {
		InterpolationLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(
				nElement,
				w,h,channel,
				input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
				input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,

				input1.data<scalar_t>(),
				input2.data<scalar_t>(),
				gradoutput.data<scalar_t>(),
				gradinput1.data<scalar_t>(),
				gradinput2.data<scalar_t>()
				);
	}));

	// Kernel launches are asynchronous; this only catches launch-time errors.
	cudaError_t err = cudaGetLastError();

	if (err != cudaSuccess) {
		printf("gpu error in InterpolationLayer_gpu_backward_kernel: %s\n", cudaGetErrorString(err));
		return error;
	}

	error = 0;
	return error;

}


================================================
FILE: my_package/Interpolation/interpolation_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Host-side launcher for the interpolation forward kernel (defined in
// interpolation_cuda_kernel.cu). The kernel is enqueued on `stream` with a
// grid derived from `w`, `h` and `batch`.
//
// The *_stride arguments are the element strides of `input1` and `input2`;
// `output` is addressed with input1's strides. `nElement` is unused.
//
// Returns 0 on success and -1 if the launch reported a CUDA error.
int InterpolationLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& output

		);

// Host-side launcher for the interpolation backward kernel (defined in
// interpolation_cuda_kernel.cu). The kernel is enqueued on `stream` with a
// grid derived from `w`, `h` and `batch`.
//
// The *_stride arguments are the element strides of `input1` and `input2`;
// `gradoutput`/`gradinput1` are addressed with input1's strides and
// `gradinput2` with input2's strides. `nElement` is unused.
//
// Returns 0 on success and -1 if the launch reported a CUDA error.
int InterpolationLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2
		);


================================================
FILE: my_package/Interpolation/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Describe the single CUDA extension built by this script: the C++ binding
# file plus the CUDA kernel file, compiled with the shared compiler flags.
_interpolation_ext = CUDAExtension(
    'interpolation_cuda',
    ['interpolation_cuda.cc', 'interpolation_cuda_kernel.cu'],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

# BuildExtension drives the mixed C++/CUDA compilation.
setup(
    name='interpolation_cuda',
    ext_modules=[_interpolation_ext],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/InterpolationCh/InterpolationChLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import interpolationch_cuda as my_lib

#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py

class InterpolationChLayer(Function):
    """Autograd ``Function`` backed by the compiled ``interpolationch_cuda`` kernels.

    ``forward(input1, input2)`` returns a tensor shaped like ``input1``;
    ``backward`` returns one gradient per input. Both inputs must be
    contiguous. The GPU entry point requires ``input2`` to have 2 channels and
    the same batch size and spatial size as ``input1``; it reports a non-zero
    code otherwise.
    """

    def __init__(self,ch):
        super(InterpolationChLayer,self).__init__()
        # Stored for callers; the static forward/backward do not read it.
        self.ch = ch

    @staticmethod
    def forward(ctx, input1,input2):
        """Run the forward kernel and save both inputs for ``backward``.

        Raises:
            AssertionError: if either input is not contiguous.
            RuntimeError: if the extension reports a non-zero error code.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())

        # Zero-filled result with input1's shape, dtype and device (previously
        # always float32 on the current CUDA device or on the CPU).
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.InterpolationChLayer_gpu_forward(input1, input2, output)
        else:
            # NOTE(review): the bindings in interpolationch_cuda.cc only
            # register the GPU entry points, so this call presumably raises
            # AttributeError -- confirm whether a CPU build exists.
            err = my_lib.InterpolationChLayer_cpu_forward(input1, input2, output)

        # The extension returns non-zero when its shape/stride checks fail; the
        # output would then silently stay all zeros, so fail loudly instead.
        if err != 0:
            raise RuntimeError(
                "InterpolationChLayer forward failed with error code {}".format(err))

        ctx.save_for_backward(input1, input2)
        # the function returns the output to its caller
        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Compute the gradients with respect to ``input1`` and ``input2``.

        Raises:
            RuntimeError: if the extension reports a non-zero error code.
        """
        input1, input2 = ctx.saved_tensors

        # Autograd does not guarantee a contiguous incoming gradient, while the
        # extension works on raw data pointers; normalise the layout first.
        gradoutput = gradoutput.contiguous()

        # Gradient buffers start at zero and match the dtype/device of their inputs.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())

        if input1.is_cuda:
            err = my_lib.InterpolationChLayer_gpu_backward(input1,input2,gradoutput,gradinput1,gradinput2)
        else:
            # NOTE(review): see forward() -- no CPU entry point is registered
            # in the visible bindings.
            err = my_lib.InterpolationChLayer_cpu_backward(input1, input2, gradoutput, gradinput1, gradinput2)

        # Previously the code was only printed and possibly-invalid gradients
        # were returned.
        if err != 0:
            raise RuntimeError(
                "InterpolationChLayer backward failed with error code {}".format(err))

        return gradinput1, gradinput2

================================================
FILE: my_package/InterpolationCh/InterpolationChModule.py
================================================
# modules/InterpolationLayer.py
from torch.nn import Module
from .InterpolationChLayer import InterpolationChLayer

class InterpolationChModule(Module):
    """Parameter-free ``nn.Module`` that forwards to ``InterpolationChLayer``."""

    def __init__(self,ch):
        super().__init__()
        # Kept as an attribute; forward() does not consult it.
        self.ch = ch

    def forward(self, input1, input2):
        """Apply ``InterpolationChLayer`` to ``(input1, input2)`` and return its result."""
        result = InterpolationChLayer.apply(input1, input2)
        return result

    # No backward method is needed on the module: InterpolationChLayer is an autograd Function and already defines backward.


================================================
FILE: my_package/InterpolationCh/__init__.py
================================================
from  .InterpolationChModule import *


================================================
FILE: my_package/InterpolationCh/interpolationch_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "interpolationch_cuda_kernel.cuh"


int InterpolationChLayer_gpu_forward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  output
		)
{
	// Host entry point of the forward pass: validates the tensor layout and
	// hands the work to the CUDA launcher on the current stream.
	// Returns 1 without launching anything when a check fails; raises through
	// AT_ERROR when the launcher reports a failure; returns 0 on success.
	const int kFailure = 1;

	// The channel count of input1 is not restricted here.
	const int channel = input1.size(1);

	// input2 must share the batch size and have exactly 2 channels.
	const int batch = input1.size(0);
	if (input2.size(0) != batch || input2.size(1) != 2) { return kFailure; }

	// Both inputs must have the same spatial size.
	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(2) != h || input2.size(3) != w) { return kFailure; }

	const int input1_b_stride = input1.stride(0);
	const int input1_c_stride = input1.stride(1);
	const int input1_h_stride = input1.stride(2);
	const int input1_w_stride = input1.stride(3);

	const int input2_b_stride = input2.stride(0);
	const int input2_c_stride = input2.stride(1);
	const int input2_h_stride = input2.stride(2);
	const int input2_w_stride = input2.stride(3);

	// output must share input1's batch and channel strides. The width stride
	// is not checked (open TODO from the original author).
	if (input1_b_stride != output.stride(0) || input1_c_stride != output.stride(1)) {
		return kFailure;
	}

	// Placeholder argument, marked as unused by the original author.
	const int nElement = 0;

	const int status = InterpolationChLayer_gpu_forward_kernel(
			at::cuda::getCurrentCUDAStream(),
			nElement, w, h, channel, batch,

			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,

			input1,
			input2,
			output);
	if (status) { AT_ERROR("CUDA call failed"); }

	return status;
}


int InterpolationChLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	// Host entry point of the backward pass: validates the tensor layout and
	// hands the work to the CUDA launcher on the current stream.
	// Returns 1 without launching anything when a check fails; raises through
	// AT_ERROR when the launcher reports a failure; returns 0 on success.
	const int kFailure = 1;

	// The channel count of input1 is not restricted here.
	const int channel = input1.size(1);

	// input2 must share the batch size and have exactly 2 channels.
	const int batch = input1.size(0);
	if (input2.size(0) != batch || input2.size(1) != 2) { return kFailure; }

	// Both inputs must have the same spatial size.
	const int h = input1.size(2);
	const int w = input1.size(3);
	if (input2.size(2) != h || input2.size(3) != w) { return kFailure; }

	const int input1_b_stride = input1.stride(0);
	const int input1_c_stride = input1.stride(1);
	const int input1_h_stride = input1.stride(2);
	const int input1_w_stride = input1.stride(3);

	const int input2_b_stride = input2.stride(0);
	const int input2_c_stride = input2.stride(1);
	const int input2_h_stride = input2.stride(2);
	const int input2_w_stride = input2.stride(3);

	// Each gradient buffer must share the batch and channel strides of its
	// input. The width stride is not checked (open TODO from the original author).
	if (input1_b_stride != gradinput1.stride(0)
			|| input2_b_stride != gradinput2.stride(0)
			|| input1_c_stride != gradinput1.stride(1)
			|| input2_c_stride != gradinput2.stride(1)) {
		return kFailure;
	}

	// Placeholder argument, marked as unused by the original author.
	const int nElement = 0;

	const int status = InterpolationChLayer_gpu_backward_kernel(
			at::cuda::getCurrentCUDAStream(),
			nElement,
			w, h, channel, batch,
			input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
			input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,

			input1,
			input2,
			gradoutput,
			gradinput1,
			gradinput2
			);
	if (status) { AT_ERROR("CUDA call failed"); }

	return status;
}


// Python bindings: expose the two host-side entry points of the
// InterpolationCh layer under their C++ names.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
      "InterpolationChLayer_gpu_forward",
      &InterpolationChLayer_gpu_forward,
      "InterpolationCh forward (CUDA)");
  m.def(
      "InterpolationChLayer_gpu_backward",
      &InterpolationChLayer_gpu_backward,
      "InterpolationCh backward (CUDA)");
}


================================================
FILE: my_package/InterpolationCh/interpolationch_cuda_kernel.cu
================================================
#include <stdio.h>

#include "interpolationch_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


#define min(a,b) ((a<b)?(a):(b))
#define max(a,b) ((a>b)?(a):(b))

#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


// Forward kernel of the InterpolationCh layer.
//
// One thread handles one output pixel (col, row) of one batch item
// (blockIdx.z). It reads the 2-channel offset (fx, fy) from input2, samples
// input1 bilinearly at (col + fx, row + fy) for every channel and stores the
// result in output. When the sample point lies outside [0, w) x [0, h) the
// output pixel is set to zero in every channel.
//
// output is addressed with the strides of input1. The element offset along
// the width is added directly (the *_w_stride arguments and nElement are not
// read), i.e. the indexing treats the width stride as 1.
template <typename scalar_t>
__global__ void InterpolationChLayer_gpu_forward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		const scalar_t* __restrict__ input1,
		const scalar_t* __restrict__ input2,
		scalar_t* output
		)
{
	// blockIdx.x / threadIdx.x walk the width, blockIdx.y / threadIdx.y the
	// height, blockIdx.z the batch; threadIdx.z is not used.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads of the partial tiles at the right/bottom edge have nothing to do.
	if (col >= w || row >= h) {
		return;
	}

	const int batch_i = blockIdx.z;
	const int image_base = batch_i * input1_b_stride;
	const int flow_pixel = batch_i * input2_b_stride + row * input2_h_stride + col;
	const int dst_pixel = image_base + row * input1_h_stride + col;

	const float fx = input2[flow_pixel + 0 * input2_c_stride];
	const float fy = input2[flow_pixel + 1 * input2_c_stride];

	// Sampling position in input1.
	const float x2 = (float)(col) + fx;
	const float y2 = (float)(row) + fy;

	const bool sample_inside = x2 >= 0.0f && y2 >= 0.0f && x2 < (float)w && y2 < (float)h;
	if (!sample_inside) {
		// The warped position is out of range: fill with zeros.
		for (int c = 0; c < channel; ++c) {
			output[dst_pixel + c * input1_c_stride] = 0.0f;
		}
		return;
	}

	// Four neighbours of the sample point; the right/bottom ones are clamped
	// to the last column/row.
	const int left = int(x2);
	const int top = int(y2);
	const int right = min(left + 1, w - 1);
	const int bottom = min(top + 1, h - 1);

	// Fractional position inside the cell.
	const float alpha = x2 - left;
	const float beta = y2 - top;

	for (int c = 0; c < channel; ++c) {
		const int plane = image_base + c * input1_c_stride;
		const float TL = input1[plane + top * input1_h_stride + left];
		const float TR = input1[plane + top * input1_h_stride + right];
		const float BL = input1[plane + bottom * input1_h_stride + left];
		const float BR = input1[plane + bottom * input1_h_stride + right];
		output[plane + row * input1_h_stride + col] =
			(1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR;
	}
}

// Backward kernel of the InterpolationCh layer.
//
// One thread handles one output pixel (w_i, h_i) of one batch item
// (blockIdx.z). It recomputes the bilinear sample position used by the forward
// kernel and
//   * scatters gradoutput into the four sampled pixels of gradinput1, weighted
//     by the bilinear coefficients (accumulated with atomicAdd);
//   * writes the gradient w.r.t. the two offset channels into gradinput2.
// Nothing is written when the sample point lies outside [0, w) x [0, h), so
// those entries keep whatever value the caller initialised them with.
//
// gradoutput and gradinput1 are addressed with the strides of input1,
// gradinput2 with the strides of input2. The element offset along the width is
// added directly; nElement and the *_w_stride arguments are not read.
template <typename scalar_t>
__global__ void InterpolationChLayer_gpu_backward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		const scalar_t* __restrict__  input1,
		const scalar_t* __restrict__  input2,
		const scalar_t* __restrict__  gradoutput,
		scalar_t*  gradinput1,
		scalar_t*  gradinput2
		)
{
	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	// Threads of the partial tiles at the right/bottom edge fall outside the image.
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	// Offset of this batch item, in elements, for every tensor laid out like input1.
	const int off  = batch_i * input1_b_stride;

	//    __syncthreads();

	if(withinXbounds && withinYbounds){

		// Offset (fx, fy) of this pixel: channels 0 and 1 of input2.
		float fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];
		float fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];

		// Position that was sampled in input1.
		float x2 = float(w_i) + fx;
		float y2 = float(h_i) + fy;

		if(x2 >= 0.0f  && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){
			// Integer corners of the sampled cell; right/bottom are clamped to the last column/row.
			int ix2_L = int(x2);
			int iy2_T = int(y2);

			int ix2_R  = min(ix2_L+ 1, w - 1);
			int iy2_B = min(iy2_T + 1, h - 1);

			// Fractional position inside the cell.
			float alpha = x2 - ix2_L;
			float beta = y2 - iy2_T;

			// Gradient w.r.t. input1: each corner receives gradoutput times its
			// bilinear weight. atomicAdd is used because several threads may
			// sample (and therefore update) the same input pixel.
			for (int c_i = 0 ; c_i < channel; c_i++){
				float gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];

				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));
				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));
				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);
				atomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);

			}

			// Weight of the top row when blending the horizontal differences.
			float gamma  = iy2_B - y2;

			// Gradient w.r.t. fx: horizontal differences (right - left) of the
			// top and bottom rows, blended by gamma, times gradoutput, summed
			// over the channels.
			float bot_diff = 0.0f;
			for(int c_i =0 ; c_i< channel; c_i ++ ){
				float temp = 0;
				temp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -
						input1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);
				temp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -
						input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);

				float warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];
				bot_diff += warped_diff_value * temp  ;


			}
			//the gradients of the x direction/ horizontal direction
			gradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;

			// Gradient w.r.t. fy: vertical differences (bottom - top) of the
			// left and right columns, blended by the weight of the left column.
			gamma = ix2_R- x2;
			bot_diff = 0.0f;
			for(int c_i = 0 ; c_i < channel;c_i ++ ){
				float temp = 0.0f;
				temp += gamma    * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -
						input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);

				temp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -
						input1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);

				float warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];
				bot_diff += warped_diff_value * temp;


			}
			//the gradients of the y direction/ vertical direction
			gradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;

		}


	}
	return ;

}


// Launches the InterpolationCh forward kernel on `stream`.
//
// The launch uses BLOCKDIMX x BLOCKDIMY threads per block and enough blocks to
// tile a w x h image; the grid's z dimension enumerates the batch. The kernel
// is instantiated for the floating-point scalar type of input1.
//
// Returns 0 when cudaGetLastError() reports success after the launch and 1
// otherwise (the CUDA error string is printed to stdout in that case).
int InterpolationChLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  output
		)
{
	// threadIdx.x is scheduled first, then threadIdx.y; all channels of a
	// pixel are processed by the same thread.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report non-default tile sizes (or always, when DEBUG is enabled).
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG) {
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
	}

	// Hand the raw device pointers of the tensors to the kernel.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationChLayer_gpu_forward_kernelfunc", ([&] {
		InterpolationChLayer_gpu_forward_kernelfunc<<<grid, block, 0, stream>>>(
				nElement,
				w, h, channel,
				input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
				input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
				input1.data<scalar_t>(), input2.data<scalar_t>(), output.data<scalar_t>());
	}));

	const cudaError_t launch_status = cudaGetLastError();
	if (launch_status != cudaSuccess) {
		printf("gpuerror in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(launch_status));
		return 1;
	}

	return 0;
}

// Launches the InterpolationCh backward kernel on `stream`.
//
// The launch uses BLOCKDIMX x BLOCKDIMY threads per block and enough blocks to
// tile a w x h image; the grid's z dimension enumerates the batch. The kernel
// is instantiated for the floating-point scalar type of input1.
//
// Returns 0 when cudaGetLastError() reports success after the launch and 1
// otherwise (the CUDA error string is printed to stdout in that case).
int InterpolationChLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	// threadIdx.x is scheduled first, then threadIdx.y; all channels of a
	// pixel are processed by the same thread.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report non-default tile sizes (or always, when DEBUG is enabled).
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG) {
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);
	}

	// Hand the raw device pointers of the tensors to the kernel.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "InterpolationChLayer_gpu_backward_kernelfunc", ([&] {
		InterpolationChLayer_gpu_backward_kernelfunc<<<grid, block, 0, stream>>>(
				nElement,
				w, h, channel,
				input1_b_stride, input1_c_stride, input1_h_stride, input1_w_stride,
				input2_b_stride, input2_c_stride, input2_h_stride, input2_w_stride,
				input1.data<scalar_t>(),
				input2.data<scalar_t>(),
				gradoutput.data<scalar_t>(),
				gradinput1.data<scalar_t>(),
				gradinput2.data<scalar_t>());
	}));

	const cudaError_t launch_status = cudaGetLastError();
	if (launch_status != cudaSuccess) {
		printf("gpu error in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(launch_status));
		return 1;
	}

	return 0;
}


================================================
FILE: my_package/InterpolationCh/interpolationch_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>


// Launches the InterpolationCh forward CUDA kernel on `stream`.
//
//   nElement          element count; the caller in interpolationch_cuda.cc passes 0
//   w, h              width and height of the tensors
//   channel, batch    channel count of input1 and batch size
//   input1_*_stride   strides of input1 (batch, channel, height, width)
//   input2_*_stride   strides of input2 (batch, channel, height, width)
//   input1, input2    inputs read by the kernel
//   output            tensor the kernel writes
//
// Returns 0 on success and a non-zero value when the launch failed.
int InterpolationChLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& output

		);
 
// Launches the InterpolationCh backward CUDA kernel on `stream`.
//
//   nElement          element count; the caller in interpolationch_cuda.cc passes 0
//   w, h              width and height of the tensors
//   channel, batch    channel count of input1 and batch size
//   input1_*_stride   strides of input1 (batch, channel, height, width)
//   input2_*_stride   strides of input2 (batch, channel, height, width)
//   input1, input2    inputs of the forward pass
//   gradoutput        gradient w.r.t. the forward output
//   gradinput1/2      tensors receiving the gradients w.r.t. input1 / input2
//
// Returns 0 on success and a non-zero value when the launch failed.
int InterpolationChLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,

		at::Tensor& input1,
		at::Tensor& input2,
		at::Tensor& gradoutput,
		at::Tensor& gradinput1,
		at::Tensor& gradinput2
		);


================================================
FILE: my_package/InterpolationCh/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Build description of the `interpolationch_cuda` extension: the C++ binding
# file plus the CUDA kernel file, compiled with the project-wide flags imported
# from `compiler_args`.
interpolationch_extension = CUDAExtension(
    name='interpolationch_cuda',
    sources=[
        'interpolationch_cuda.cc',
        'interpolationch_cuda_kernel.cu',
    ],
    extra_compile_args=dict(cxx=cxx_args, nvcc=nvcc_args),
)

setup(
    name='interpolationch_cuda',
    ext_modules=[interpolationch_extension],
    cmdclass=dict(build_ext=BuildExtension),
)


================================================
FILE: my_package/MinDepthFlowProjection/__init__.py
================================================
from  .minDepthFlowProjectionModule import *


================================================
FILE: my_package/MinDepthFlowProjection/minDepthFlowProjectionLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
#import _ext.my_lib as my_lib
import mindepthflowprojection_cuda as my_lib

class minDepthFlowProjectionLayer(Function):
    """Autograd function wrapping the ``mindepthflowprojection_cuda`` kernels.

    ``forward(input1, input2, requires_grad)`` projects ``input1`` with the
    help of ``input2`` and returns a tensor shaped like ``input1``. The CUDA
    wrapper requires ``input1`` to have 2 channels and ``input2`` to have 1
    channel. Both inputs must be contiguous.

    All work buffers are allocated from the inputs, so they live on the same
    device and use the same dtype as the tensors handed to the kernel (the
    kernels use one scalar type for inputs, ``count`` and ``output``).

    NOTE: the extension module shown in this package only binds the
    ``*_gpu_forward`` / ``*_gpu_backward`` functions; the CPU branches below
    call ``*_cpu_*`` functions and will fail with ``AttributeError`` unless
    the loaded library provides them.
    """

    def __init__(self,requires_grad):
        # Kept for backward compatibility only: the function is used through
        # ``minDepthFlowProjectionLayer.apply`` and holds no instance state.
        super(minDepthFlowProjectionLayer,self).__init__()

    @staticmethod
    def forward(ctx, input1, input2, requires_grad):
        """Run the projection.

        Args:
            ctx: autograd context; stores the tensors needed by ``backward``.
            input1: contiguous tensor that is projected (2 channels on CUDA).
            input2: contiguous single-channel tensor used to pick the winner
                when several source pixels land on the same target pixel.
            requires_grad: when ``False`` the kernel is asked to fill holes
                (``fillhole = 1``); any other value disables hole filling.

        Returns:
            The projected tensor, same shape, dtype and device as ``input1``.
        """
        assert(input1.is_contiguous())
        assert(input2.is_contiguous())
        fillhole = 1 if requires_grad == False else 0

        # Allocate on input1's device with input1's dtype instead of a
        # hard-coded (cuda.)FloatTensor on the current device.
        count = input1.new_zeros((input1.size(0), 1, input1.size(2), input1.size(3)))
        output = input1.new_zeros(input1.size())

        if input1.is_cuda:
            err = my_lib.minDepthFlowProjectionLayer_gpu_forward(input1, input2, count, output, fillhole)
        else:
            err = my_lib.minDepthFlowProjectionLayer_cpu_forward(input1, input2, count, output, fillhole)
        if err != 0:
            # Non-zero means the native wrapper rejected the shapes/strides.
            print(err)

        # count and output are needed again by the backward kernel.
        ctx.save_for_backward(input1, input2, count, output)
        ctx.fillhole = fillhole

        return output

    @staticmethod
    def backward(ctx, gradoutput):
        """Compute the gradients w.r.t. ``input1`` and ``input2``.

        Returns:
            ``(gradinput1, gradinput2, None)``; the third entry corresponds
            to the non-tensor ``requires_grad`` argument of ``forward``.
        """
        input1, input2, count, output = ctx.saved_tensors

        # The native wrapper passes no strides for gradoutput to the kernel,
        # so hand it a densely laid-out tensor (no copy when already dense).
        gradoutput = gradoutput.contiguous()

        # Zero-initialised gradient buffers on the inputs' device/dtype.
        gradinput1 = input1.new_zeros(input1.size())
        gradinput2 = input2.new_zeros(input2.size())

        if input1.is_cuda:
            err = my_lib.minDepthFlowProjectionLayer_gpu_backward(input1, input2,
                                                                  count, output,
                                                                  gradoutput, gradinput1, gradinput2)
        else:
            err = my_lib.minDepthFlowProjectionLayer_cpu_backward(input1, input2,
                                                                  count, output,
                                                                  gradoutput, gradinput1, gradinput2)
        if err != 0:
            # Non-zero means the native wrapper rejected the shapes/strides.
            print(err)

        return gradinput1,gradinput2,None


================================================
FILE: my_package/MinDepthFlowProjection/minDepthFlowProjectionModule.py
================================================
# modules/FlowProjectionModule.py
from torch.nn.modules.module import Module
from .minDepthFlowProjectionLayer import minDepthFlowProjectionLayer #, FlowFillholeLayer

__all__ =['minDepthFlowProjectionModule']

class minDepthFlowProjectionModule(Module):
    """``nn.Module`` front-end for :class:`minDepthFlowProjectionLayer`.

    The module has no parameters; it only remembers the ``requires_grad``
    flag and forwards it to the autograd function on every call.
    """

    def __init__(self, requires_grad = True):
        super().__init__()
        # Passed through unchanged to the layer's forward().
        self.requires_grad = requires_grad

    def forward(self, input1, input2):
        """Apply the projection layer to ``input1`` and ``input2``."""
        flag = self.requires_grad
        return minDepthFlowProjectionLayer.apply(input1, input2, flag)

# class FlowFillholeModule(Module):
#     def __init__(self,hole_value = -10000.0):
#         super(FlowFillholeModule, self).__init__()
#         self.f = FlowFillholeLayer()
#
#     def forward(self, input1):
#         return self.f(input1)

    # We do not need to write backward code for a module: autograd calls the backward() defined on minDepthFlowProjectionLayer.


================================================
FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda.cc
================================================
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "mindepthflowprojection_cuda_kernel.cuh"


// Host-side entry point for the forward pass of the minDepthFlowProjection layer.
//
//   input1    2-channel tensor (batch, 2, h, w) that is projected
//   input2    1-channel tensor (batch, 1, h, w) read per source pixel
//   count     1-channel work buffer (batch, 1, h, w) updated by the kernel
//   output    result buffer with the same shape as input1
//   fillhole  forwarded unchanged to the kernel launcher
//
// The kernel indexes input2 and count with the batch/height/width extents of
// input1 and writes output with input1's strides, so all of those shapes are
// validated here before anything is launched.
//
// Returns 1 (without launching) when a shape or stride check fails. Raises
// through AT_ERROR when the kernel launcher reports a failure; otherwise
// returns the launcher's status.
int minDepthFlowProjectionLayer_gpu_forward(
		at::Tensor&  input1,
        at::Tensor&  input2,
        at::Tensor&  count,
		at::Tensor&  output,
		int fillhole
		)
{

	int error = 1 ;

	int channel = input1.size( 1);
	if(channel!= 2) return error;
	int batch = input1.size(0);

	int h = input1.size(2);
	int w = input1.size(3);

    if(input2.size(1) !=1 ) return error;

	// input2 is read at every (batch, h, w) position of input1.
	if(input2.size(0) != batch) return error;
	if(input2.size(2) != h) return error;
	if(input2.size(3) != w) return error;

	// count is read and written at projected positions inside the h x w grid;
	// these are the same checks the backward wrapper performs.
	if(count.size(0) != batch) return error;
	if(count.size(1) != 1) return error;
	if(count.size(2) != h) return error;
	if(count.size(3) != w) return error;

	// output is written with input1's offsets, so it must cover the same extents.
	for(int d = 0; d < 4; d++){
		if(output.size(d) != input1.size(d)) return error;
	}

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int count_b_stride = count.stride(0);
	int count_c_stride = count.stride(1);
	int count_h_stride = count.stride(2);
	int count_w_stride = count.stride(3);
	//TODO: do we need to assert the w_stride to be 1
	//if(w_stride !=1) return error;
	// output shares input1's strides inside the kernel.
	if(input1_b_stride != output.stride(0)) return error;
	if(input1_c_stride != output.stride(1)) return error;

	int	nElement = 0;//UNUSED  THCudaTensor_nElement(state, output);
	error = minDepthFlowProjection_gpu_forward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
           at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,w,h,channel,batch,fillhole,

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1,
			input2,
			count,
			output);
	  if (error) {AT_ERROR("CUDA call failed");}

	return error;

}

// Host-side entry point for the backward pass of the minDepthFlowProjection layer.
//
//   input1       2-channel tensor (batch, 2, h, w) of the forward pass
//   input2       1-channel tensor (batch, 1, h, w) of the forward pass
//   count        1-channel buffer (batch, 1, h, w) produced by the forward pass
//   output       forward result
//   gradoutput   gradient w.r.t. the forward result
//   gradinput1   receives the gradient w.r.t. input1
//   gradinput2   receives the gradient w.r.t. input2
//
// The kernel reads input2 at every (batch, h, w) position of input1, so the
// full shape of input2 is validated here, not only its channel count.
//
// Returns 1 (without launching) when a shape or stride check fails. Raises
// through AT_ERROR when the kernel launcher reports a failure; otherwise
// returns the launcher's status.
int minDepthFlowProjectionLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
        at::Tensor&  count,
		at::Tensor&  output,
        at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	int error = 1 ;
	int channel = input1.size( 1);
	if(channel!=2) return error;
	int batch = input1.size(0);
	if(count.size( 0) != batch) return error;
	if(count.size(1) != 1) return error;

	int h = input1.size(2);
	int w = input1.size(3);
    if(input2.size(1) !=1 ) return error;
    if(count.size(2) != h) return error;// to add some checkpoint
	if(count.size(3) != w) return error;

	// input2 must cover the same batch and spatial grid as input1.
	if(input2.size(0) != batch) return error;
	if(input2.size(2) != h) return error;
	if(input2.size(3) != w) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int count_b_stride = count.stride(0);
	int count_c_stride = count.stride(1);
	int count_h_stride = count.stride(2);
	int count_w_stride = count.stride(3);
	//TODO: do we need to assert the w_stride to be 1
	//if(w_stride !=1) return error;
	// gradinput1 must share input1's batch and channel strides.
	if(input1_b_stride != gradinput1.stride(0)) return error;
	if(input1_c_stride != gradinput1.stride(1)) return error;

	int	nElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);

	error  = minDepthFlowProjection_gpu_backward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
           at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement, // unused by the kernel
			w,h,channel,batch,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
            count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1,
            input2,
            count,
            output,
			gradoutput,
			gradinput1,
			gradinput2
			);
	  if (error) {AT_ERROR("CUDA call failed");}

	return error;

}


// Python bindings: expose the two host-side entry points of the
// minDepthFlowProjection layer under their C++ names.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
      "minDepthFlowProjectionLayer_gpu_forward",
      &minDepthFlowProjectionLayer_gpu_forward,
      "minDepthFlowProjection forward (CUDA)");
  m.def(
      "minDepthFlowProjectionLayer_gpu_backward",
      &minDepthFlowProjectionLayer_gpu_backward,
      "minDepthFlowProjection backward (CUDA)");
}


================================================
FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cu
================================================
#include <stdio.h>

#include "mindepthflowprojection_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


#define min(a,b) ((a<b)?(a):(b))
#define max(a,b) ((a>b)?(a):(b))

#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


// Forward kernel of the minDepthFlowProjection layer.
//
// One thread handles one source pixel (w_i, h_i) of one batch item
// (blockIdx.z). It reads the 2-channel vector (fx, fy) from input1, computes
// the target position (w_i + fx, h_i + fy) and, when that position lies
// inside [0, w-1] x [0, h-1], competes for the target's top-left integer
// pixel: if the value read from input2 at the source pixel is larger than the
// value currently stored in count at the target, the thread stores (-fx, -fy)
// in output and its input2 value in count.
//
// output is addressed with the strides of input1. The element offset along
// the width is added directly; nElement, channel and the *_w_stride /
// *_c_stride arguments of input2 and count are not read.
//
// NOTE(review): the compare-and-store on count/output is not atomic, so when
// several source pixels project onto the same target the stored vector may
// depend on thread scheduling -- confirm whether this is acceptable.
template <typename scalar_t>
__global__ void minDepthFlowProjection_gpu_forward_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t* count,
		scalar_t* output
		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadidx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//pixel coordinates of this thread inside the image
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	// Threads of the partial tiles at the right/bottom edge fall outside the image.
	const bool withinXbounds = w_i < w;
	const bool withinYbounds = h_i < h;

	const int batch_i = blockIdx.z;
	// Offset of this batch item, in elements, for input1 and output.
	const int off = batch_i * input1_b_stride;

	//    __syncthreads();
//	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {
        // Vector stored at the source pixel: channels 0 and 1 of input1.
        float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ];
        float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ];

        // Target position the source pixel projects to.
        float x2 = (float) (w_i) + fx;
        float y2 = (float) (h_i) + fy;
        if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){
            // Integer corners of the target cell; only the top-left one is
            // used below (the other three updates are commented out).
            int ix2_L = (int) (x2);
            int iy2_T = (int) (y2);
            int ix2_R = min(ix2_L + 1, w - 1);
            int iy2_B = min(iy2_T + 1, h - 1);

            // Priority of this source pixel, read from input2.
            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];
            float old_exist = 0;

            //while(1){
            // Best priority recorded so far for the target pixel.
            old_exist = count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L];
            if(temp > old_exist){
                output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] = -  fx; //update the new vector
                output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]  = -  fy;
                count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] =   temp; // update to the best weight
                //if ( count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] == temp){
                //break;
                //}
            }
            //}

           // old_exist = count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ;
           // if(temp > old_exist){
            //    output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]= - fx;
            //    output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] = - fy;
            //    count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]= temp ;
           // }

           // old_exist = count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L];
           // if(temp > old_exist){
            //    output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] = - fx;
           //     output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]  = - fy;
           //     count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]  = temp;
           // }

           // old_exist = count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R];
           // if(temp> old_exist){
            //    output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] = - fx;
            //    output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]  = - fy;
            //    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] = temp;
           // }
        }
	}
	return ;

}

// Hole-filling kernel of the minDepthFlowProjection layer.
//
// One thread handles one pixel (col, row) of one batch item (blockIdx.z).
// A pixel whose count entry is <= 0 is treated as a hole. For a hole the
// thread walks left, right, up and down until it meets a non-zero count entry
// (or the image border) and replaces both channels of output at the hole with
// the unweighted mean of the output vectors found in the directions whose
// count entry is positive. A hole with no such neighbour is left untouched.
//
// output is addressed with the strides of input1. The element offset along
// the width is added directly; input1, input2, nElement, channel and the
// input2 / *_w_stride / *_c_stride (count) arguments are not read.
template <typename scalar_t>
__global__ void minDepthFlowFillhole_kernelfunc(
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t*  count,
		scalar_t* output
		)
{
	// blockIdx.x / threadIdx.x walk the width, blockIdx.y / threadIdx.y the
	// height, blockIdx.z the batch; threadIdx.z is not used.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads of the partial tiles at the right/bottom edge have nothing to do.
	if (col >= w || row >= h) {
		return;
	}

	const int batch_i = blockIdx.z;
	const int out_base = batch_i * input1_b_stride;
	const int cnt_base = batch_i * count_b_stride + 0;
	const int cnt_row = cnt_base + row * count_h_stride;

	// Only pixels that received no projection are holes.
	const float own_count = count[cnt_row + col];
	if (!(own_count <= 0.0f)) {
		return;
	}

	// Search along the four directions (0/90/180/270 degrees) until a
	// non-zero count entry is met or the border is reached.
	int left_col = col;
	float left_w = 0.0f;
	while (left_w == 0.0f && left_col - 1 >= 0) {
		left_col = left_col - 1;
		left_w = count[cnt_row + left_col];
	}

	int right_col = col;
	float right_w = 0.0f;
	while (right_w == 0.0f && right_col + 1 <= w - 1) {
		right_col = right_col + 1;
		right_w = count[cnt_row + right_col];
	}

	int up_row = row;
	float up_w = 0.0f;
	while (up_w == 0.0f && up_row - 1 >= 0) {
		up_row = up_row - 1;
		up_w = count[cnt_base + up_row * count_h_stride + col];
	}

	int down_row = row;
	float down_w = 0.0f;
	while (down_w == 0.0f && down_row + 1 <= h - 1) {
		down_row = down_row + 1;
		down_w = count[cnt_base + down_row * count_h_stride + col];
	}

	// No usable neighbour in any direction: the hole cannot be filled.
	if (left_w + right_w + up_w + down_w <= 0.0f) {
		return;
	}

	// Turn the found count values into 0/1 indicators so that every usable
	// neighbour contributes equally.
	left_w = (left_w > 0.0f) ? 1 : 0;
	right_w = (right_w > 0.0f) ? 1 : 0;
	up_w = (up_w > 0.0f) ? 1 : 0;
	down_w = (down_w > 0.0f) ? 1 : 0;

	const float n_neighbours = left_w + right_w + up_w + down_w;

	// Average the neighbours' vectors, channel 0 first, then channel 1.
	for (int c = 0; c < 2; ++c) {
		const int plane = out_base + c * input1_c_stride;
		output[plane + row * input1_h_stride + col] = (
			left_w *  output[plane + row * input1_h_stride + left_col] +
			right_w *  output[plane + row * input1_h_stride + right_col] +
			up_w *  output[plane + up_row * input1_h_stride + col] +
			down_w *  output[plane + down_row * input1_h_stride + col]
		) / (
			n_neighbours
		);
	}
}

// Backward kernel of the min-depth flow projection layer.
//
// One thread handles one source pixel (w_i, h_i) of batch item blockIdx.z.
// The pixel's flow (input1 channels 0/1) is added to its coordinates to get a
// target position; for each of the four integer cells around that target the
// thread compares its own input2 value with count[] at that cell and, when
// they are exactly equal, adds the negated gradoutput of that cell (both flow
// channels) to gradinput1 at the source pixel.
//
// NOTE(review): count presumably holds the per-cell winning input2 value
// written by the forward pass -- confirm against the forward kernel.
//
// Each thread only writes gradinput1 at its own (h_i, w_i), so plain `+=` is
// used. `output` and `gradinput2` are not touched by this kernel (a disabled
// derivation for gradinput2 used to live here as commented-out code).
// nElement, channel and the *_c_stride / *_w_stride values of input2 and count
// are accepted but unused; the innermost (width) stride is taken to be 1.
template <typename scalar_t>
__global__ void minDepthFlowProjection_gpu_backward_kernelfunc(
		const int nElement,  	const int w, 	const int h, const int channel,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		const scalar_t* __restrict__  input1,	const scalar_t* __restrict__  input2,
		scalar_t*  count,
		scalar_t* output,
		const scalar_t* __restrict__  gradoutput,
		scalar_t*  gradinput1,
		scalar_t*  gradinput2
		)
{
	// blockIdx.z           : batch index
	// blockIdx.x / .y      : tile index along width / height
	// threadIdx.x / .y     : position inside the tile; threadIdx.z is unused
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const int batch_i = blockIdx.z;

	// Threads of the last partial tiles fall outside the image.
	if (w_i >= w || h_i >= h) {
		return;
	}

	const int off = batch_i * input1_b_stride;
	const int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;
	const int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;

	float fx = input1[iu_offset];
	float fy = input1[iv_offset];

	float x2 = (float) ( w_i ) + fx;
	float y2 = (float) ( h_i ) + fy;

	// Written as a positive test so that NaN coordinates are rejected too.
	const bool lands_inside =
		x2 >= 0.0f && y2 >= 0.0f && x2 <= (float) (w - 1) && y2 <= (float) (h - 1);
	if (!lands_inside) {
		return;
	}

	const int ix2_L = (int)(x2);
	const int iy2_T = (int)(y2);
	const int ix2_R = min(ix2_L + 1, w-1);
	const int iy2_B = min(iy2_T + 1, h-1);

	const float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];

	// Corner visiting order: top-left, top-right, bottom-left, bottom-right.
	// At the right/bottom border two corners coincide and are both processed.
	const int corner_x[4] = { ix2_L, ix2_R, ix2_L, ix2_R };
	const int corner_y[4] = { iy2_T, iy2_T, iy2_B, iy2_B };

	#pragma unroll
	for (int k = 0; k < 4; ++k) {
		const int cx = corner_x[k];
		const int cy = corner_y[k];
		if (temp == count[batch_i * count_b_stride + 0 + cy * count_h_stride + cx]) {
			gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + cy * input1_h_stride + cx];
			gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + cy * input1_h_stride + cx];
		}
	}
	return ;

}


// Host-side launcher for the forward pass of the min-depth flow projection.
//
// Launches minDepthFlowProjection_gpu_forward_kernelfunc on `stream` with one
// thread per pixel (BLOCKDIMX x BLOCKDIMY threads per block, one grid z-slice
// per batch item) and, when `fillhole` is non-zero, follows it with
// minDepthFlowFillhole_kernelfunc on the same grid.
//
// Parameters:
//   stream            CUDA stream both kernels are enqueued on.
//   nElement, channel forwarded to the kernels unchanged.
//   w, h, batch       image width/height and batch size; they define the grid.
//   fillhole          non-zero to run the hole-filling kernel as well.
//   *_stride          element strides of input1 / input2 / count.
//   input1, input2, count, output
//                     tensors whose raw data pointers are handed to the kernels;
//                     the scalar type is dispatched on input1 (float / double).
//
// Returns 0 on success, -1 when cudaGetLastError() reports a failure after
// either launch (the error string is printed to stdout).
int minDepthFlowProjection_gpu_forward_kernel(
		cudaStream_t stream, 		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,	at::Tensor&  input2,
		at::Tensor&  count,
		at::Tensor&  output
		)
{
	const int error = -1;

	// threadIdx.x is scheduled first, then threadIdx.y; all channels of a pixel
	// are handled by the same thread.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowProjection_gpu_forward", ([&] {
		minDepthFlowProjection_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(
			nElement,
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
			);
	}));

	// A single check is enough: cudaGetLastError() resets the error state, so
	// the second back-to-back check the code used to do was redundant.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in minDepthFlowProjection forward: %s\n", cudaGetErrorString(err));
		return error;
	}

	if(fillhole){
		AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowFillhole", ([&] {
			minDepthFlowFillhole_kernelfunc<<<grid,block,0,stream>>>(
				nElement,
				w,h,channel,
				input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
				input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
				count_b_stride,count_c_stride,count_h_stride,count_w_stride,

				input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()
				);
		}));

		err = cudaGetLastError();
		if (err != cudaSuccess) {
			printf("gpuerror in minDepthFlowFillhole: %s\n", cudaGetErrorString(err));
			return error;
		}
	}

	return 0;

}

// Host-side launcher for the backward pass of the min-depth flow projection.
//
// Launches minDepthFlowProjection_gpu_backward_kernelfunc on `stream` with one
// thread per pixel (BLOCKDIMX x BLOCKDIMY threads per block, one grid z-slice
// per batch item).
//
// Parameters:
//   stream            CUDA stream the kernel is enqueued on.
//   nElement, channel forwarded to the kernel unchanged.
//   w, h, batch       image width/height and batch size; they define the grid.
//   *_stride          element strides of input1 / input2 / count.
//   input1, input2, count, output, gradoutput, gradinput1, gradinput2
//                     tensors whose raw data pointers are handed to the kernel;
//                     the scalar type is dispatched on input1 (float / double).
//
// Returns 0 on success, -1 when cudaGetLastError() reports a failure after the
// launch (the error string is printed to stdout).
int minDepthFlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,		at::Tensor&  input2,
		at::Tensor&  count,        at::Tensor&  output,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		)
{
	const int error = -1;

	// threadIdx.x is scheduled first, then threadIdx.y; all channels of a pixel
	// are handled by the same thread.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((w + BLOCKDIMX - 1) / BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	AT_DISPATCH_FLOATING_TYPES(input1.type(), "minDepthFlowProjection_gpu_backward", ([&] {
		minDepthFlowProjection_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(
			nElement,
			w,h,channel,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			count_b_stride,count_c_stride,count_h_stride,count_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>(),
			gradoutput.data<scalar_t>(), gradinput1.data<scalar_t>(), gradinput2.data<scalar_t>()
			);
	}));

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		// The message used to name an unrelated layer (BilinearSampler).
		printf("gpu error in minDepthFlowProjection backward: %s\n", cudaGetErrorString(err));
		return error;
	}

	return 0;

}

================================================
FILE: my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Launches the forward min-depth flow projection kernel on `stream` and, when
// `fillhole` is non-zero, the hole-filling kernel afterwards.
//   w, h, batch : image width / height and batch size (define the launch grid)
//   *_stride    : element strides of input1, input2 and count
//   input1, input2, count, output : tensors passed to the kernels by data pointer
// Returns 0 on success and -1 if a CUDA error is reported after a launch.
int minDepthFlowProjection_gpu_forward_kernel(
		cudaStream_t stream, 		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int fillhole,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,		at::Tensor&  input2,
		at::Tensor&  count,
		at::Tensor&  output

		);

// Launches the backward min-depth flow projection kernel on `stream`.
//   w, h, batch : image width / height and batch size (define the launch grid)
//   *_stride    : element strides of input1, input2 and count
//   input1 ... gradinput2 : tensors passed to the kernel by data pointer
// Returns 0 on success and -1 if a CUDA error is reported after the launch.
int minDepthFlowProjection_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,
		const int h,
		const int channel,
		const int batch,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,

		at::Tensor&  input1,
		at::Tensor&  input2,
        at::Tensor&  count,
        at::Tensor&  output,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2
		);


================================================
FILE: my_package/MinDepthFlowProjection/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Build description for the `mindepthflowprojection_cuda` extension: the C++
# binding file plus the CUDA kernel file, compiled with the project-wide
# compiler flags imported from `compiler_args`.
_extension = CUDAExtension(
    'mindepthflowprojection_cuda',
    [
        'mindepthflowprojection_cuda.cc',
        'mindepthflowprojection_cuda_kernel.cu',
    ],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='mindepthflowprojection_cuda',
    ext_modules=[_extension],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/SeparableConv/SeparableConvLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import _ext.my_lib as my_lib

#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py

class SeparableConvLayer(Function):
    """Legacy (instance-style) autograd Function wrapping the separable-conv kernels.

    ``forward`` takes an image tensor ``input1`` of size (batch, depth, H, W) and
    two per-pixel filter tensors ``input2`` / ``input3`` whose dimension 1 is the
    filter size, and returns a tensor of size
    (batch, depth, H - filtersize + 1, W - filtersize + 1) filled in by the
    native ``my_lib`` routine.  ``backward`` returns one gradient per input.

    The instance caches the inputs between ``forward`` and ``backward``, so one
    instance must not be shared by several forward calls.
    """

    def __init__(self,filtersize):
        self.filtersize = filtersize
        super(SeparableConvLayer,self).__init__()

    def forward(self, input1,input2,input3):
        """Run the native forward routine and return the output tensor.

        Raises AssertionError when the tensor sizes do not match
        ``self.filtersize`` or when any input is not contiguous.  A non-zero
        status from the native call is only printed.
        """
        intBatches = input1.size(0)
        intInputDepth = input1.size(1)
        intInputHeight = input1.size(2)
        intInputWidth = input1.size(3)
        intFilterSize = min(input2.size(1), input3.size(1))
        intOutputHeight = min(input2.size(2), input3.size(2))
        intOutputWidth = min(input2.size(3), input3.size(3))

        # "valid" convolution geometry: out = in - filtersize + 1
        assert intInputHeight - self.filtersize == intOutputHeight - 1
        assert intInputWidth - self.filtersize == intOutputWidth - 1
        assert intFilterSize == self.filtersize

        # the native code indexes raw memory with the tensors' strides
        assert input1.is_contiguous()
        assert input2.is_contiguous()
        assert input3.is_contiguous()

        output = input1.new().resize_(intBatches, intInputDepth, intOutputHeight, intOutputWidth).zero_()

        # cached for the backward pass
        self.input1 = input1.contiguous()
        self.input2 = input2.contiguous()
        self.input3 = input3.contiguous()

        if input1.is_cuda:
            self.device = torch.cuda.current_device()
            output = output.cuda()
            err = my_lib.SeparableConvLayer_gpu_forward(input1, input2,input3, output)
        else:
            self.device = -1
            err = my_lib.SeparableConvLayer_cpu_forward(input1, input2,input3, output)
        if err != 0:
            print(err)
        return output

    def backward(self, gradoutput):
        """Return ``(gradinput1, gradinput2, gradinput3)`` for the cached inputs.

        The gradient buffers are allocated from the cached inputs, so they
        always share the inputs' tensor type and device.  (They used to be
        created with ``torch.zeros(size)`` -- CPU float32 -- and then moved to
        the device that was current during ``forward``.)  They start at zero.
        A non-zero status from the native call is only printed.
        """
        gradinput1 = self.input1.new().resize_(self.input1.size()).zero_()
        gradinput2 = self.input2.new().resize_(self.input2.size()).zero_()
        gradinput3 = self.input3.new().resize_(self.input3.size()).zero_()

        if self.input1.is_cuda:
            err = my_lib.SeparableConvLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3)
        else:
            err = my_lib.SeparableConvLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3)
        if err != 0 :
            print(err)

        return gradinput1, gradinput2,gradinput3

================================================
FILE: my_package/SeparableConv/SeparableConvModule.py
================================================
# SeparableConv/SeparableConvModule.py
from torch.nn import Module
from functions.SeparableConvLayer import SeparableConvLayer

class SeparableConvModule(Module):
    """nn.Module wrapper around the SeparableConvLayer autograd Function."""

    def __init__(self,filtersize):
        super(SeparableConvModule, self).__init__()
        self.filtersize = filtersize

    def forward(self, input1, input2, input3):
        """Apply the separable convolution to ``input1`` with filters ``input2`` / ``input3``."""
        # SeparableConvLayer stores its inputs on the instance for the backward
        # pass, so a fresh Function object is created for every forward call
        # instead of sharing one across calls.
        return SeparableConvLayer(self.filtersize)(input1, input2, input3)

    # No backward method is needed on the module: autograd calls the
    # Function's own backward.


================================================
FILE: my_package/SeparableConv/__init__.py
================================================
from  .SeparableConvModule import *


================================================
FILE: my_package/SeparableConv/separableconv_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "separableconv_cuda_kernel.cuh"


// Validates the tensor geometry and launches the separable-convolution forward
// kernel on the current CUDA stream.
//
//   input1 : image tensor, size (batch, 3, h, w)
//   input2 : filter tensor, size (batch, F, h - F + 1, w - F + 1)
//   input3 : filter tensor; its dimension 1 and its batch/channel strides must
//            match input2
//   output : tensor written by the kernel
//
// All four tensors must have a width stride of 1.  Returns 1 without launching
// anything when a check fails, 0 on success; raises through AT_ERROR when the
// kernel launcher reports a failure.
int SeparableConvLayer_gpu_forward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		at::Tensor&  output

		)
{
	const int kInvalid = 1;

	// --- size checks -------------------------------------------------------
	const int channel = input1.size(1);
	if (channel != 3) {
		return kInvalid;
	}

	const int batch = input1.size(0);
	if (input2.size(0) != batch || input2.size(1) != input3.size(1)) {
		return kInvalid;
	}

	const int h = input1.size(2);
	const int w = input1.size(3);
	const auto filter_size = input2.size(1);
	// "valid" convolution: the filter maps are (filter_size - 1) smaller than the image
	if (input2.size(2) != h - filter_size + 1 || input2.size(3) != w - filter_size + 1) {
		return kInvalid;
	}

	// --- strides, indexed as [batch, channel, height, width] ---------------
	int in1[4];
	int in2[4];
	int in3[4];
	int out[4];
	for (int d = 0; d < 4; ++d) { in1[d] = input1.stride(d); }
	for (int d = 0; d < 4; ++d) { in2[d] = input2.stride(d); }
	for (int d = 0; d < 4; ++d) { in3[d] = input3.stride(d); }
	for (int d = 0; d < 4; ++d) { out[d] = output.stride(d); }

	// the kernel addresses pixels as "... + w_i", i.e. it relies on unit width stride
	if (in1[3] != 1 || in2[3] != 1 || in3[3] != 1 || out[3] != 1) {
		return kInvalid;
	}
	if (in2[0] != in3[0] || in2[1] != in3[1]) {
		return kInvalid;
	}

	const int nElement = 0; // unused by the kernel

	const int status = SeparableConvLayer_gpu_forward_kernel(
			at::cuda::getCurrentCUDAStream(), // PyTorch 1.0.0 API
			nElement, w, h, channel, batch, filter_size,

			in1[0], in1[1], in1[2], in1[3],
			in2[0], in2[1], in2[2], in2[3],
			in3[0], in3[1], in3[2], in3[3],
			out[0], out[1], out[2], out[3],

			input1,
			input2,
			input3,
			output);
	if (status) {AT_ERROR("CUDA call failed");}

	return status;
}
// Validates the tensor geometry and launches the separable-convolution
// backward kernel on the current CUDA stream.
//
//   input1     : image tensor, size (batch, 3, h, w)
//   input2     : filter tensor, size (batch, F, h - F + 1, w - F + 1)
//   input3     : filter tensor; its dimension 1 must equal input2's
//   gradoutput : gradient with respect to the forward output
//   gradinput1/2/3 : gradient buffers for input1/2/3.  The kernel accumulates
//                into them with atomicAdd and indexes them with the strides
//                of the corresponding input, so they are expected to be
//                zero-initialised by the caller and laid out like the inputs.
//
// Returns 1 without launching anything when a check fails, 0 on success;
// raises through AT_ERROR when the kernel launcher reports a failure.
int SeparableConvLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		at::Tensor&  gradoutput,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2,
		at::Tensor&  gradinput3
		)
		{


    int error = 1 ;
	int channel = input1.size( 1);
	if(channel!=3) return error;
	int batch = input1.size(0);
	if(input2.size( 0) != batch) return error;
	// Both filter tensors must have the same filter size (this used to compare
	// input2.size(1) with itself, which could never fail).
	if(input2.size(1) != input3.size(1)) return error;

	int h = input1.size(2);
	int w = input1.size(3);
	// "valid" convolution: the filter maps are (filter_size - 1) smaller than the image
	if(input2.size(2) != h - input2.size(1) + 1) return error;
	if(input2.size(3) != w - input2.size(1) + 1) return error;


	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

    int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);

    int output_b_stride = gradoutput.stride(0);
	int output_c_stride = gradoutput.stride(1);
	int output_h_stride = gradoutput.stride(2);
	int output_w_stride = gradoutput.stride(3);

	// the kernel addresses pixels as "... + w_i", i.e. it relies on unit width stride
	if(input1_w_stride !=1) return error;
	if(input2_w_stride !=1) return error;
    if(input3_w_stride !=1) return error;
    if(output_w_stride !=1) return error;

	// the gradient buffers are indexed with the strides of their inputs
    if(input1_b_stride != gradinput1.stride(0)) return error;
	if(input2_b_stride != gradinput2.stride(0)) return error;
	if(input3_b_stride != gradinput3.stride(0)) return error;
	if(input1_c_stride != gradinput1.stride(1)) return error;
	if(input2_c_stride != gradinput2.stride(1)) return error;
	if(input3_c_stride != gradinput3.stride(1)) return error;

	int	nElement = 0;// unused by the kernel

	error  = SeparableConvLayer_gpu_backward_kernel(
           at::cuda::getCurrentCUDAStream(), // PyTorch 1.0.0 API
			nElement,
			w,h,channel,batch,  input2.size(1),

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			output_b_stride,output_c_stride,output_h_stride,output_w_stride,

			input1,
			input2,
			input3,
			gradoutput,
			gradinput1,
			gradinput2,
			gradinput3
			);
	  if (error) {AT_ERROR("CUDA call failed");}

	return error;
}


// Python bindings: expose the two host entry points above under their C++
// names in the extension module named by TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("SeparableConvLayer_gpu_forward", &SeparableConvLayer_gpu_forward, "SeparableConv forward (CUDA)");
  m.def("SeparableConvLayer_gpu_backward", &SeparableConvLayer_gpu_backward, "SeparableConv backward (CUDA)");
}


================================================
FILE: my_package/SeparableConv/separableconv_cuda_kernel.cu
================================================
#include <stdio.h>

#include "separableconv_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


#define min(a,b) ((a<b)?(a):(b))
#define max(a,b) ((a>b)?(a):(b))

#define DEBUG (0)
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


//forward path of our layer
// Forward kernel of the separable convolution layer.
//
// One thread computes one output pixel (w_i, h_i) of batch item blockIdx.z for
// every channel:
//   output[c](h_i, w_i) = sum over (fy, fx) of
//       input1[c](h_i + fy, w_i + fx) * input2[fy](h_i, w_i) * input3[fx](h_i, w_i)
// i.e. input2 supplies the per-pixel vertical filter taps and input3 the
// per-pixel horizontal ones.
//
// Output extent is (h - filter_size + 1) x (w - filter_size + 1).  The width
// stride of every tensor is taken to be 1 (the *_w_stride arguments and
// nElement are unused).
// NOTE: products and the accumulator are `float` even when scalar_t is double.
template <typename scalar_t>
__global__ void SeparableConvLayer_gpu_forward_kernelfunc(
		const int nElement,
		const int w, 		const int h, 		const int channel, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,

		const scalar_t* __restrict__  input1,    		const scalar_t* __restrict__  input2,    	const scalar_t* __restrict__  input3, 	scalar_t*  output

		)
{
	// blockIdx.z       : batch index
	// blockIdx.x / .y  : tile index along width / height
	// threadIdx.x / .y : position inside the tile; threadIdx.z is unused
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w - filter_size + 1;
	const bool withinYbounds = h_i < h - filter_size + 1;

	const int batch_i = blockIdx.z;

	if( withinXbounds && withinYbounds) {

		// Offsets that depend only on the pixel, not on channel or filter tap.
		const int in2_pix = batch_i * input2_b_stride + h_i * input2_h_stride + w_i;
		const int in3_pix = batch_i * input3_b_stride + h_i * input3_h_stride + w_i;
		const int out_pix = batch_i * output_b_stride + h_i * output_h_stride + w_i;

		for ( int c_i = 0 ; c_i < channel ; c_i ++){

			const int in1_chan = batch_i * input1_b_stride + c_i * input1_c_stride + w_i;

			float out = 0.0f;
			for (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
				// The vertical tap and the image row do not change along the
				// horizontal loop, so they are fetched once per row.
				const float temp2 = input2[in2_pix + intFilterY * input2_c_stride];
				const int in1_row = in1_chan + (h_i + intFilterY) * input1_h_stride;

				for (int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
					const float temp1 = input1[in1_row + intFilterX];
					const float temp3 = input3[in3_pix + intFilterX * input3_c_stride];
					out += temp1* temp2 * temp3;
				}
			}
			output[out_pix + c_i * output_c_stride] = out;
		}
	}
	return ;

}
 

// Backward kernel of the separable convolution layer.
//
// One thread handles one output pixel (w_i, h_i) of batch item blockIdx.z.
// For every channel and every filter tap (fy, fx) it scatters, with atomicAdd:
//   gradinput1[c](h_i + fy, w_i + fx) += gradout * input2[fy] * input3[fx]
//   gradinput2[fy](h_i, w_i)          += gradout * input1     * input3[fx]
//   gradinput3[fx](h_i, w_i)          += gradout * input1     * input2[fy]
// where gradout = gradoutput[c](h_i, w_i) and the input values are read at the
// same positions as in the forward pass.
//
// The gradient buffers are indexed with the strides of the matching input and
// are only ever added to, so they must be zero before the launch.  The width
// stride of every tensor is taken to be 1 (the *_w_stride arguments and
// nElement are unused).
// NOTE: the products are computed in `float` even when scalar_t is double.
template <typename scalar_t>
__global__ void SeparableConvLayer_gpu_backward_kernelfunc(
		const int nElement, 	   const int w, 		const int h, 		const int channel, const int filter_size,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,

		const scalar_t* __restrict__  input1,        		const scalar_t* __restrict__  input2,		const scalar_t* __restrict__  input3,
		const scalar_t* __restrict__  gradoutput,    		scalar_t*  gradinput1,  		scalar_t*  gradinput2,  		scalar_t* gradinput3
		)
{
	// blockIdx.z       : batch index
	// blockIdx.x / .y  : tile index along width / height
	// threadIdx.x / .y : position inside the tile; threadIdx.z is unused
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	const bool withinXbounds = w_i < w - filter_size + 1;
	const bool withinYbounds = h_i < h - filter_size + 1;

	const int batch_i = blockIdx.z;

	if(withinXbounds && withinYbounds){

		// Offsets that depend only on the pixel, not on channel or filter tap.
		const int in2_pix = batch_i * input2_b_stride + h_i * input2_h_stride + w_i;
		const int in3_pix = batch_i * input3_b_stride + h_i * input3_h_stride + w_i;
		const int out_pix = batch_i * output_b_stride + h_i * output_h_stride + w_i;

		for (int c_i = 0 ; c_i < channel ; c_i ++){

			const int in1_chan = batch_i * input1_b_stride + c_i * input1_c_stride + w_i;

			for (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
				// Values that do not change along the horizontal loop are
				// fetched once per row instead of once per tap.
				const int in2_idx = in2_pix + intFilterY * input2_c_stride;
				const int in1_row = in1_chan + (h_i + intFilterY) * input1_h_stride;
				const float temp2 = input2[in2_idx];
				const float gradout = gradoutput[out_pix + c_i * output_c_stride];

				for (int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
					const int in1_idx = in1_row + intFilterX;
					const int in3_idx = in3_pix + intFilterX * input3_c_stride;
					const float temp1 = input1[in1_idx];
					const float temp3 = input3[in3_idx];

					// Several threads (and several taps of this thread) hit
					// the same gradient cells, hence the atomics.
					atomicAdd(&gradinput1[in1_idx], gradout * temp2 * temp3);
					atomicAdd(&gradinput2[in2_idx], gradout * temp1 * temp3);
					atomicAdd(&gradinput3[in3_idx], gradout * temp1 * temp2);
				}
			}
		}

	}
	return ;

}


// Host-side launcher for the separable-convolution forward kernel.
//
// Launches SeparableConvLayer_gpu_forward_kernelfunc on `stream` with one
// thread per output pixel; the output extent is
// (w - filter_size + 1) x (h - filter_size + 1), tiled by
// BLOCKDIMX x BLOCKDIMY thread blocks with one grid z-slice per batch item.
//
// Parameters:
//   stream            CUDA stream the kernel is enqueued on.
//   nElement, channel forwarded to the kernel unchanged.
//   w, h, batch       input width/height and batch size.
//   filter_size       number of taps of the per-pixel filters.
//   *_stride          element strides of input1 / input2 / input3 / output.
//   input1, input2, input3, output
//                     tensors whose raw data pointers are handed to the kernel;
//                     the scalar type is dispatched on input1 (float / double).
//
// Returns 0 on success, 1 when cudaGetLastError() reports a failure after the
// launch (the error string is printed to stdout).
int SeparableConvLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch,const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,

		at::Tensor&  input1,    		at::Tensor&  input2,    	at::Tensor&  input3, 	at::Tensor&  output

		)
{
	const int error = 1 ;

	// threadIdx.x is scheduled first, then threadIdx.y; all channels of a pixel
	// are handled by the same thread.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid( (w  - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h  - filter_size + 1 + BLOCKDIMY - 1) / BLOCKDIMY, batch);
	if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// The dispatch name appears in the error raised for unsupported dtypes; it
	// used to carry the name of a different layer's backward pass.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "SeparableConvLayer_gpu_forward", ([&] {
		SeparableConvLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(
			nElement,
			w,h,channel, filter_size,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			output_b_stride,output_c_stride,output_h_stride,output_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),input3.data<scalar_t>(), output.data<scalar_t>()
			);
	}));

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		// The message used to name an unrelated layer (BilinearSampler).
		printf("gpuerror in SeparableConvLayer forward: %s\n", cudaGetErrorString(err));
		return error;
	}

	return 0;

}


int SeparableConvLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,    		const int h,    		const int channel,  		const int batch, const int filter_size,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,

		at::Tensor&  input1,        		at::Tensor&  input2,		at::Tensor&  input3,

		at::Tensor&  gradoutput,    		at::Tensor&  gradinput1,  		at::Tensor&  gradinput2,  		at::Tensor&  gradinput3
		)
{

	int error = 1 ;

	dim3 grid;
	dim3 block;


	//blockthread = 128;
	//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z
	//the three channels are processsed in one kernel
	block  = dim3(BLOCKDIMX,BLOCKDIMY,1);
	grid = dim3( (w - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h  - filter_size + 1+ BLOCKDIMY - 1) / BLOCKDIMY, batch);
    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)
        printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

//    cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float));
//    cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float));
//    cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float));

				AT_DISPATCH_FLOATING_TYPES(input1.type(), "DepthFlowProjection_gpu_backward", ([&] {
SeparableConvLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(
			nElement, //to let the nummous
			w,h,channel, filter_size,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			output_b_stride,output_c_stride,output_h_stride,output_w_stride,


			input1.data<scalar_t>(), 			input2.data<scalar_t>(),         input3.data<scalar_t>(),  			gradoutput.data<scalar_t>(),
			gradinput1.data<scalar_t>(), 			gradinput2.data<scalar_t>(),     gradinput3.data<scalar_t>()
			);
 					}));

	cudaError_t err = cudaGetLastError();

	if (err != cudaSuccess) {
		printf("gpuerror in BilinearSampler.updateGradInput %s\n", cudaGetErrorString(err));
		//THError("aborting");
		return error;
	}

	error = 0;
	return error;

}

================================================
FILE: my_package/SeparableConv/separableconv_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Host-side launcher for the SeparableConv forward CUDA kernel.
//   stream          CUDA stream the kernel is enqueued on
//   nElement        forwarded unchanged to the device kernel
//   w, h            input width / height; the launch grid covers
//                   (w - filter_size + 1) x (h - filter_size + 1) positions
//   channel         forwarded to the device kernel
//   batch           number of batch items (z extent of the launch grid)
//   filter_size     filter length used to derive the output extent
//   *_stride        per-dimension (batch, channel, height, width) element
//                   strides of input1, input2, input3 and output
//   input1..3       input tensors; the element type is dispatched on input1
//   output          tensor the kernel writes to
// Returns 0 if no CUDA error is pending after the launch, 1 otherwise.
int SeparableConvLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,

		at::Tensor& input1,    		at::Tensor& input2,    	at::Tensor& input3, 	at::Tensor& output

		);

// Host-side launcher for the SeparableConv backward CUDA kernel.
// Scalar arguments have the same meaning as for
// SeparableConvLayer_gpu_forward_kernel; the output_* strides are passed to
// the device kernel together with gradoutput.
//   input1..3       tensors handed to the device kernel as read-only inputs
//   gradoutput      incoming gradient tensor
//   gradinput1..3   gradient buffers handed to the device kernel; the
//                   launcher itself does not clear them
// Returns 0 if no CUDA error is pending after the launch, 1 otherwise.
int SeparableConvLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,    		const int h,    		const int channel,  		const int batch, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,

		at::Tensor& input1,        		at::Tensor& input2,		at::Tensor& input3,

		at::Tensor& gradoutput,    		at::Tensor& gradinput1,  		at::Tensor& gradinput2,  		at::Tensor& gradinput3
		);


================================================
FILE: my_package/SeparableConv/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Build description for the `separableconv_cuda` extension: one C++ binding
# file plus one CUDA kernel file, compiled with the flags exported by
# compiler_args.
separableconv_extension = CUDAExtension(
    'separableconv_cuda',
    [
        'separableconv_cuda.cc',
        'separableconv_cuda_kernel.cu',
    ],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='separableconv_cuda',
    ext_modules=[separableconv_extension],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/SeparableConvFlow/SeparableConvFlowLayer.py
================================================
# this is for wrapping the customized layer
import torch
from torch.autograd import Function
import separableconvflow_cuda as my_lib
import warnings
#Please check how the STN FUNCTION is written :
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py
#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py

class SeparableConvFlowLayer(Function):
    """Autograd function turning a pair of separable 1-D filters into a flow map.

    For every output pixel the CUDA kernel computes the weighted mean tap
    index of the vertical filter (``input2``) and of the horizontal filter
    (``input3``), re-centred by ``(filtersize - 1) / 2``. The horizontal value
    is stored in output channel 0 and the vertical value in channel 1. A pixel
    whose filter weights sum to exactly zero receives the sentinel ``-2000``.

    ``input1`` is only used for shape checking; its gradient is always zero.

    The class is written in the instance-based ``Function`` style: it is
    constructed with the filter size and then called like a function.
    """

    def __init__(self,filtersize):
        """Store the expected filter length and emit the precision warning."""
        self.filtersize = filtersize
        warnings.warn("\nSeparable Conv Flow Layer is not precise enough for optical flow due to a divison operation")
        super(SeparableConvFlowLayer,self).__init__()

    def forward(self, input1,input2,input3):
        """Compute the 2-channel flow map.

        Args:
            input1: image tensor indexed as (batch, channel, height, width).
            input2: vertical filter weights, (batch, filtersize, out_h, out_w).
            input3: horizontal filter weights, same layout as ``input2``.

        Returns:
            Tensor of shape (batch, 2, out_h, out_w) with the same dtype and
            device as ``input1``.

        Raises:
            AssertionError: if the shapes disagree with ``filtersize`` or an
                input is not contiguous.
        """
        intBatches = input1.size(0)
        intInputHeight = input1.size(2)
        intInputWidth = input1.size(3)
        intFilterSize = min(input2.size(1), input3.size(1))
        intOutputHeight = min(input2.size(2), input3.size(2))
        intOutputWidth = min(input2.size(3), input3.size(3))

        # a "valid" filtering window: out = in - filtersize + 1 on both axes
        assert intInputHeight - self.filtersize == intOutputHeight - 1
        assert intInputWidth - self.filtersize == intOutputWidth - 1
        assert intFilterSize == self.filtersize

        assert input1.is_contiguous()
        assert input2.is_contiguous()
        assert input3.is_contiguous()

        # Allocate next to input1 (same device, same dtype): the extension
        # dispatches on input1's scalar type and reads every buffer with it.
        flow_ouput = input1.new_zeros((intBatches, 2, intOutputHeight, intOutputWidth))

        # cached for backward; .contiguous() returns the same tensor here
        # because contiguity was asserted above
        self.input1 = input1.contiguous()
        self.input2 = input2.contiguous()
        self.input3 = input3.contiguous()
        # index of the GPU holding the inputs, or -1 for CPU tensors
        self.device = input1.get_device() if input1.is_cuda else -1

        if input1.is_cuda:
            err = my_lib.SeparableConvFlowLayer_gpu_forward(self.input1, self.input2, self.input3, flow_ouput)
        else:
            # NOTE(review): separableconvflow_cuda.cc registers only the
            # *_gpu_* entry points, so this call is expected to fail for CPU
            # tensors unless a CPU implementation is added to the extension.
            err = my_lib.SeparableConvFlowLayer_cpu_forward(self.input1, self.input2, self.input3, flow_ouput)
        if err != 0:
            print(err)
        # the function returns the output to its caller
        return flow_ouput

    #TODO: if there are multiple outputs of this function, then the order should be well considered?
    def backward(self, gradoutput):
        """Back-propagate the flow gradient to the two filter tensors.

        Args:
            gradoutput: gradient w.r.t. the flow map, (batch, 2, out_h, out_w).

        Returns:
            Tuple ``(gradinput1, gradinput2, gradinput3)`` shaped like the
            cached inputs. ``gradinput1`` is all zeros because the flow does
            not depend on the image.
        """
        # The extension rejects a gradient whose innermost stride is not 1
        # (e.g. an expanded gradient), so hand it a dense copy when needed.
        gradoutput = gradoutput.contiguous()

        # Zero-filled buffers matching each input's device and dtype; the
        # kernel leaves pixels with a zero weight sum untouched.
        gradinput1 = torch.zeros_like(self.input1)
        gradinput2 = torch.zeros_like(self.input2)
        gradinput3 = torch.zeros_like(self.input3)

        if self.input1.is_cuda:
            err = my_lib.SeparableConvFlowLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3)
        else:
            # NOTE(review): see forward() - no CPU entry point is registered
            # by the extension source.
            err = my_lib.SeparableConvFlowLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3)
        if err != 0:
            print(err)

        return gradinput1, gradinput2,gradinput3

================================================
FILE: my_package/SeparableConvFlow/SeparableConvFlowModule.py
================================================
# my_package/SeparableConvFlow/SeparableConvFlowModule.py
from torch.nn import Module
from .SeparableConvFlowLayer import SeparableConvFlowLayer
import  torch
class SeparableConvFlowModule(Module):
    """``nn.Module`` front-end for :class:`SeparableConvFlowLayer`.

    The module owns a single layer instance configured with ``filtersize``
    and forwards its three inputs to it unchanged.
    """

    def __init__(self,filtersize):
        super().__init__()
        # built once here and reused by every forward() call
        self.f = SeparableConvFlowLayer(filtersize)

    def forward(self, input1, input2, input3):
        """Return the flow map produced by the wrapped layer."""
        # temp2 = torch.div(input2, torch.sum(input2,dim=1,keepdim=True))
        flow = self.f(input1, input2, input3)
        return flow

    # No backward method is needed on the module itself: the wrapped
    # SeparableConvFlowLayer defines its own backward().


================================================
FILE: my_package/SeparableConvFlow/__init__.py
================================================
from  .SeparableConvFlowModule import *


================================================
FILE: my_package/SeparableConvFlow/separableconvflow_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h> //works for 1.0.0

#include "separableconvflow_cuda_kernel.cuh"

// Python-facing entry point for the SeparableConvFlow forward pass.
//
// Validates shapes and strides, then hands the tensors to
// SeparableConvFlowLayer_gpu_forward_kernel on the current CUDA stream.
//   input1       image tensor, (batch, 3, h, w); only its sizes/strides and
//                scalar type are used
//   input2       vertical filter weights, (batch, F, h - F + 1, w - F + 1)
//   input3       horizontal filter weights with the same number of taps F
//   flow_output  destination tensor written by the kernel (channels 0 and 1)
// Returns 0 on success and 1 when a validation check fails. A failure
// reported by the kernel launcher is raised through AT_ERROR.
int SeparableConvFlowLayer_gpu_forward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		//at::Tensor&  output,
		at::Tensor&  flow_output

		)
		{
	int error = 1 ;

	int channel = input1.size( 1);
	if(channel!=3) return error;
	int batch = input1.size(0);
	if(input2.size(0) != batch) return error;
	// the two 1-D filter banks must have the same number of taps
	if(input2.size(1) != input3.size(1)) return error;

	const int filter_size = input2.size(1);

	int h = input1.size(2);
	int w = input1.size(3);
	// the filters are defined on the "valid" output window only
	if(input2.size(2) != h - filter_size + 1) return error;
	if(input2.size(3) != w - filter_size + 1) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);

	int flow_output_b_stride = flow_output.stride(0);
	int flow_output_c_stride = flow_output.stride(1);
	int flow_output_h_stride = flow_output.stride(2);
	int flow_output_w_stride = flow_output.stride(3);

	// the kernel adds the column index without multiplying by a width
	// stride, so the innermost dimension of every tensor must be dense
	if(input1_w_stride !=1) return error;
	if(input2_w_stride !=1) return error;
	if(input3_w_stride !=1) return error;
	if(flow_output_w_stride !=1) return error;

	// both filter tensors must share batch and channel strides
	if(input2_b_stride != input3_b_stride) return error;
	if(input2_c_stride != input3_c_stride) return error;

	int	nElement = 0;//UNUSED by the kernel

	error = SeparableConvFlowLayer_gpu_forward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
           at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,w,h,channel,batch,  filter_size,

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			flow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,

			input1,
			input2,
			input3,
			flow_output

			);
	if (error) {AT_ERROR("CUDA call failed");}

	return error;

		}
// Python-facing entry point for the SeparableConvFlow backward pass.
//
// Validates shapes and strides, then hands the tensors to
// SeparableConvFlowLayer_gpu_backward_kernel on the current CUDA stream.
//   input1..3        tensors that were given to the forward pass
//   gradflow_output  gradient w.r.t. the flow map
//   gradinput1..3    gradient buffers; the kernel indexes gradinput2 and
//                    gradinput3 with the strides of input2 and input3
// Returns 0 on success and 1 when a validation check fails. A failure
// reported by the kernel launcher is raised through AT_ERROR.
int SeparableConvFlowLayer_gpu_backward(
		at::Tensor&  input1,
		at::Tensor&  input2,
		at::Tensor&  input3,
		at::Tensor&  gradflow_output,
		at::Tensor&  gradinput1,
		at::Tensor&  gradinput2,
		at::Tensor&  gradinput3
		)
		{
	int error = 1 ;
	int channel = input1.size( 1);
	if(channel!=3) return error;
	int batch = input1.size(0);
	if(input2.size( 0) != batch) return error;
	// the two 1-D filter banks must have the same number of taps
	if(input2.size(1) != input3.size(1)) return error;

	const int filter_size = input2.size(1);

	int h = input1.size(2);
	int w = input1.size(3);
	// the filters are defined on the "valid" output window only
	if(input2.size(2) != h - filter_size + 1) return error;
	if(input2.size(3) != w - filter_size + 1) return error;

	int input1_b_stride = input1.stride(0);
	int input1_c_stride = input1.stride(1);
	int input1_h_stride = input1.stride(2);
	int input1_w_stride = input1.stride(3);

	int input2_b_stride = input2.stride(0);
	int input2_c_stride = input2.stride(1);
	int input2_h_stride = input2.stride(2);
	int input2_w_stride = input2.stride(3);

	int input3_b_stride = input3.stride(0);
	int input3_c_stride = input3.stride(1);
	int input3_h_stride = input3.stride(2);
	int input3_w_stride = input3.stride(3);

	int flow_output_b_stride = gradflow_output.stride(0);
	int flow_output_c_stride = gradflow_output.stride(1);
	int flow_output_h_stride = gradflow_output.stride(2);
	int flow_output_w_stride = gradflow_output.stride(3);

	// the kernel adds the column index without multiplying by a width
	// stride, so the innermost dimension of every tensor must be dense
	if(input1_w_stride !=1) return error;
	if(input2_w_stride !=1) return error;
	if(input3_w_stride !=1) return error;
	if(flow_output_w_stride !=1) return error;

	// gradient buffers are addressed with the strides of their inputs, so
	// batch and channel strides have to agree pairwise
	if(input1_b_stride != gradinput1.stride(0)) return error;
	if(input2_b_stride != gradinput2.stride(0)) return error;
	if(input3_b_stride != gradinput3.stride(0)) return error;
	if(input1_c_stride != gradinput1.stride(1)) return error;
	if(input2_c_stride != gradinput2.stride(1)) return error;
	if(input3_c_stride != gradinput3.stride(1)) return error;

	int	nElement = 0;//UNUSED by the kernel

	error  = SeparableConvFlowLayer_gpu_backward_kernel(
//			at::globalContext().getCurrentCUDAStream(), //works for 0.4.1
           at::cuda::getCurrentCUDAStream(), //works for 1.0.0
			nElement,
			w,h,channel,batch,  filter_size,

			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			flow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,

			input1,
			input2,
			input3,
			gradflow_output,
			gradinput1,
			gradinput2,
			gradinput3
			);
	if (error) {AT_ERROR("CUDA call failed");}

	return error;
}


// Python bindings for the extension module: only the CUDA forward and
// backward entry points are registered.
// NOTE(review): SeparableConvFlowLayer.py also calls
// SeparableConvFlowLayer_cpu_forward / SeparableConvFlowLayer_cpu_backward on
// this module for non-CUDA inputs; no such functions are bound here.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("SeparableConvFlowLayer_gpu_forward", &SeparableConvFlowLayer_gpu_forward, "SeparableConvFlow forward (CUDA)");
  m.def("SeparableConvFlowLayer_gpu_backward", &SeparableConvFlowLayer_gpu_backward, "SeparableConvFlow backward (CUDA)");
}


================================================
FILE: my_package/SeparableConvFlow/separableconvflow_cuda_kernel.cu
================================================
#include <stdio.h>

#include "separableconvflow_cuda_kernel.cuh"


#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>


// Function-like min/max macros. The arguments are not parenthesised inside
// the comparison and are evaluated twice, so pass only simple expressions.
#define min(a,b) ((a<b)?(a):(b))
#define max(a,b) ((a>b)?(a):(b))

// DEBUG != 0 makes the host launchers always print the block dimensions.
#define DEBUG (0)
// Thread-block shape used by the host launchers: dim3(BLOCKDIMX, BLOCKDIMY, 1).
// Both values can be overridden from the compiler command line because they
// are only defined here when not already set.
#ifndef BLOCKDIMX
#define BLOCKDIMX (32)
#endif
#ifndef BLOCKDIMY
#define BLOCKDIMY (16)
#endif
using at::Half;


// Forward pass of the SeparableConvFlow layer (device kernel).
//
// Each thread handles one output pixel (w_i, h_i) of batch item blockIdx.z.
// For that pixel it computes, separately for the vertical filter (input2) and
// the horizontal filter (input3), the weighted mean of the tap indices
//     sum_k(k * weight_k) / sum_k(weight_k) - (filter_size - 1) / 2
// and stores the vertical result in flow_output channel 1 and the horizontal
// result in channel 0. If a weight sum is exactly zero the value -2000 is
// stored instead.
//
// input1, nElement, channel and the input1_* strides are not read by this
// kernel. The column index w_i is added without a width stride, i.e. the
// innermost stride of input2, input3 and flow_output is taken to be 1.
// Sums are accumulated in float whatever scalar_t is.
template <typename scalar_t>
__global__ void SeparableConvFlowLayer_gpu_forward_kernelfunc(
		const int nElement,
		const int w, 		const int h, 		const int channel, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,
		const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

		const scalar_t* __restrict__   input1,    		const scalar_t* __restrict__   input2,    	const scalar_t* __restrict__   input3, 	 scalar_t*  flow_output

		)
{

	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadIdx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//global pixel coordinates of this thread
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	//the output plane is (w - filter_size + 1) x (h - filter_size + 1);
	//threads of edge blocks that fall outside it do nothing
	const bool withinXbounds = w_i < w - filter_size + 1;
	const bool withinYbounds = h_i < h - filter_size + 1;

	const int batch_i = blockIdx.z;


	//    __syncthreads();
//	const float fillvalue =0.0f;

	if( withinXbounds && withinYbounds) {
 
		//vertical component: weighted sum of tap indices and sum of weights
		float flow_y = 0.0f;
		float sum_weights = 0.0f;
		for (  int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
			float temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ];
			flow_y += (float)(intFilterY) * temp2 ;
			sum_weights += 			temp2;
		}
		//sum_weights = fabs(sum_weights);
		//the division is evaluated even when sum_weights is zero; the result
		//is then discarded by the ternary below in favour of -2000
		flow_y = flow_y / sum_weights - ((float)(filter_size)-1.0)/2.0;
		flow_output[batch_i * flow_output_b_stride + 1 * flow_output_c_stride+ h_i* flow_output_h_stride + w_i] = 
					fabs(sum_weights) > 0.0f ?  flow_y : -2000;

		//horizontal component, same computation on input3
		float flow_x = 0.0f;
		float sum_weights_x = 0.0f;
		for (   int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
			float temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ];
			flow_x += (float)(intFilterX)  * temp3;
			sum_weights_x += 		 temp3;
		}
		//sum_weights_x = fabs(sum_weights_x);
		flow_x = flow_x / sum_weights_x - ((float)(filter_size)-1.0)/2.0;
		// a negative weight sum is still accepted; only an exactly zero sum yields -2000
		flow_output[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + h_i* flow_output_h_stride + w_i] =
					fabs(sum_weights_x) >0.0f ? flow_x : -2000;
	}
	return ;

}


// Backward pass of the SeparableConvFlow layer (device kernel).
//
// Each thread handles one output pixel (w_i, h_i) of batch item blockIdx.z and
// produces the gradient of the flow w.r.t. every filter tap at that pixel.
// With S = sum_k(weight_k) and N = sum_k(k * weight_k) the forward value is
// N / S - const, hence d(flow)/d(weight_k) = k / S - N / S^2; this is scaled
// by the incoming gradient of the matching flow channel (1 for input2,
// 0 for input3).
//
// gradinput1 is never written, and for a pixel whose weight sum is exactly
// zero nothing is written to gradinput2 / gradinput3 either.
// input1, nElement, channel and the input1_* strides are not read. gradinput2
// and gradinput3 are indexed with the strides of input2 and input3.
//
// NOTE(review): gradinput2 is assigned with '=' whereas gradinput3 is
// accumulated with '+='. The two agree only if gradinput3 is zero on entry -
// confirm which behaviour is intended.
template <typename scalar_t>
__global__ void SeparableConvFlowLayer_gpu_backward_kernelfunc(
		const int nElement, 	   const int w, 		const int h, 		const int channel, const int filter_size,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,
		const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

		const scalar_t* __restrict__      input1,        		const scalar_t* __restrict__    input2,		const scalar_t* __restrict__      input3,
		const scalar_t* __restrict__      gradflow_output,    		scalar_t*  gradinput1,  		scalar_t*  gradinput2,  		scalar_t*  gradinput3
		)
		{
	//blockIdx.z : batch index from 0~B-1
	//blockIdx.y : height patch index from ceil(h/16)
	//blockIdx.x : width patch index from ceil(w/32)

	//threadIdx.x: width index 0~31
	//threadIdx.y: height index 0~15
	//threadIdx.z: Not used

	//global pixel coordinates of this thread
	const int w_i = blockIdx.x * blockDim.x + threadIdx.x;
	const int h_i = blockIdx.y * blockDim.y + threadIdx.y;
	//threads outside the (w - filter_size + 1) x (h - filter_size + 1) output plane do nothing
	const bool withinXbounds = w_i < w - filter_size + 1;
	const bool withinYbounds = h_i < h - filter_size + 1;

	const int batch_i = blockIdx.z;

	if(withinXbounds && withinYbounds){
		//recompute N (kept in flow_y, not normalised here) and S for the vertical filter
		float flow_y = 0.0f;
		float sum_weights = 0.0f;
		
		for ( int  intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
			float temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ];
			flow_y += (float)(intFilterY) * temp2 ;
			sum_weights += 			temp2;
		}
		//flow_y = flow_y / sum_weights - ((float)(filter_size)-1.0)/2.0;
		//flow_output_data[batch_i * flow_output_b_stride + 1 * flow_output_c_stride+ h_i* flow_output_h_stride + w_i] = 
		//		sum_weights >0.0f ?  flow_y : -2000;
		//float sign = sum_weights >0.0f ? 1.0f : -1.0f;
		//sum_weights = fabs(sum_weights);
		if(fabs(sum_weights) >0.0f ){
			//incoming gradient of the vertical flow (channel 1)
			float gradflow_y = gradflow_output[batch_i * flow_output_b_stride + 1* flow_output_c_stride + 
								h_i * flow_output_h_stride + w_i ] ;					
			//offset = N / S^2, identical for every tap
			float offset = flow_y / ( sum_weights * sum_weights);
			for (int  intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {
				gradinput2[batch_i * input2_b_stride + intFilterY * input2_c_stride  +  h_i * input2_h_stride + w_i ] =
							gradflow_y *  ((float)(intFilterY) / sum_weights -  offset);
			}
		}
		
		
		//same computation for the horizontal filter
		float flow_x = 0.0f;
		float sum_weights_x = 0.0f;
		for ( int  intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
			float temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ];
			flow_x += (float)(intFilterX)  * temp3;
			sum_weights_x += 		 temp3;
		}
		//flow_x = flow_x / sum_weights_x - ((float)(filter_size)-1.0)/2.0;
		//flow_output_data[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + h_i* flow_output_h_stride + w_i] =
		//			sum_weights_x >0 ? flow_x : -2000;
		//float sign_x = sum_weights_x >0.0f ? 1.0f : -1.0f;
		//sum_weights_x = fabs(sum_weights_x);	
		if(fabs(sum_weights_x) > 0.0f ){
			 //incoming gradient of the horizontal flow (channel 0)
			 float gradflow_x = gradflow_output[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + 
									h_i * flow_output_h_stride + w_i];
			float offset  = flow_x / (sum_weights_x * sum_weights_x);
			for ( int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {
				//accumulates into the buffer (the vertical branch above overwrites instead)
				gradinput3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ] +=
						gradflow_x * ((float)(intFilterX) /sum_weights_x - offset);
			}
		}
	}
	return ;

}


// Host-side launcher for the SeparableConvFlow forward CUDA kernel.
//
// Enqueues SeparableConvFlowLayer_gpu_forward_kernelfunc on `stream` with one
// thread per output pixel: the grid covers (w - filter_size + 1) columns,
// (h - filter_size + 1) rows and `batch` slices along z. The element type is
// chosen from input1 by AT_DISPATCH_FLOATING_TYPES and the data pointer of
// every tensor is requested with that same type.
//
// Returns 0 when cudaGetLastError() reports success after the launch and 1
// otherwise (the CUDA error string is printed to stdout).
int SeparableConvFlowLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch,const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,
		const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

		at::Tensor&  input1,    		at::Tensor&  input2,    	at::Tensor&  input3,   at::Tensor&  flow_output

		)
{
	// Extent of the output plane along each axis.
	const int out_w = w - filter_size + 1;
	const int out_h = h - filter_size + 1;

	// threadIdx.x is scheduled first, then threadIdx.y; z carries the batch index.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((out_w + BLOCKDIMX - 1) / BLOCKDIMX, (out_h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report non-default block shapes (or always, when DEBUG is enabled).
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// The dispatch label is what ATen prints for an unsupported dtype, so it
	// has to name this layer rather than another one.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "SeparableConvFlowLayer_gpu_forward", ([&] {
		SeparableConvFlowLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(
			nElement,
			w,h,channel, filter_size,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			flow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,

			input1.data<scalar_t>(),input2.data<scalar_t>(),input3.data<scalar_t>(), flow_output.data<scalar_t>()
			);
	}));

	// Launch-time errors surface through cudaGetLastError().
	const cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in SeparableConvFlowLayer_gpu_forward_kernel: %s\n", cudaGetErrorString(err));
		return 1;
	}

	return 0;
}


// Host-side launcher for the SeparableConvFlow backward CUDA kernel.
//
// Enqueues SeparableConvFlowLayer_gpu_backward_kernelfunc on `stream` with one
// thread per output pixel: the grid covers (w - filter_size + 1) columns,
// (h - filter_size + 1) rows and `batch` slices along z. The element type is
// chosen from input1 by AT_DISPATCH_FLOATING_TYPES and the data pointer of
// every tensor is requested with that same type.
//
// The gradinput buffers are not cleared here; the kernel leaves pixels with a
// zero weight sum untouched and accumulates into gradinput3, so callers are
// expected to pass zero-initialised buffers.
//
// Returns 0 when cudaGetLastError() reports success after the launch and 1
// otherwise (the CUDA error string is printed to stdout).
int SeparableConvFlowLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,    		const int h,    		const int channel,  		const int batch, const int filter_size,
		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
		//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,
		const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

		at::Tensor&  input1,        		at::Tensor&  input2,		at::Tensor&  input3,

		at::Tensor&  gradflow_output,    		at::Tensor&  gradinput1,  		at::Tensor&  gradinput2,  		at::Tensor&  gradinput3
		)
{
	// Extent of the output plane along each axis.
	const int out_w = w - filter_size + 1;
	const int out_h = h - filter_size + 1;

	// threadIdx.x is scheduled first, then threadIdx.y; z carries the batch index.
	const dim3 block(BLOCKDIMX, BLOCKDIMY, 1);
	const dim3 grid((out_w + BLOCKDIMX - 1) / BLOCKDIMX, (out_h + BLOCKDIMY - 1) / BLOCKDIMY, batch);

	// Report non-default block shapes (or always, when DEBUG is enabled).
	if (BLOCKDIMX != 32 || BLOCKDIMY != 16 || DEBUG)
		printf("BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \n", BLOCKDIMX,BLOCKDIMY);

	// The dispatch label is what ATen prints for an unsupported dtype, so it
	// has to name this layer rather than another one.
	AT_DISPATCH_FLOATING_TYPES(input1.type(), "SeparableConvFlowLayer_gpu_backward", ([&] {
		SeparableConvFlowLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(
			nElement,
			w,h,channel, filter_size,
			input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,
			input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,
			input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,
			flow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,

			input1.data<scalar_t>(), 			input2.data<scalar_t>(),         input3.data<scalar_t>(),  			gradflow_output.data<scalar_t>(),
			gradinput1.data<scalar_t>(), 			gradinput2.data<scalar_t>(),     gradinput3.data<scalar_t>()
			);
	}));

	// Launch-time errors surface through cudaGetLastError().
	const cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("gpuerror in SeparableConvFlowLayer_gpu_backward_kernel: %s\n", cudaGetErrorString(err));
		return 1;
	}

	return 0;
}


================================================
FILE: my_package/SeparableConvFlow/separableconvflow_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Host-side launcher for the SeparableConvFlow forward CUDA kernel.
//   stream          CUDA stream the kernel is enqueued on
//   nElement        forwarded to the device kernel, which does not read it
//   w, h            width / height of input1; the launch grid covers
//                   (w - filter_size + 1) x (h - filter_size + 1) pixels
//   channel         forwarded to the device kernel
//   batch           number of batch items (z extent of the launch grid)
//   filter_size     number of taps of each 1-D filter
//   *_stride        per-dimension (batch, channel, height, width) element
//                   strides of input1, input2, input3 and flow_output
//   input1..3       input tensors; the element type is dispatched on input1
//   flow_output     tensor receiving the flow (channel 0 from input3,
//                   channel 1 from input2)
// Returns 0 if no CUDA error is pending after the launch, 1 otherwise.
int SeparableConvFlowLayer_gpu_forward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w, 		const int h, 		const int channel, 		const int batch, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
	//	const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,
		const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

		at::Tensor& input1,    		at::Tensor& input2,    	at::Tensor& input3, 	 at::Tensor& flow_output

		);

// Declaration of the host-side backward entry point for the SeparableConvFlow
// layer. The definition is not in this header (presumably it lives in
// separableconvflow_cuda_kernel.cu -- confirm).
//
// Parameters:
//   stream        CUDA stream handle passed in by the caller.
//   nElement      element count (presumably used to size the kernel launch --
//                 TODO confirm against the definition).
//   w, h, channel, batch, filter_size
//                 integer dimensions describing the problem size.
//   *_b_stride, *_c_stride, *_h_stride, *_w_stride
//                 four integer strides per tensor, one group each for input1,
//                 input2, input3 and flow_output. The group for a plain
//                 `output` tensor is commented out.
//   input1, input2, input3
//                 the tensors that were given to the forward pass (assumed --
//                 confirm against the caller).
//   gradflow_output
//                 judging by the name, the incoming gradient with respect to
//                 flow_output -- confirm.
//   gradinput1, gradinput2, gradinput3
//                 judging by the names, receive the gradients with respect to
//                 the three inputs -- confirm.
//
// Returns an int status code (presumably 0 on success -- confirm against the
// definition).
int SeparableConvFlowLayer_gpu_backward_kernel(
		cudaStream_t stream,
		const int nElement,
		const int w,    		const int h,    		const int channel,  		const int batch, const int filter_size,

		const int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,
		const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,
		const int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,
	//	const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,
		const int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,

		at::Tensor& input1,        		at::Tensor& input2,		at::Tensor& input3,

		at::Tensor& gradflow_output,    		at::Tensor& gradinput1,  		at::Tensor& gradinput2,  		at::Tensor& gradinput3
		);
		

================================================
FILE: my_package/SeparableConvFlow/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from compiler_args import nvcc_args, cxx_args

# Build configuration for the `separableconvflow_cuda` extension: one C++
# binding source plus one CUDA kernel source, compiled with the compiler
# flags imported from compiler_args.
_extension_sources = [
    'separableconvflow_cuda.cc',
    'separableconvflow_cuda_kernel.cu',
]
_extension_flags = {'cxx': cxx_args, 'nvcc': nvcc_args}

setup(
    name='separableconvflow_cuda',
    ext_modules=[
        CUDAExtension(
            'separableconvflow_cuda',
            _extension_sources,
            extra_compile_args=_extension_flags,
        ),
    ],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: my_package/build.sh
================================================
#!/usr/bin/env bash
# Builds and installs every extension package listed below, one directory at
# a time. Run it from the directory that contains those package directories.

echo "Need pytorch>=1.0.0"
source activate pytorch1.0.0

# Put the current directory on PYTHONPATH for the `python setup.py install`
# runs below.
export PYTHONPATH=$PYTHONPATH:$(pwd)

# Build order is kept exactly as in the original script.
packages=(
    MinDepthFlowProjection
    FlowProjection
    SeparableConv
    InterpolationCh
    DepthFlowProjection
    Interpolation
    SeparableConvFlow
    FilterInterpolation
)

status=0
for pkg in "${packages[@]}"; do
    # Each package is handled in a subshell. If `cd` fails, nothing is
    # deleted or built for that package, and the parent shell's working
    # directory is never changed, so a missing directory cannot make
    # `rm -rf` run somewhere unintended or derail the remaining packages.
    (
        cd "$pkg" || exit 1
        rm -rf build *.egg-info dist
        python setup.py install
    ) || {
        echo "build.sh: failed to build $pkg" >&2
        status=1
    }
done

# Report overall success/failure through the status of the last command
# (no `exit`, so sourcing this script does not close the caller's shell).
[ "$status" -eq 0 ]


================================================
FILE: my_package/clean.sh
================================================
#!/usr/bin/env bash
# Removes the build artifacts (build/, *.egg-info, dist/) from every
# extension package listed below. Run it from the directory that contains
# those package directories.

echo "Need pytorch>=1.0.0"
source activate pytorch1.0.0

packages=(
    MinDepthFlowProjection
    FlowProjection
    SeparableConv
    InterpolationCh
    DepthFlowProjection
    Interpolation
    SeparableConvFlow
    FilterInterpolation
)

status=0
for pkg in "${packages[@]}"; do
    # Each package is cleaned in a subshell. If `cd` fails, `rm -rf` is not
    # executed at all, and the parent shell's working directory is never
    # changed, so a missing directory cannot cause deletions in the wrong
    # place or derail the remaining packages.
    (
        cd "$pkg" || exit 1
        rm -rf build *.egg-info dist
    ) || {
        echo "clean.sh: could not clean $pkg" >&2
        status=1
    }
done

# Report overall success/failure through the status of the last command
# (no `exit`, so sourcing this script does not close the caller's shell).
[ "$status" -eq 0 ]


================================================
FILE: my_package/compiler_args.py
================================================
# References: https://developer.nvidia.com/cuda-gpus
#
# Compute capabilities for which device code is generated. Each entry becomes
# one `-gencode arch=compute_XX,code=sm_XX` pair in nvcc_args. The GPU lists
# are examples of hardware per capability.
_SM_VERSIONS = (
    # Tesla: K80
    # Quadro / NVIDIA NVS / Jetson: (None)
    '37',

    # Tesla: (None)
    # Quadro: K1200, K620, M1200, M520, M5000M, M4000M, M3000M, M2000M,
    #         M1000M, K620M, M600M, M500M
    # NVIDIA NVS: 810
    # GeForce / Titan: GTX 750 Ti, GTX 750, GTX 960M, GTX 950M, 940M, 930M,
    #                  GTX 860M, GTX 850M, 840M, 830M
    # Jetson: (None)
    '50',

    # Tesla: M60, M40
    # Quadro: M6000 24GB, M6000, M5000, M4000, M2000, M5500M, M2200, M620
    # NVIDIA NVS: (None)
    # GeForce / Titan: GTX TITAN X, GTX 980 Ti, GTX 980, GTX 970, GTX 960,
    #                  GTX 950, GTX 980M, GTX 970M, GTX 965M, 910M
    # Jetson: (None)
    '52',

    # Tesla: P100
    # Quadro: GP100
    # NVIDIA NVS / GeForce / Titan / Jetson: (None)
    '60',

    # Tesla: P40, P4
    # Quadro: P6000, P5000, P4000, P2200, P2000, P1000, P620, P600, P400,
    #         P520, P5200, P4200, P3200, P3000, P500
    # NVIDIA NVS: (None)
    # GeForce / Titan: TITAN Xp, TITAN X, GTX 1080 Ti, GTX 1080, GTX 1070,
    #                  GTX 1060, GTX 1050
    # Jetson: (None)
    '61',

    # Tesla: T4
    # Quadro: RTX 8000, RTX 6000, RTX 5000, RTX 4000, RTX 3000, T2000, T1000
    # NVIDIA NVS: (None)
    # GeForce / Titan: TITAN RTX, RTX 2080 Ti, RTX 2080, RTX 2070, RTX 2060
    # Jetson: (None)
    '75',

    # compute_70 is not enabled; add '70' here to also emit
    # 'arch=compute_70,code=sm_70'.
)

nvcc_args = [
    flag
    for sm in _SM_VERSIONS
    for flag in ('-gencode', 'arch=compute_{0},code=sm_{0}'.format(sm))
]
nvcc_args.append('-w')  # Ignore compiler warnings.

cxx_args = ['-std=c++11', '-w']

================================================
FILE: my_package/test_module.py
================================================
# main.py
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.autograd import gradcheck

#from modules.InterpolationModule import InterpolationModule
#from modules.FilterInterpolationModule import FilterInterpolationModule
#from modules.FlowProjectionModule import FlowProjectionModule
from my_package.DepthFlowProjection import DepthFlowProjectionModule

#from modules.FilterInterpolationModule import AdaptiveWeightInterpolationModule
#from modules.SeparableConvModule import SeparableConvModule
import time
import numpy
#from modules.InterpolationChModule import InterpolationChModule
#from modules.WeigtedFlowProjectionModule import WeightedFlowProjectionModule
#from modules.SeparableConvFlowModule import SeparableConvFlowModule

def test_SeparableConvFlowModule(input1, input2, input3,filtersize):
    """Run SeparableConvFlowModule on CPU and on GPU and compare the results.

    The module is run forward and backward on the given CPU inputs, then on
    CUDA float copies of them. The forward output and the gradients of
    ``input2`` and ``input3`` are compared with a relative error
    (``2 * |a - b| / (|a| + |b|)``) against a 1e-6 threshold; the
    ``input1`` gradient is not compared.

    Args:
        input1, input2, input3: CPU inputs for the module. ``input2`` and
            ``input3`` must accumulate gradients, because their ``.grad`` is
            read after the backward pass.
        filtersize: forwarded to the ``SeparableConvFlowModule`` constructor.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    FilterInterpolate = SeparableConvFlowModule(filtersize)

    # CPU reference pass.
    t1 = time.time()
    output = FilterInterpolate(input1, input2, input3)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    # GPU pass on fresh leaf copies of the same data.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True)
    t1 = time.time()
    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward
    output_cuda = FilterInterpolate(input1_cuda, input2_cuda, input3_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    print("Check the forward path between CPU and GPU...", end='\t')
    x = (output_cuda - output.cuda()) *2 / (torch.abs(output_cuda) + torch.abs(output).cuda())
    x = torch.max(torch.abs(x))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
        print(torch.mean(output_cuda - output.cuda()))
    else:
        print("output pass", end='\n')

    print("Check the backward path between CPU and GPU...", end='\t')
    for gpu_in, cpu_in, end in ((input2_cuda, input2, '\t'),
                                (input3_cuda, input3, '\n')):
        diff = gpu_in.grad - cpu_in.grad.cuda()
        x = diff * 2 / (torch.abs(gpu_in.grad) + torch.abs(cpu_in.grad).cuda())
        x = torch.max(torch.abs(x))
        if x.item() > 1e-6:
            print(x)
            print(torch.mean(diff))
        else:
            print("pass", end=end)

    return t2 - t1, t3 - t2

def test_SeparableConvModule(input1, input2, input3,filtersize):
    """Run SeparableConvModule on CPU and on GPU and compare the results.

    The module is run forward and backward on the given CPU inputs, then on
    CUDA float copies of them. The forward output and the gradients of all
    three inputs are compared with a relative error
    (``2 * |a - b| / (|a| + |b|)``) against a 1e-6 threshold.

    Args:
        input1, input2, input3: CPU inputs for the module. They must
            accumulate gradients, because their ``.grad`` is read after the
            backward pass.
        filtersize: forwarded to the ``SeparableConvModule`` constructor.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    FilterInterpolate = SeparableConvModule(filtersize)

    # CPU reference pass.
    t1 = time.time()
    output = FilterInterpolate(input1, input2, input3)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    # GPU pass on fresh leaf copies of the same data.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True)
    t1 = time.time()
    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward
    output_cuda = FilterInterpolate(input1_cuda, input2_cuda, input3_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    print("Check the forward path between CPU and GPU...", end='\t')
    x = (output_cuda - output.cuda()) *2 / (torch.abs(output_cuda) + torch.abs(output).cuda())
    x = torch.max(torch.abs(x))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
    else:
        print("pass", end='\n')

    print("Check the backward path between CPU and GPU...", end='\t')
    for gpu_in, cpu_in, end in ((input1_cuda, input1, '\t'),
                                (input2_cuda, input2, '\t'),
                                (input3_cuda, input3, '\n')):
        diff = gpu_in.grad - cpu_in.grad.cuda()
        x = diff * 2 / (torch.abs(gpu_in.grad) + torch.abs(cpu_in.grad).cuda())
        x = torch.max(torch.abs(x))
        if x.item() > 1e-6:
            print(x)
            print(torch.mean(diff))
        else:
            print("pass", end=end)

    return t2 - t1, t3 - t2


def test_FilterInterpolation(input1,input2,input3):
    """Run FilterInterpolationModule on CPU and on GPU and compare the results.

    The module is run forward and backward on the given CPU inputs, then on
    CUDA float copies of them. The forward output and the gradients of all
    three inputs are compared by maximum absolute difference against a 1e-6
    threshold.

    Args:
        input1, input2, input3: CPU inputs for the module. They must
            accumulate gradients, because their ``.grad`` is read after the
            backward pass.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    FilterInterpolate = FilterInterpolationModule()

    # CPU reference pass.
    t1 = time.time()
    output = FilterInterpolate(input1, input2, input3)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    # GPU pass on fresh leaf copies of the same data.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)
    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True)
    t1 = time.time()
    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward
    output_cuda = FilterInterpolate(input1_cuda, input2_cuda, input3_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    print("Check the forward path between CPU and GPU...", end='\t')
    x = torch.max(torch.abs(output_cuda - output.cuda()))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
    else:
        print("pass", end='\n')

    print("Check the backward path between CPU and GPU...", end='\t')
    for gpu_in, cpu_in, end in ((input1_cuda, input1, '\t'),
                                (input2_cuda, input2, '\t'),
                                (input3_cuda, input3, '\n')):
        diff = gpu_in.grad - cpu_in.grad.cuda()
        x = torch.max(torch.abs(diff))
        if x.item() > 1e-6:
            print(x)
            print(torch.mean(diff))
        else:
            print("pass", end=end)

    return t2-t1,t3-t2


def test_InterpolationModule(input1,input2):
    """Run InterpolationModule on CPU and on GPU and compare the results.

    The module is run forward and backward on the given CPU inputs, then on
    CUDA float copies of them. The forward output and the gradients of both
    inputs are compared by maximum absolute difference against a 1e-6
    threshold.

    Args:
        input1, input2: CPU inputs for the module. They must accumulate
            gradients, because their ``.grad`` is read after the backward
            pass.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    Interpolate = InterpolationModule()

    # CPU reference pass.
    t1 = time.time()
    output = Interpolate(input1,input2)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    # GPU pass on fresh leaf copies of the same data.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)
    t1 = time.time()
    output_cuda = Interpolate(input1_cuda,input2_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    print("Check the forward path between CPU and GPU...",end='\t')
    x = torch.max(torch.abs(output_cuda - output.cuda()))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
    else:
        print("pass",end='\n')

    print("Check the backward path between CPU and GPU...",end='\t')
    for gpu_in, cpu_in, end in ((input1_cuda, input1, '\t'),
                                (input2_cuda, input2, '\n')):
        x = torch.max(torch.abs(gpu_in.grad - cpu_in.grad.cuda()))
        if x.item() > 1e-6:
            print(x)
        else:
            print("pass",end=end)

    return t2-t1,t3-t2

def test_InterpolationChModule(input1,input2):
    """Run InterpolationChModule on CPU and on GPU and compare the results.

    The module is constructed with ``input1.size(1)``, run forward and
    backward on the given CPU inputs, then on CUDA float copies of them. The
    forward output and the gradients of both inputs are compared by maximum
    absolute difference against a 1e-6 threshold.

    Args:
        input1, input2: CPU inputs for the module. They must accumulate
            gradients, because their ``.grad`` is read after the backward
            pass.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    Interpolate = InterpolationChModule(input1.size(1))

    # CPU reference pass.
    t1 = time.time()
    output = Interpolate(input1,input2)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    # GPU pass on fresh leaf copies of the same data.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)
    t1 = time.time()
    output_cuda = Interpolate(input1_cuda,input2_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    print("Check the forward path between CPU and GPU...",end='\t')
    x = torch.max(torch.abs(output_cuda - output.cuda()))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
    else:
        print("pass",end='\n')

    print("Check the backward path between CPU and GPU...",end='\t')
    for gpu_in, cpu_in, end in ((input1_cuda, input1, '\t'),
                                (input2_cuda, input2, '\n')):
        x = torch.max(torch.abs(gpu_in.grad - cpu_in.grad.cuda()))
        if x.item() > 1e-6:
            print(x)
        else:
            print("pass",end=end)

    return t2-t1,t3-t2

def test_FlowProjectionModule(input1):
    """Run FlowProjectionModule on CPU and on GPU and compare the results.

    A module instance is run forward and backward on the given CPU input; a
    second, freshly constructed instance is run on a CUDA float copy. The
    forward output and the input gradient are compared by maximum absolute
    difference against a 1e-6 threshold.

    Args:
        input1: CPU input for the module. It must accumulate gradients,
            because its ``.grad`` is read after the backward pass.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    Project = FlowProjectionModule()

    # CPU reference pass.
    t1 = time.time()
    output = Project(input1)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    # GPU pass on a fresh leaf copy of the same data, with a new module.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)
    Project = FlowProjectionModule() # regenerate
    t1 = time.time()
    output_cuda = Project(input1_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    print("Check the forward path between CPU and GPU...",end='\t')
    x = torch.max(torch.abs(output_cuda - output.cuda()))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
    else:
        print("pass",end='\n')

    print("Check the backward path between CPU and GPU...",end='\t')
    grad_diff = input1_cuda.grad - input1.grad.cuda()
    x = torch.max(torch.abs(grad_diff))
    if x.item() > 1e-6:
        print(x)
        print(torch.mean(torch.abs(grad_diff)))
        print(torch.mean(grad_diff))
    else:
        print("pass",end='\t')

    print("\n\n")
    return t2-t1,t3-t2

def test_DepthFlowProjectionModule(input1,input2):
    """Time the GPU forward and backward pass of DepthFlowProjectionModule.

    The CPU reference pass of this test is disabled, so there is no CPU
    output or CPU gradient to compare against. The previous body still tried
    to compare against them and failed with a NameError on the never-assigned
    ``output`` (and ``input1.grad`` / ``input2.grad`` are not populated
    here either, since no CPU backward pass runs). Those comparisons are
    therefore skipped until a CPU reference pass is re-enabled.

    Args:
        input1, input2: inputs whose data is copied to CUDA float tensors
            for the GPU run.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    # GPU pass on fresh leaf copies of the input data.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)
    Project = DepthFlowProjectionModule(input1_cuda.requires_grad) # regenerate
    t1 = time.time()
    output_cuda = Project(input1_cuda,input2_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    # No CPU reference results exist in this test, so there is nothing to
    # check the GPU results against.
    print("CPU reference pass is disabled; skipping the CPU/GPU comparison.")

    print("\n\n")
    return t2-t1,t3-t2

def test_WeightedFlowProjectionModule(input1 , input2, input3):
    """Run WeightedFlowProjectionModule on CPU and on GPU and compare the results.

    A module instance (``threshold=20.0/255.0``, ``requires_grad=True``) is
    run forward and backward on the given CPU inputs; a second, freshly
    constructed instance is run on CUDA float copies. The forward output and
    the gradient of ``input1`` are compared by maximum absolute difference
    against a 1e-6 threshold; the gradients of ``input2`` and ``input3``
    are not compared.

    Args:
        input1, input2, input3: CPU inputs for the module. ``input1`` must
            accumulate gradients, because its ``.grad`` is read after the
            backward pass.

    Returns:
        Tuple ``(forward_seconds, backward_seconds)`` of the GPU run.
        Timings use ``time.time()`` with no explicit CUDA synchronization,
        so they are indicative only.
    """
    Project = WeightedFlowProjectionModule(threshold=20.0/255.0,requires_grad=True)

    # CPU reference pass.
    t1 = time.time()
    output = Project(input1,input2,input3)
    t2 = time.time()
    output.backward(output.data)
    t3 = time.time()
    print("CPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    # GPU pass on fresh leaf copies of the same data, with a new module.
    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)
    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)
    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad = True)
    Project = WeightedFlowProjectionModule(threshold=20.0/255.0, requires_grad=True) # regenerate
    t1 = time.time()
    output_cuda = Project(input1_cuda,input2_cuda,input3_cuda)
    t2 = time.time()
    output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2-t1) +"s\t" + str(t3-t2) +"s\t")

    print("Check the forward path between CPU and GPU...",end='\t')
    x = torch.max(torch.abs(output_cuda - output.cuda()))
    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-d numpy array with [0] raises IndexError).
    if x.item() > 1e-6:
        print(x)
    else:
        print("pass",end='\n')

    print("Check the backward path between CPU and GPU...",end='\t')
    grad_diff = input1_cuda.grad - input1.grad.cuda()
    x = torch.max(torch.abs(grad_diff))
    if x.item() > 1e-6:
        print(x)
        print(torch.mean(torch.abs(grad_diff)))
        print(torch.mean(grad_diff))
    else:
        print("pass",end='\t')

    print("\n\n")
    return t2-t1,t3-t2

def test_AdaptiveWeightInterpolationModule(input1, input2, input3, input4):
    """Compare AdaptiveWeightInterpolationModule on the given inputs and on CUDA copies.

    The module is run forward and backward twice: first on the inputs as
    given (the printed label calls this the CPU run), then on
    ``torch.cuda.FloatTensor`` copies of the same data.  The maximum absolute
    difference between the two outputs and between each pair of input
    gradients is printed; a difference of at most 1e-6 is reported as "pass".

    :param input1: first module input; must accumulate ``.grad`` on backward
    :param input2: second module input; must accumulate ``.grad`` on backward
    :param input3: third module input; must accumulate ``.grad`` on backward
    :param input4: fourth module input; must accumulate ``.grad`` on backward
    :return: ``(forward_seconds, backward_seconds)`` measured for the CUDA run
    """
    training = True
    tolerance = 1e-6
    Interpolate = AdaptiveWeightInterpolationModule(training=training)

    host_inputs = (input1, input2, input3, input4)

    # ---- first pass: inputs as given ----
    t1 = time.time()
    output = Interpolate(*host_inputs)
    t2 = time.time()
    if training:
        # Use the output itself as the upstream gradient.
        output.backward(output.data)
        print(        input3.grad)
    t3 = time.time()

    print("CPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    # ---- second pass: fresh CUDA leaves holding the same values ----
    cuda_inputs = [
        Variable(host.data.type(torch.cuda.FloatTensor), requires_grad=True)
        for host in host_inputs
    ]
    t1 = time.time()
    Interpolate.zero_grad()  # to clean up the gradient in the last backward

    output_cuda = Interpolate(*cuda_inputs)
    t2 = time.time()
    if training:
        output_cuda.backward(output_cuda.data)
    t3 = time.time()
    print("GPU Forward and backward time is : " + str(t2 - t1) + "s\t" + str(t3 - t2) + "s\t")

    # ---- forward comparison ----
    print("Check the forward path between CPU and GPU...", end='\n')
    forward_diff = output_cuda - output.cuda()
    print("==>total number of difference")
    print(torch.sum(torch.abs(forward_diff) > tolerance))

    max_forward_diff = torch.max(torch.abs(forward_diff))
    print("==>max difference value is ")
    print(max_forward_diff)
    print(torch.sum(output_cuda > 1) )
    print(torch.sum(output.cuda() > 1))

    # torch.max() yields a 0-dim tensor; .item() extracts the Python scalar
    # (indexing the 0-dim numpy array with [0] raises IndexError).
    if max_forward_diff.item() > tolerance:
        print(max_forward_diff)
    else:
        print("pass", end='\n')

    if not training:
        return t2 - t1, t3 - t2

    # ---- backward comparison, one check per input ----
    print("Check the backward path between CPU and GPU...", end='\t')
    pass_terminators = ('\t', '\t', '\n', '\n')
    for host, device, terminator in zip(host_inputs, cuda_inputs, pass_terminators):
        grad_diff = device.grad - host.grad.cuda()
        max_grad_diff = torch.max(torch.abs(grad_diff))
        if max_grad_diff.item() > tolerance:
            print(max_grad_diff)
            print(torch.mean(grad_diff))
        else:
            print("pass", end=terminator)

    return t2 - t1, t3 - t2
#
#
# # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))
# # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))
# # B,H,W = 1,16,16
# # B,C,H,W = 2,64,32,32
# # filtersize = 4
# # input1 = Variable(torch.arange(0.0, B * C * H * W).view(B, C ,H,W), requires_grad=True)
# # input2 = Variable(torch.rand(B, 2, H, W), requires_grad=True)
# # input3 = Variable(torch.rand(B, filtersize**2, H, W), requires_grad=True)
# #input2 = Variable(torch.arange(1, 1+ B * 3 * H * W).view(B , 3, H, W), requires_grad=True)
# # input3 = Variable(torch.rand(B, 2, H, W), requires_grad=True)
# # input4 = Variable(torch.rand(B, 2, H,W), requires_grad =True)
# B,C,H,W = 1,3,128,128
# filtersize = 51
# input1 = Variable(torch.arange(0.0, B * C * H * W).view(B, C ,H,W), requires_grad=True)
# input2 = Variable(torch.zeros(B,filtersize,H-filtersize+1,W-filtersize+1),requires_grad = True)
# input3 = Variable(torch.ones(B,filtersize,H-filtersize+1,W-filtersize+1),requires_grad = True)
#
# # input1 = Variable(torch.arange(0.0, B * 3 * H * W).view(B, 3,H,W), requires_grad=True)
# # input2 = Variable(torch.arange(1, 1+ B * 3 * H * W).view(B , 3, H, W), requires_grad=True)
# # input3 = Variable(torch.rand(B, 2, H, W), requires_grad=True)
# # input4 = Variable(torch.rand(B, 2, H,W), requires_grad =True)
# # input2 = Variable(torch.zeros(12,2,64,64),requires_grad = True)
# # input3 = Variable(torch.ones(12,16,64,64),requires_grad = True)
# # input2 = Variable(torch.ones(12,///2,64,64) * (-2.1))
# # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))
# input1.data.uniform_(0, 1)
# input2.data.uniform_(0, 1)
# input3.data.uniform_(0, 1) # not have to be normalized to 1.0
# # input4.data.uniform_(-1,1)
# #
# #
# # ftimes = []
# # btimes = []
# # for i in range(10):
# #     input1.data.uniform_(0, 1)
# #     input2.data.uniform_(-1, 1)
# #     input3.data.uniform_(0,1)
# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule
# #     input2 = Variable(input2.clone().data, requires_grad = True)
# #     input3 = Variable(input3.clone().data, requires_grad = True)
# #     ftime, btime = test_FilterInterpolation(input1,input2,input3)
# #     ftimes.append(ftime)
# #     btimes.append(btime)
# #
# # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n")
# # # nn.LogSoftmax
# # exit(0)
# # ftimes = []
# # btimes = []
# # for i in range(10):
# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule
# #     input2 = Variable(input2.clone().data, requires_grad = True)
# #     ftime, btime = test_InterpolationModule(input1,input2)
# #     ftimes.append(ftime)
# #     btimes.append(btime)
# #
# # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n")
# #
# # ftimes = []
# # btimes = []
# # for i in range(10):
# #     input1.data.uniform_(0, 1)
# #     input2.data.uniform_(-16, 17)
# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule
# #     input2 = Variable(input2.clone().data, requires_grad = True)
# #     ftime, btime = test_InterpolationChModule(input1,input2)
# #     ftimes.append(ftime)
# #     btimes.append(btime)
# #
# # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t\n\n\n\n")
# # # nn.LogSoftmax
# # exit(0)
# #
# ftimes = []
# btimes = []
# for i in range(3):
#     input1.data.uniform_(0.0, 1)
#     input2.data.uniform_(1.0/filtersize, 1.1/filtersize)
#     input3.data.uniform_(1.0/filtersize, 1.1/filtersize)  # not have to be normalized to 1.0
#
#     input1 = Variable(input1.clone().data, requires_grad=True)  # to delete the graph in InterpolationModule
#     input2 = Variable(input2.clone().data, requires_grad=True)
#     input3 = Variable(input3.clone().data, requires_grad=True)
#     # ftime, btime = test_SeparableConvModule(input1, input2, input3,filtersize)
#     ftime, btime = test_SeparableConvFlowModule(input1, input2, input3,filtersize)
#     ftimes.append(ftime)
#     btimes.append(btime)
# print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) + "s\t" + str(
#     numpy.array(btimes).mean()) + "s\t")
# exit(0)
#
# #
# # for i in range(10):
# #     input1.data.uniform_(0.14, 0.405)
# #     input2.data.uniform_(0.14, 0.405)
# #     input3.data.uniform_(0.2, 0.501)  # not have to be normalized to 1.0
# #     input4.data.uniform_(0.2, 0.501)
# #
# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule
# #     input2 = Variable(input2.clone().data, requires_grad = True)
# #     input3 = Variable(input3.clone().data, requires_grad = True)
# #     input4 = Variable(input4.clone().data, requires_grad = True)
# #     ftime,btime = test_AdaptiveWeightInterpolationModule(input1,input2,input3,input4)
# #     ftimes.append(ftime)
# #     btimes.append(btime)
# # print("GPU Forward and backward time is : " + str(numpy.array(ftimes).mean()) +"s\t" + str(numpy.array(btimes).mean()) +"s\t")
#
#
# input1 = Variable(torch.arange(0.0, 12 * 2 * 64 * 64).view(12, 2, 64, 64), requires_grad=True)
# input1.data.uniform_(-1.0,1.0)
# # input1 = Variable( - 0.5 * torch.ones(12,2,64,64).type(torch.FloatTensor), requires_grad = True)
#
#
#

# Benchmark driver: runs at import time, times the projection-module tests
# over ten iterations and prints the mean forward / backward durations.
B, C, H, W = 1, 2, 512, 704
pixels = B * H * W

# The arange values only fix the shapes; input1 and input2 are overwritten
# with uniform noise before every timed run.
input1 = Variable(torch.arange(0.0, C * pixels).view(B, C, H, W), requires_grad=True)
input3 = Variable(torch.arange(0.0, 3 * pixels).view(B, 3, H, W), requires_grad=True)
input2 = Variable(torch.arange(0.0, 1 * pixels).view(B, 1, H, W), requires_grad=True)

timings = []
for i in range(10):
    input1.data.uniform_(-1.0, 1.0)
    input2.data.uniform_(0.1, 1.0)  # must be larger than zero
    # Re-wrap the data as new leaf Variables so the previous graph is dropped.
    input1 = Variable(input1.clone().data, requires_grad=True)
    input2 = Variable(input2.clone().data, requires_grad=True)
    timings.append(test_DepthFlowProjectionModule(input1, input2))

ftimes = [forward_time for forward_time, _ in timings]
btimes = [backward_time for _, backward_time in timings]

print("GPU Forward and backward time is : " + str(numpy.mean(ftimes)) + "s\t" + str(numpy.mean(btimes)) + "s\t\n\n\n\n")

# The script stops here; the weighted-projection benchmark below is
# currently unreachable and only kept for manual re-enabling.
exit(0)

timings = []
for i in range(10):
    # Re-wrap the data as new leaf Variables so the previous graph is dropped.
    input1 = Variable(input1.clone().data, requires_grad=True)
    input2 = Variable(input2.clone().data, requires_grad=True)
    input3 = Variable(input3.clone().data, requires_grad=True)
    timings.append(test_WeightedFlowProjectionModule(input1, input2, input3))

ftimes = [forward_time for forward_time, _ in timings]
btimes = [backward_time for _, backward_time in timings]

print("GPU Forward and backward time is : " + str(numpy.mean(ftimes)) + "s\t" + str(numpy.mean(btimes)) + "s\t\n\n\n\n")


================================================
FILE: networks/DAIN.py
================================================
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from my_package.FilterInterpolation import  FilterInterpolationModule
from my_package.FlowProjection import  FlowProjectionModule #,FlowFillholeModule
from my_package.DepthFlowProjection import DepthFlowProjectionModule

from Stack import Stack

import PWCNet
import S2D_models
import Resblock
import MegaDepth
import time

class DAIN(torch.nn.Module):
    """Frame interpolation network built from flow, depth, context and filter sub-networks.

    Given two frames, ``forward`` estimates flow in both directions
    (``flownets``), a log-depth map per frame (``depthNet``), context
    features (``ctxNet``) and per-pixel interpolation filters
    (``initScaleNets_filter*``).  The flows are projected with the inverse
    depth, both frames and their context features are warped with
    ``FilterInterpolationModule``, and ``rectifyNet`` predicts a residual
    that is added to the averaged warped frames.
    """

    def __init__(self,
                 channel = 3,
                 filter_size = 4,
                 timestep=0.5,
                 training=True):
        """Create all sub-networks.

        :param channel: channels per frame; the filter network's first
            convolution takes ``2 * channel`` input channels
        :param filter_size: side length of the interpolation filter; the
            filter branches output ``filter_size ** 2`` channels
        :param timestep: temporal position of the interpolated frame; only
            0.5 is accepted
        :param training: when truthy, ``flownets`` and ``depthNet`` are
            constructed with a file path argument (presumably a checkpoint
            to load -- confirm in PWCNet / MegaDepth) and ``forward``
            expects three frames instead of two
        """
        # base class initialization
        super(DAIN, self).__init__()

        self.filter_size = filter_size
        self.training = training
        self.timestep = timestep
        # TODO: or else the WeightedFlowProjection should also be revised.
        assert timestep == 0.5, "DAIN only supports timestep == 0.5"
        self.numFrames = int(1.0 / timestep) - 1

        self.initScaleNets_filter, self.initScaleNets_filter1, self.initScaleNets_filter2 = \
            self.get_MonoNet5(channel, filter_size * filter_size, "filter")

        self.ctxNet = S2D_models.__dict__['S2DF_3dense']()
        self.ctx_ch = 3 * 64 + 3

        # Channel budget of the tensor concatenated in forward().
        # NOTE(review): 16*2 equals 2 * filter_size**2 only for the default
        # filter_size of 4.
        self.rectifyNet = Resblock.__dict__['MultipleBasicBlock_4'](3 + 3 + 3 +2*1+ 2*2 +16*2+ 2 * self.ctx_ch,128)

        # Must run before flownets / depthNet are attached so that their
        # weights are not re-initialized by the loop over self.modules().
        self._initialize_weights()

        if self.training:
            self.flownets = PWCNet.__dict__['pwc_dc_net']("PWCNet/pwc_net.pth.tar")
        else:
            self.flownets = PWCNet.__dict__['pwc_dc_net']()
        self.div_flow = 20.0

        # extract depth information
        if self.training:
            self.depthNet = MegaDepth.__dict__['HourGlass']("MegaDepth/checkpoints/test_local/best_generalization_net_G.pth")
        else:
            self.depthNet = MegaDepth.__dict__['HourGlass']()

    def _initialize_weights(self):
        """Initialize every Conv2d / BatchNorm2d / Linear currently registered on this module.

        Conv2d weights get Xavier-uniform values and zero bias, BatchNorm2d
        gets weight 1 and bias 0, Linear gets N(0, 0.01) weights and zero bias.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, input):
        """Interpolate the middle frame between the first and last input frames.

        Parameters
        ----------
        input: tensor whose first dimension indexes the frames.  In training
            mode it must hold 3 frames (first, ground-truth middle, last);
            otherwise 2 frames (first, last).  Frames are concatenated along
            dim 1 internally, so each frame is presumably
            (batch, 3, height, width) -- confirm against the data loader.

        Returns
        -------
        In training mode: ``(losses, offsets, filters, occlusions)`` where
        ``losses`` holds the residuals of the coarse and the rectified
        output against the middle frame, ``offsets`` / ``filters`` each hold
        one list with the two projected flows / two filter maps, and
        ``occlusions`` is always empty.
        Otherwise: ``([coarse_output, rectified_output], flows, filters)``.
        """
        losses = []
        offsets = []
        filters = []
        occlusions = []

        # Both handles refer to the current stream, so the two "with" blocks
        # below run sequentially; separate prioritized streams are disabled.
        s1 = torch.cuda.current_stream()
        s2 = torch.cuda.current_stream()

        # STEP 1: split the input into frames (iteration runs over dim 0).
        if self.training:
            assert input.size(0) == 3
            cur_input_0, cur_input_1, cur_input_2 = input
        else:
            assert input.size(0) == 2
            cur_input_0, cur_input_2 = input
            cur_input_1 = None

        # STEP 3.2: concatenate both frames along the channel dimension.
        cur_offset_input = torch.cat((cur_input_0, cur_input_2), dim=1)
        cur_filter_input = cur_offset_input

        # STEP 3.3: run the estimation sub-networks.
        time_offsets = [kk * self.timestep for kk in range(1, 1 + self.numFrames, 1)]
        frame_a = cur_filter_input[:, :3, ...]
        frame_b = cur_filter_input[:, 3:, ...]
        batch = cur_filter_input.size(0)

        with torch.cuda.stream(s1):
            # One depth pass over both frames stacked along the batch dim.
            temp = self.depthNet(torch.cat((frame_a, frame_b), dim=0))
            log_depth = [temp[:batch], temp[batch:]]

            # Context features with the (detached) log-depth appended.
            cur_ctx_output = [
                torch.cat((self.ctxNet(frame_a), log_depth[0].detach()), dim=1),
                torch.cat((self.ctxNet(frame_b), log_depth[1].detach()), dim=1),
            ]

            # Shared trunk followed by one output branch per frame.
            temp = self.forward_singlePath(self.initScaleNets_filter, cur_filter_input, 'filter')
            cur_filter_output = [
                self.forward_singlePath(self.initScaleNets_filter1, temp, name=None),
                self.forward_singlePath(self.initScaleNets_filter2, temp, name=None),
            ]

            depth_inv = [1e-6 + 1 / torch.exp(d) for d in log_depth]

        with torch.cuda.stream(s2):
            # Flow from the first to the last frame, and the reverse.
            reversed_input = torch.cat((cur_offset_input[:, 3:, ...],
                                        cur_offset_input[:, 0:3, ...]), dim=1)
            cur_offset_outputs = [
                self.forward_flownets(self.flownets, cur_offset_input, time_offsets=time_offsets),
                self.forward_flownets(self.flownets, reversed_input, time_offsets=time_offsets[::-1]),
            ]

        torch.cuda.synchronize()  # synchronize s1 and s2

        cur_offset_outputs = [
            self.FlowProject(cur_offset_outputs[0], depth_inv[0]),
            self.FlowProject(cur_offset_outputs[1], depth_inv[1]),
        ]

        # STEP 3.4: perform the frame interpolation.
        cur_offset_output = [cur_offset_outputs[0][0], cur_offset_outputs[1][0]]
        ctx0, ctx2 = self.FilterInterpolate_ctx(cur_ctx_output[0], cur_ctx_output[1],
                                                cur_offset_output, cur_filter_output)

        cur_output, ref0, ref2 = self.FilterInterpolate(cur_input_0, cur_input_2,
                                                        cur_offset_output, cur_filter_output,
                                                        self.filter_size**2)

        rectify_input = torch.cat((cur_output, ref0, ref2,
                                   cur_offset_output[0], cur_offset_output[1],
                                   cur_filter_output[0], cur_filter_output[1],
                                   ctx0, ctx2),
                                  dim=1)
        cur_output_rectified = self.rectifyNet(rectify_input) + cur_output

        # STEP 4: return the results.
        if not self.training:
            return [cur_output, cur_output_rectified], cur_offset_output, cur_filter_output

        # STEP 3.5: in training, collect the quantities to be penalized.
        losses += [cur_output - cur_input_1]
        losses += [cur_output_rectified - cur_input_1]
        offsets += [cur_offset_output]
        filters += [cur_filter_output]
        return losses, offsets, filters, occlusions

    def forward_flownets(self, model, input, time_offsets = None):
        """Run a flow network and scale its output for each requested time offset.

        :param model: callable producing a single-direction flow from ``input``
        :param input: tensor passed to ``model`` unchanged
        :param time_offsets: ``None`` (treated as ``[0.5]``), a single number,
            or an iterable of numbers
        :return: list with one tensor per offset: the model output multiplied
            by ``self.div_flow`` and the offset, then bilinearly upsampled
            by a factor of 4
        """
        if time_offsets is None:
            time_offsets = [0.5]
        elif isinstance(time_offsets, (int, float)):
            # A bare scalar (including ints and numpy floats) means one offset.
            time_offsets = [time_offsets]

        flow = model(input)  # this is a single direction motion result, not a bidirectional one

        # One stateless upsampler is enough for all offsets.
        upsample = nn.Upsample(scale_factor=4, mode='bilinear')
        return [upsample(self.div_flow * flow * time_offset) for time_offset in time_offsets]

    def forward_singlePath(self, modulelist, input, name):
        """Apply a flat list of layers with stack-based skip connections.

        The input of every pooling layer is pushed on a stack; after every
        ``nn.Upsample`` the most recent entry is popped and merged with the
        upsampled tensor (concatenated along dim 1 when ``name == 'offset'``,
        added in place otherwise).

        :param modulelist: iterable of layers applied in order
        :param input: tensor fed to the first layer
        :param name: ``'offset'`` selects concatenation for the skip
            connections; any other value selects addition
        :return: output of the last layer, or an empty list when
            ``modulelist`` is empty
        """
        stack = Stack()
        temp = []
        for index, layers in enumerate(modulelist):
            if index == 0:
                # The first layer consumes the raw input and never takes part
                # in the push / pop logic.
                temp = layers(input)
                continue

            # met a pooling layer, remember its input
            if isinstance(layers, (nn.AvgPool2d, nn.MaxPool2d)):
                stack.push(temp)

            temp = layers(temp)

            # met an unpooling layer, merge its output with the saved tensor
            if isinstance(layers, nn.Upsample):
                if name == 'offset':
                    temp = torch.cat((temp, stack.pop()), dim=1)
                else:
                    temp += stack.pop()
        return temp

    def get_MonoNet5(self, channel_in, channel_out, name):
        """Build the encoder-decoder trunk and its two output branches.

        The trunk is one conv+ReLU, five conv+ReLU+max-pool blocks
        (16 -> 512 channels), one conv+ReLU at 512 channels and five
        upsample+conv+ReLU blocks (512 -> 16 channels).  Each branch is
        conv+ReLU+conv mapping 16 channels to ``channel_out``.

        :param channel_in: channels of one frame; the first convolution takes
            ``channel_in * 2`` input channels
        :param channel_out: number of output channels of each branch
        :param name: unused here; kept for interface compatibility
        :return: ``(trunk, branch1, branch2)`` as flat ``nn.ModuleList`` objects
        """
        kernel, padding = (3, 3), (1, 1)
        encoder_widths = ((16, 32), (32, 64), (64, 128), (128, 256), (256, 512))
        decoder_widths = ((512, 256), (256, 128), (128, 64), (64, 32), (32, 16))

        # "+=" with an nn.Sequential appends its child layers one by one, so
        # the resulting list is flat (forward_singlePath relies on that).
        model = []
        model += self.conv_relu(channel_in * 2, 16, kernel, padding)
        for width_in, width_out in encoder_widths:
            model += self.conv_relu_maxpool(width_in, width_out, kernel, padding, (2, 2))

        # intermediate block at the bottleneck
        model += self.conv_relu(512, 512, kernel, padding)

        for width_in, width_out in decoder_widths:
            model += self.conv_relu_unpool(width_in, width_out, kernel, padding, 2)

        # output heads
        branch1 = list(self.conv_relu_conv(16, channel_out, kernel, padding))
        branch2 = list(self.conv_relu_conv(16, channel_out, kernel, padding))

        return (nn.ModuleList(model), nn.ModuleList(branch1), nn.ModuleList(branch2))

    @staticmethod
    def FlowProject(inputs, depth = None):
        """Project every flow in ``inputs``.

        Uses ``DepthFlowProjectionModule`` when ``depth`` is given and
        ``FlowProjectionModule`` otherwise; each module is constructed with
        the flow's ``requires_grad`` flag.

        :param inputs: iterable of flow tensors
        :param depth: optional tensor passed as second argument to the
            depth-aware projection
        :return: list of projected flows, one per input
        """
        if depth is not None:
            return [DepthFlowProjectionModule(flow.requires_grad)(flow, depth) for flow in inputs]
        return [FlowProjectionModule(flow.requires_grad)(flow) for flow in inputs]

    @staticmethod
    def FilterInterpolate_ctx(ctx0,ctx2,offset,filter):
        """Warp both context feature maps with detached flows and filters.

        Detaching means gradients reach the context features only, not the
        flow or filter estimates.

        :return: ``(warped_ctx0, warped_ctx2)``
        """
        ctx0_offset = FilterInterpolationModule()(ctx0, offset[0].detach(), filter[0].detach())
        ctx2_offset = FilterInterpolationModule()(ctx2, offset[1].detach(), filter[1].detach())
        return ctx0_offset, ctx2_offset

    @staticmethod
    def FilterInterpolate(ref0, ref2, offset, filter,filter_size2):
        """Warp both reference frames and average them.

        :param filter_size2: unused; kept for interface compatibility
        :return: ``(average, warped_ref0, warped_ref2)``
        """
        ref0_offset = FilterInterpolationModule()(ref0, offset[0], filter[0])
        ref2_offset = FilterInterpolationModule()(ref2, offset[1], filter[1])
        return ref0_offset/2.0 + ref2_offset/2.0, ref0_offset, ref2_offset

    @staticmethod
    def conv_relu_conv(input_filter, output_filter, kernel_size,
                        padding):
        """Return Conv2d(in -> in) + ReLU + Conv2d(in -> out), all with stride 1."""
        return nn.Sequential(
            nn.Conv2d(input_filter, input_filter, kernel_size, 1, padding),
            nn.ReLU(inplace=False),
            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),
        )

    @staticmethod
    def conv_relu(input_filter, output_filter, kernel_size,
                        padding):
        """Return Conv2d(in -> out, stride 1) + ReLU."""
        return nn.Sequential(
            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),
            nn.ReLU(inplace=False),
        )

    @staticmethod
    def conv_relu_maxpool(input_filter, output_filter, kernel_size,
                            padding,kernel_size_pooling):
        """Return Conv2d(in -> out, stride 1) + ReLU + MaxPool2d."""
        return nn.Sequential(
            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),
            nn.ReLU(inplace=False),
            nn.MaxPool2d(kernel_size_pooling),
        )

    @staticmethod
    def conv_relu_unpool(input_filter, output_filter, kernel_size,
                            padding,unpooling_factor):
        """Return bilinear Upsample + Conv2d(in -> out, stride 1) + ReLU."""
        return nn.Sequential(
            nn.Upsample(scale_factor=unpooling_factor, mode='bilinear'),
            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),
            nn.ReLU(inplace=False),
        )


================================================
FILE: networks/DAIN_slowmotion.py
================================================
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from my_package.FilterInterpolation import  FilterInterpolationModule
from my_package.FlowProjection import  FlowProjectionModule #,FlowFillholeModule
from my_package.DepthFlowProjection import DepthFlowProjectionModule

from Stack import Stack

import PWCNet
import S2D_models
import Resblock
import MegaDepth
import time

class DAIN_slowmotion(torch.nn.Module):
    def __init__(self,
                 channel = 3,
                 filter_size = 4,
                 timestep=0.5,
                 training=True):
        """
        Assemble the sub-networks used for slow-motion interpolation.

        :param channel: channel count handed to get_MonoNet5 for the filter network
        :param filter_size: side length of the interpolation filter; the filter
                            branches emit filter_size * filter_size channels
        :param timestep: spacing between interpolated time offsets;
                         int(1.0 / timestep) - 1 frames are interpolated
        :param training: when true, the flow and depth networks are built from
                         the checkpoint paths below; otherwise they are built
                         without arguments
        """
        # base class initialization
        super(DAIN_slowmotion, self).__init__()

        self.filter_size = filter_size
        self.training = training
        self.timestep = timestep
        self.numFrames = int(1.0 / timestep) - 1
        print("Interpolate {} frames".format(self.numFrames))

        # shared filter trunk plus its two output branches
        filter_channels = filter_size * filter_size
        filter_nets = self.get_MonoNet5(channel, filter_channels, "filter")
        (self.initScaleNets_filter,
         self.initScaleNets_filter1,
         self.initScaleNets_filter2) = filter_nets

        self.ctxNet = S2D_models.__dict__['S2DF_3dense']()
        self.ctx_ch = 3 * 64 + 3

        # NOTE(review): the summands presumably mirror the tensors concatenated
        # into rectify_input in forward() - confirm before changing any of them.
        rectify_in_channels = 3 + 3 + 3 + 2 * 1 + 2 * 2 + 16 * 2 + 2 * self.ctx_ch
        self.rectifyNet = Resblock.__dict__['MultipleBasicBlock_4'](rectify_in_channels, 128)

        # Runs before the flow and depth networks are attached, so it only
        # touches the modules registered above.
        self._initialize_weights()

        flow_args = ("PWCNet/pwc_net.pth.tar",) if self.training else ()
        self.flownets = PWCNet.__dict__['pwc_dc_net'](*flow_args)
        self.div_flow = 20.0

        # extract depth information
        depth_args = ()
        if self.training:
            depth_args = ("MegaDepth/checkpoints/test_local/best_generalization_net_G.pth",)
        self.depthNet = MegaDepth.__dict__['HourGlass'](*depth_args)

    def _initialize_weights(self):
        count = 0
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                # m.weight.data.normal_(0, math.sqrt(2. / n))
                # print(m)
                count+=1
                # print(count)
                # weight_init.xavier_uniform(m.weight.data)
                nn.init.xavier_uniform_(m.weight.data)
                # weight_init.kaiming_uniform(m.weight.data, a = 0, mode='fan_in')
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
            # else:
            #     print(m)


    def forward(self, input):

        """
        Interpolate the intermediate frame(s) between two input frames.

        Parameters
        ----------
        input: frames stacked along dim 0
            (original note: shape (3, batch, 3, width, height)).
            In training mode it must hold 3 entries: first frame, target
            intermediate frame, last frame. Otherwise it must hold 2 entries:
            first frame and last frame.

        Returns
        -------
        training mode: (losses, offsets, filters, occlusions) where losses is
            [coarse output - target, rectified output - target], offsets and
            filters each hold one [first, last] pair, and occlusions is empty.
        otherwise: ([cur_output, cur_output_rectified], cur_offset_output,
            cur_filter_output); the two inner lists hold one tensor per
            interpolated time offset and cur_offset_output is the projected
            flow pair of the last time offset.

        Raises
        ------
        ValueError: in training mode when self.numFrames != 1. Only one target
            frame is supplied, so the loss is only defined for one output frame.
        -----------
        """
        losses = []
        offsets= []
        filters = []
        occlusions = []

        # Both handles refer to the current stream, so the two "with" sections
        # below run back to back on the same stream.
        # s1 = torch.cuda.Stream(device=device, priority=5)
        # s2 = torch.cuda.Stream(device=device, priority=10) #PWC-Net is slow, need to have higher priority
        s1 = torch.cuda.current_stream()
        s2 = torch.cuda.current_stream()

        # STEP 1: split the stacked input into the individual frames
        if self.training:
            assert input.size(0) == 3
            if self.numFrames != 1:
                # Fail before any network is evaluated: the per-frame outputs
                # are collected in lists and there is a single target frame.
                raise ValueError(
                    "Training supplies a single target frame, so exactly one frame "
                    "must be interpolated (got numFrames = " + str(self.numFrames) + ")")
            input_0,input_1,input_2 = torch.squeeze(input,dim=0)
        else:
            assert input.size(0) ==2
            input_0,input_2 = torch.squeeze(input,dim=0)

        # prepare the input data of current scale
        cur_input_0 = input_0
        if self.training:
            cur_input_1 = input_1
        cur_input_2 =  input_2

        # STEP 3.2: concatenate both frames along the channel dimension
        cur_offset_input = torch.cat((cur_input_0, cur_input_2), dim=1)
        cur_filter_input = cur_offset_input # torch.cat((cur_input_0, cur_input_2), dim=1)

        # STEP 3.3: run the depth, context, filter and flow sub-networks
        time_offsets = [ kk * self.timestep for kk in range(1, 1+self.numFrames,1)]

        with torch.cuda.stream(s1):
            # Both frames go through the depth network as one batch and are
            # split again afterwards.
            temp  = self.depthNet(torch.cat((cur_filter_input[:, :3, ...],
                                             cur_filter_input[:, 3:, ...]),dim=0))
            log_depth = [temp[:cur_filter_input.size(0)], temp[cur_filter_input.size(0):]]

            # Context features of each frame with its (detached) depth map appended.
            cur_ctx_output = [
                torch.cat((self.ctxNet(cur_filter_input[:, :3, ...]),
                       log_depth[0].detach()), dim=1),
                    torch.cat((self.ctxNet(cur_filter_input[:, 3:, ...]),
                   log_depth[1].detach()), dim=1)
                    ]
            # Shared filter trunk followed by one branch per input frame.
            temp = self.forward_singlePath(self.initScaleNets_filter, cur_filter_input, 'filter')
            cur_filter_output = [self.forward_singlePath(self.initScaleNets_filter1, temp, name=None),
                             self.forward_singlePath(self.initScaleNets_filter2, temp, name=None)]

            # The small constant keeps the inverse depth strictly positive.
            depth_inv = [1e-6 + 1 / torch.exp(d) for d in log_depth]

        with torch.cuda.stream(s2):
            # Flow from the first to the last frame and, with the frames
            # swapped, in the opposite direction scaled by the complementary
            # time offsets.
            cur_offset_outputs = [
                    self.forward_flownets(self.flownets, cur_offset_input, time_offsets=time_offsets),
                    self.forward_flownets(self.flownets, torch.cat((cur_offset_input[:, 3:, ...],
                                        cur_offset_input[:, 0:3, ...]), dim=1),
                              time_offsets=[1 - t for t in time_offsets])
                    ]

        torch.cuda.synchronize() #synchronize s1 and s2

        cur_offset_outputs = [
            self.FlowProject(cur_offset_outputs[0],depth_inv[0]),
            self.FlowProject(cur_offset_outputs[1],depth_inv[1])
                ]

        # STEP 3.4: perform the frame interpolation, once per time offset
        cur_output_rectified = []
        cur_output = []

        for temp_0,temp_1, timeoffset in zip(cur_offset_outputs[0], cur_offset_outputs[1], time_offsets):
            cur_offset_output = [temp_0,temp_1] #[cur_offset_outputs[0][0], cur_offset_outputs[1][0]]
            ctx0,ctx2 = self.FilterInterpolate_ctx(cur_ctx_output[0],cur_ctx_output[1],
                               cur_offset_output,cur_filter_output, timeoffset)


            cur_output_temp ,ref0,ref2 = self.FilterInterpolate(cur_input_0, cur_input_2,cur_offset_output,
                                          cur_filter_output,self.filter_size**2, timeoffset)
            cur_output.append(cur_output_temp)

            rectify_input = torch.cat((cur_output_temp,ref0,ref2,
                                        cur_offset_output[0],cur_offset_output[1],
                                        cur_filter_output[0],cur_filter_output[1],
                                        ctx0,ctx2
                                        ),dim =1)
            # The rectification network predicts a residual on top of the blend.
            cur_output_rectified_temp = self.rectifyNet(rectify_input) + cur_output_temp
            cur_output_rectified.append(cur_output_rectified_temp)

        # STEP 3.5: for training phase, we collect the variables to be penalized.
        if self.training:
            # cur_output / cur_output_rectified are lists; exactly one entry
            # exists here (checked above), so compare that tensor to the target.
            losses += [cur_output[0] - cur_input_1]
            losses += [cur_output_rectified[0] - cur_input_1]
            offsets += [cur_offset_output]
            filters += [cur_filter_output]

        # STEP 4: return the results
        if self.training:
            # if in the training phase, we output the losses to be minimized.
            return losses, offsets,filters,occlusions

        cur_outputs = [cur_output,cur_output_rectified]
        return cur_outputs,cur_offset_output,cur_filter_output

    def forward_flownets(self, model, input, time_offsets = None):

        if time_offsets == None :
            time_offsets = [0.5]
        elif type(time_offsets) == float:
            time_offsets = [time_offsets]
        elif type(time_offsets) == list:
            pass
        temp = model(input)  # this is a single direction motion results, but not a bidirectional one

        temps = [self.div_flow * temp * time_offset for time_offset in time_offsets]# single direction to bidirection should haven it.
        temps = [nn.Upsample(scale_factor=4, mode='bilinear')(temp)  for temp in temps]# nearest interpolation won't be better i think
        return temps

    '''keep this function'''
    def forward_singlePath(self, modulelist, input, name):
        """
        Run input through modulelist, wiring skip connections around pooling.

        The tensor entering each pooling layer (after the first layer) is
        pushed onto a stack; after each upsampling layer the most recent entry
        is popped and merged into the upsampled tensor.

        :param modulelist: iterable of layers applied in order
        :param input: tensor fed to the first layer
        :param name: 'offset' concatenates the skip tensor along dim 1; any
                     other value adds it in place

        :return: output of the last layer (an empty list when modulelist is empty)
        """
        skip_tensors = Stack()
        features = []

        for position, layer in enumerate(modulelist):
            if position == 0:
                # The first layer only consumes the raw input; no skip handling.
                features = layer(input)
                continue

            # met a pooling layer, remember its input
            if isinstance(layer, (nn.AvgPool2d, nn.MaxPool2d)):
                skip_tensors.push(features)

            features = layer(features)

            # met an unpooling layer, merge the remembered tensor into its output
            if not isinstance(layer, nn.Upsample):
                continue
            if name == 'offset':
                features = torch.cat((features, skip_tensors.pop()), dim=1)
            else:
                features += skip_tensors.pop()

        return features

    '''keep this funtion'''
    def get_MonoNet5(self, channel_in, channel_out, name):

        '''
        Generally, the MonoNet is aimed to provide a basic module for generating either offset, or filter, or occlusion.

        :param channel_in: number of channels that composed of multiple useful information like reference frame, previous coarser-scale result
        :param channel_out: number of output the offset or filter or occlusion
        :param name: to distinguish between offset, filter and occlusion, since they should use different activations in the last network layer

        :return: output the network model
        '''
        model = []

        # block1
        model += self.conv_relu(channel_in * 2, 16, (3, 3), (1, 1))
        model += self.conv_relu_maxpool(16, 32, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.5
        # block2
        model += self.conv_relu_maxpool(32, 64, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.4
        # block3
        model += self.conv_relu_maxpool(64, 128, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.3
        # block4
        model += self.conv_relu_maxpool(128, 256, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.2
        # block5
        model += self.conv_relu_maxpool(256, 512, (3, 3), (1, 1), (2, 2))

        # intermediate block5_5
        model += self.conv_relu(512, 512, (3, 3), (1, 1))

        # block 6
        model += self.conv_relu_unpool(512, 256, (3, 3), (1, 1), 2)  # THE OUTPUT No.1 UP
        # block 7
        model += self.conv_relu_unpool(256, 128, (3, 3), (1, 1), 2)  # THE OUTPUT No.2 UP
        # block 8
        model += self.conv_relu_unpool(128, 64, (3, 3), (1, 1), 2)  # THE OUTPUT No.3 UP

        # block 9
        model += self.conv_relu_unpool(64, 32, (3, 3), (1, 1), 2)  # THE OUTPUT No.4 UP

        # block 10
        model += self.conv_relu_unpool(32,  16, (3, 3), (1, 1), 2)  # THE OUTPUT No.5 UP

        # output our final purpose
        branch1 = []
        branch2 = []
        branch1 += self.conv_relu_conv(16, channel_out,  (3, 3), (1, 1))
        branch2 += self.conv_relu_conv(16, channel_out,  (3, 3), (1, 1))

        return  (nn.ModuleList(model), nn.ModuleList(branch1), nn.ModuleList(branch2))

    '''keep this function'''
    @staticmethod
    def FlowProject(inputs, depth = None):
        if depth is not None:
            outputs = [DepthFlowProjectionModule(input.requires_grad)(input,depth) for input in inputs]
        else:
            outputs = [ FlowProjectionModule(input.requires_grad)(input) for input in inputs]
        return outputs


    '''keep this function'''
    @staticmethod
    def FilterInterpolate_ctx(ctx0,ctx2,offset,filter, timeoffset):
        """
        Warp both context feature maps with detached flows and filters.

        :param ctx0: context features of the first frame
        :param ctx2: context features of the last frame
        :param offset: pair of flows, indexed 0 (first frame) and 1 (last frame)
        :param filter: pair of filters, indexed like offset
        :param timeoffset: not used by this method

        :return: (warped ctx0, warped ctx2)
        """
        ##TODO: which way should I choose

        def warp(context, index):
            # Flow and filter are detached, so gradients from the warped
            # context only reach the context tensor itself.
            return FilterInterpolationModule()(context,
                                               offset[index].detach(),
                                               filter[index].detach())

        return warp(ctx0, 0), warp(ctx2, 1)
        # ctx0_offset = FilterInterpolationModule()(ctx0.detach(), offset[0], filter[0])
        # ctx2_offset = FilterInterpolationModule()(ctx2.detach(), offset[1], filter[1])
        #
        # return ctx0_offset, ctx2_offset
    '''Keep this function'''
    @staticmethod
    def FilterInterpolate(ref0, ref2, offset, filter,filter_size2, time_offset):
        """
        Warp both reference frames and blend them linearly by time_offset.

        :param ref0: first reference frame
        :param ref2: last reference frame
        :param offset: pair of flows, indexed 0 (first frame) and 1 (last frame)
        :param filter: pair of filters, indexed like offset
        :param filter_size2: not used by this method
        :param time_offset: blending weight of the warped last frame; the
                            warped first frame gets 1.0 - time_offset

        :return: (blended frame, warped ref0, warped ref2)
        """
        warped_ref0 = FilterInterpolationModule()(ref0, offset[0], filter[0])
        warped_ref2 = FilterInterpolationModule()(ref2, offset[1], filter[1])

        blended = warped_ref0*(1.0 - time_offset) + warped_ref2*(time_offset)
        return blended, warped_ref0, warped_ref2

    '''keep this function'''
    @staticmethod
    def conv_relu_conv(input_filter, output_filter, kernel_size,
                        padding):

        # we actually don't need to use so much layer in the last stages.
        layers = nn.Sequential(
            nn.Conv2d(input_filter, input_filter, kernel_size, 1, padding),
            nn.ReLU(inplace=False),
            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),
            # nn.ReLU(inplace=False),
            # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding),
            # nn.ReLU(inplace=False),
            # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding),
        )
        return layers


    '''keep this fucntion'''
    @staticmethod
    def conv_relu(input_filter, output_filter, kernel_size,
                        padding):
        layers = nn.Sequential(*[
            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),

            nn.ReLU(inplace=False)
        ])
        return layers

    '''keep this function'''
    @staticmethod
    def conv_relu_maxpool(input_filter, output_filter, kernel_size,
                            padding,kernel_size_pooling):

        layers = nn.Sequential(*[
            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),

            nn.ReLU(inplace=False),

            # nn.BatchNorm2d(output_filter),

            nn.MaxPool2d(kernel_size_pooling)
        ])
        return layers

    '''klkeep this function'''
    @staticmethod
    def conv_relu_unpool(input_filter, output_filter, kernel_size,
                            padding,unpooling_factor):

        layers = nn.Sequential(*[

            nn.Upsample(scale_factor=unpooling_factor, mode='bilinear'),

            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),

            nn.ReLU(inplace=False),

            # nn.BatchNorm2d(output_filter),


            # nn.UpsamplingBilinear2d(unpooling_size,scale_factor=unpooling_size[0])
        ])
        return layers

================================================
FILE: networks/__init__.py
================================================
from .DAIN import DAIN
from .DAIN_slowmotion import DAIN_slowmotion
# Public names of this package: the two network classes imported above.
__all__ = (
           'DAIN',
           'DAIN_slowmotion'
)


================================================
FILE: train.py
================================================
import sys
import os

import threading
import torch
from torch.autograd import Variable
import torch.utils.data
from lr_scheduler import *

import numpy
from AverageMeter import  *
from loss_function import *
import datasets
import balancedsampler
import networks
from my_args import args


def train():
    """Train the network named by ``args.netName`` and validate after every epoch.

    Everything is configured through the module-level ``args`` object. The
    function:

    1. builds the model (optionally moving it to CUDA and loading matching
       entries from a saved state dict),
    2. builds the training and validation data loaders,
    3. creates an Adamax optimizer with one parameter group per sub-network
       and a ReduceLROnPlateau scheduler driven by the validation loss,
    4. runs ``args.numEpoch`` epochs; after each one it saves
       ``epoch<t>.pth`` (removing the previous epoch file), validates, appends
       a row to the log file ``args.log`` and overwrites ``best.pth`` whenever
       the validation loss did not increase.

    Returns nothing; results are written to ``args.save_path`` and ``args.log``.
    """
    torch.manual_seed(args.seed)

    model = networks.__dict__[args.netName](channel=args.channels,
                            filter_size = args.filter_size ,
                            timestep=args.time_step,
                            training=True)
    if args.use_cuda:
        print("Turn the model into CUDA")
        model = model.cuda()

    if args.SAVED_MODEL is not None:
        # NOTE(review): any value passed in only acts as a switch; the weights
        # are always read from this fixed path.
        # args.SAVED_MODEL ='../model_weights/'+ args.SAVED_MODEL + "/best" + ".pth"
        args.SAVED_MODEL ='./model_weights/best.pth'
        print("Fine tuning on " +  args.SAVED_MODEL)
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        if not  args.use_cuda:
            pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)
        else:
            pretrained_dict = torch.load(args.SAVED_MODEL)

        model_dict = model.state_dict()
        # 1. filter out unnecessary keys
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # 3. load the new state dict
        model.load_state_dict(model_dict)
        pretrained_dict = None

    if isinstance(args.datasetName, list):
        # Several datasets: build each one and concatenate the splits.
        train_sets, test_sets = [],[]
        for ii, jj in zip(args.datasetName, args.datasetPath):
            tr_s, te_s = datasets.__dict__[ii](jj, split = args.dataset_split,single = args.single_output, task = args.task)
            train_sets.append(tr_s)
            test_sets.append(te_s)
        train_set = torch.utils.data.ConcatDataset(train_sets)
        test_set = torch.utils.data.ConcatDataset(test_sets)
    else:
        train_set, test_set = datasets.__dict__[args.datasetName](args.datasetPath)

    # Number of full batches per epoch; a trailing partial batch is not used.
    steps_per_epoch = int(len(train_set) / args.batch_size )

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size = args.batch_size,
        sampler=balancedsampler.RandomBalancedSampler(train_set, steps_per_epoch),
        num_workers= args.workers, pin_memory=True if args.use_cuda else False)

    val_loader = torch.utils.data.DataLoader(test_set, batch_size=args.batch_size,
                                             num_workers=args.workers, pin_memory=True if args.use_cuda else False)
    print('{} samples found, {} train samples and {} test samples '.format(len(test_set)+len(train_set),
                                                                           len(train_set),
                                                                           len(test_set)))


    # if not args.lr == 0:
    print("train the interpolation net")
    # One parameter group per sub-network so each can use its own learning rate.
    optimizer = torch.optim.Adamax([
                {'params': model.initScaleNets_filter.parameters(), 'lr': args.filter_lr_coe * args.lr},
                {'params': model.initScaleNets_filter1.parameters(), 'lr': args.filter_lr_coe * args.lr},
                {'params': model.initScaleNets_filter2.parameters(), 'lr': args.filter_lr_coe * args.lr},
                {'params': model.ctxNet.parameters(), 'lr': args.ctx_lr_coe * args.lr},
                {'params': model.flownets.parameters(), 'lr': args.flow_lr_coe * args.lr},
                {'params': model.depthNet.parameters(), 'lr': args.depth_lr_coe * args.lr},
                {'params': model.rectifyNet.parameters(), 'lr': args.rectify_lr}
            ],
                lr=args.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=args.weight_decay)


    scheduler = ReduceLROnPlateau(optimizer, 'min',factor=args.factor, patience=args.patience,verbose=True)

    print("*********Start Training********")
    print("LR is: "+ str(float(optimizer.param_groups[0]['lr'])))
    print("EPOCH is: "+ str(steps_per_epoch))
    print("Num of EPOCH is: "+ str(args.numEpoch))
    def count_network_parameters(model):
        """Return the number of scalar entries in the trainable parameters of model."""
        parameters = filter(lambda p: p.requires_grad, model.parameters())
        N = sum([numpy.prod(p.size()) for p in parameters])

        return N
    print("Num. of model parameters is :" + str(count_network_parameters(model)))
    if hasattr(model,'flownets'):
        print("Num. of flow model parameters is :" +
              str(count_network_parameters(model.flownets)))
    if hasattr(model,'initScaleNets_occlusion'):
        print("Num. of initScaleNets_occlusion model parameters is :" +
              str(count_network_parameters(model.initScaleNets_occlusion) +
                  count_network_parameters(model.initScaleNets_occlusion1) +
        count_network_parameters(model.initScaleNets_occlusion2)))
    if hasattr(model,'initScaleNets_filter'):
        print("Num. of initScaleNets_filter model parameters is :" +
              str(count_network_parameters(model.initScaleNets_filter) +
                  count_network_parameters(model.initScaleNets_filter1) +
        count_network_parameters(model.initScaleNets_filter2)))
    if hasattr(model, 'ctxNet'):
        print("Num. of ctxNet model parameters is :" +
              str(count_network_parameters(model.ctxNet)))
    if hasattr(model, 'depthNet'):
        print("Num. of depthNet model parameters is :" +
              str(count_network_parameters(model.depthNet)))
    if hasattr(model,'rectifyNet'):
        print("Num. of rectifyNet model parameters is :" +
              str(count_network_parameters(model.rectifyNet)))

    training_losses = AverageMeter()
    auxiliary_data = []
    saved_total_loss = 10e10

    # Parameter group whose learning rate is reported in the logs: the first
    # one with a positive rate, falling back to the first group so that
    # ikk['lr'] is always valid even when every rate is zero.
    ikk = optimizer.param_groups[0]
    for kk in optimizer.param_groups:
        if kk['lr'] > 0:
            ikk = kk
            break

    for t in range(args.numEpoch):
        print("The id of this in-training network is " + str(args.uid))
        print(args)
        #Turn into training mode
        model = model.train()

        for i, (X0_half,X1_half, y_half) in enumerate(train_loader):

            if i >= steps_per_epoch:
                #(0 if t == 0 else EPOCH):#
                break

            X0_half = X0_half.cuda() if args.use_cuda else X0_half
            X1_half = X1_half.cuda() if args.use_cuda else X1_half
            y_half = y_half.cuda() if args.use_cuda else y_half

            X0 = Variable(X0_half, requires_grad= False)
            X1 = Variable(X1_half, requires_grad= False)
            y  = Variable(y_half,requires_grad= False)

            # Frames are stacked as (first, target, last) along a new leading dim.
            diffs, offsets,filters,occlusions = model(torch.stack((X0,y,X1),dim = 0))

            pixel_loss, offset_loss, sym_loss = part_loss(diffs,offsets,occlusions, [X0,X1],epsilon=args.epsilon)

            # Weighted sum of the pixel losses; terms with a non-positive weight are skipped.
            total_loss = sum(x*y if x > 0 else 0 for x,y in zip(args.alpha, pixel_loss))

            training_losses.update(total_loss.item(), args.batch_size)
            # Print roughly 500 progress lines per epoch.
            if i % max(1, int(steps_per_epoch/500.0)) == 0:

                print("Ep [" + str(t) +"/" + str(i) +
                                    "]\tl.r.: " + str(round(float(ikk['lr']),7))+
                                    "\tPix: " + str([round(x.item(),5) for x in pixel_loss]) +
                                    "\tTV: " + str([round(x.item(),4)  for x in offset_loss]) +
                                    "\tSym: " + str([round(x.item(), 4) for x in sym_loss]) +
                                    "\tTotal: " + str([round(x.item(),5) for x in [total_loss]]) +
                                    "\tAvg. Loss: " + str([round(training_losses.avg, 5)]))

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

        # Keep only the most recent per-epoch checkpoint on disk.
        previous_epoch_path = args.save_path + "/epoch" + str(t-1) +".pth"
        if os.path.exists(previous_epoch_path):
            os.remove(previous_epoch_path)
        torch.save(model.state_dict(), args.save_path + "/epoch" + str(t) +".pth")

        # print("\t\t**************Start Validation*****************")
        # NOTE(review): the model is not switched to eval mode here; the call
        # below still passes three stacked frames and unpacks the four values
        # returned in training mode.

        val_total_losses = AverageMeter()
        val_total_pixel_loss = AverageMeter()
        val_total_PSNR_loss = AverageMeter()
        val_total_tv_loss = AverageMeter()
        val_total_pws_loss = AverageMeter()  # never updated below; logged as its initial average
        val_total_sym_loss = AverageMeter()

        for i, (X0,X1,y) in enumerate(val_loader):
            if i >=  int(len(test_set)/ args.batch_size):
                break

            with torch.no_grad():
                X0 = X0.cuda() if args.use_cuda else X0
                X1 = X1.cuda() if args.use_cuda else X1
                y = y.cuda() if args.use_cuda else y

                diffs, offsets,filters,occlusions = model(torch.stack((X0,y,X1),dim = 0))

                pixel_loss, offset_loss,sym_loss = part_loss(diffs, offsets, occlusions, [X0,X1],epsilon=args.epsilon)

                val_total_loss = sum(x * y for x, y in zip(args.alpha, pixel_loss))

                # Mean squared error of each sample over its three non-batch dims.
                per_sample_pix_error = torch.mean(torch.mean(torch.mean(diffs[args.save_which] ** 2,
                                                                    dim=1),dim=1),dim=1)
                per_sample_pix_error = per_sample_pix_error.data # extract tensor
                # PSNR = 20 * log10(1 / RMSE), averaged over the batch. log10 is
                # evaluated on the tensor's own device, so no CPU tensor is
                # mixed into a CUDA computation.
                psnr_loss = torch.mean(20 * torch.log10(1.0/torch.sqrt(per_sample_pix_error)))

                val_total_losses.update(val_total_loss.item(),args.batch_size)
                val_total_pixel_loss.update(pixel_loss[args.save_which].item(), args.batch_size)
                val_total_tv_loss.update(offset_loss[0].item(), args.batch_size)
                val_total_sym_loss.update(sym_loss[0].item(), args.batch_size)
                val_total_PSNR_loss.update(psnr_loss.item(),args.batch_size)
                print(".",end='',flush=True)

        print("\nEpoch " + str(int(t)) +
              "\tlearning rate: " + str(float(ikk['lr'])) +
              "\tAvg Training Loss: " + str(round(training_losses.avg,5)) +
              "\tValidate Loss: " + str([round(float(val_total_losses.avg), 5)]) +
              "\tValidate PSNR: " + str([round(float(val_total_PSNR_loss.avg), 5)]) +
              "\tPixel Loss: " + str([round(float(val_total_pixel_loss.avg), 5)]) +
              "\tTV Loss: " + str([round(float(val_total_tv_loss.avg), 4)]) +
              "\tPWS Loss: " + str([round(float(val_total_pws_loss.avg), 4)]) +
              "\tSym Loss: " + str([round(float(val_total_sym_loss.avg), 4)])
              )

        auxiliary_data.append([t, float(ikk['lr']),
                                   training_losses.avg, val_total_losses.avg, val_total_pixel_loss.avg,
                                   val_total_tv_loss.avg,val_total_pws_loss.avg,val_total_sym_loss.avg])

        # The whole history is rewritten every epoch.
        numpy.savetxt(args.log, numpy.array(auxiliary_data), fmt='%.8f', delimiter=',')
        training_losses.reset()

        print("\t\tFinished an epoch, Check and Save the model weights")
        # we check the validation loss instead of training loss. OK~
        if saved_total_loss >= val_total_losses.avg:
            saved_total_loss = val_total_losses.avg
            torch.save(model.state_dict(), args.save_path + "/best"+".pth")
            print("\t\tBest Weights updated for decreased validation loss\n")

        else:
            print("\t\tWeights Not updated for undecreased validation loss\n")

        #schdule the learning rate
        scheduler.step(val_total_losses.avg)


    print("*********Finish Training********")

if __name__ == '__main__':
    # Run train() in a dedicated thread with a raised recursion limit and a
    # larger thread stack. The stack size only applies to threads created
    # after the call, so it is set before the Thread object is constructed.
    sys.setrecursionlimit(100000)# 0xC00000FD exception for the recursive detach of gradients.
    threading.stack_size(200000000)# 0xC00000FD exception for the recursive detach of gradients.
    thread = threading.Thread(target=train)
    thread.start()
    # Wait for training to finish before exiting.
    thread.join()

    exit(0)